*** You may need to restart the kernel once or twice before the code runs, because of the datamaestro library. ***

In [None]:
# Install the dependencies listed in the remote requirements file into the current environment.
# NOTE(review): the requirements file is fetched over plain HTTP, and what gets installed is
# decided entirely by that remote file — confirm this is acceptable before sharing the notebook.
!pip install -r http://webia.lip6.fr/~baskiotisn/requirements-amal.txt



In [None]:
import math
import click
from torch.utils.tensorboard import SummaryWriter
import logging
import re
from pathlib import Path
from tqdm import tqdm
import numpy as np
import time
from datamaestro import prepare_dataset
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from itertools import chain

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position embeddings added to a batch of sequences."""

    def __init__(self, d_model: int, max_len: int = 5000):
        """Precompute the table of sinusoidal position embeddings.

        Even columns hold ``sin(position * freq)`` and odd columns hold
        ``cos(position * freq)``. The table is stored as the non-trainable
        buffer ``pe`` of shape ``(1, max_len, d_model)``.

        Args:
            d_model (int): Dimension of the embeddings to generate. Both even
                and odd values are supported.
            max_len (int, optional): Maximum length of the texts.
                Note from the original author: the higher this value, the
                worse the position embeddings will be.
        """
        super().__init__()

        pe = torch.zeros(max_len, d_model, dtype=torch.float)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so the cosine part only uses the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # A buffer follows the module across devices and is saved in the
        # state dict, but is never returned by parameters().
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the position embeddings to ``x``.

        Args:
            x: Tensor whose dimension 1 is the sequence dimension and whose
                last dimension is ``d_model``.

        Returns:
            ``x`` plus the first ``x.size(1)`` rows of the position table.
        """
        return x + self.pe[:, :x.size(1)]


# Maximum number of tokens kept per text by collate(); selfAttentionModel also
# uses it to size its positional-encoding table.
MAX_LENGTH = 500

# Show INFO-level log messages (get_imdb_data logs its progress at this level).
logging.basicConfig(level=logging.INFO)

class FolderText(Dataset):
    """Dataset built from folders (one per class) of ``*.txt`` files.

    Each sample is a text file found directly inside ``folder / <class name>``;
    its label is the position of that class name in ``classes``.
    """

    def __init__(self, classes, folder: Path, tokenizer, load=False):
        """Index (and optionally preload) the text files of every class.

        Args:
            classes: Sequence of class names; each one is a sub-folder of
                ``folder`` and its index in the sequence is the label.
            folder (Path): Root directory containing the class sub-folders.
            tokenizer: Callable applied to a text by ``__getitem__``.
            load (bool): If True, file contents are read once here; otherwise
                only the paths are stored and files are read on access.
        """
        self.tokenizer = tokenizer
        self.files = []
        self.filelabels = []
        self.labels = {key: ix for ix, key in enumerate(classes)}

        for label in classes:
            # glob() yields files in filesystem order; sorting makes the
            # sample index -> file mapping reproducible across machines.
            for file in sorted((folder / label).glob("*.txt")):
                self.files.append(file.read_text(encoding="utf-8") if load else file)
                self.filelabels.append(self.labels[label])

    def _text(self, ix):
        """Return the raw text of sample ``ix``, reading the file if it was not preloaded."""
        s = self.files[ix]
        # Explicit encoding so decoding does not depend on the platform default.
        return s if isinstance(s, str) else s.read_text(encoding="utf-8")

    def __len__(self):
        """Return the number of samples."""
        return len(self.filelabels)

    def __getitem__(self, ix):
        """Return ``(tokenizer(text), label)`` for sample ``ix``."""
        return self.tokenizer(self._text(ix)), self.filelabels[ix]

    def get_txt(self, ix):
        """Return ``(raw text, label)`` for sample ``ix`` without tokenizing."""
        return self._text(ix), self.filelabels[ix]

def get_imdb_data(embedding_size=50):
    """Return everything needed for training: vocabulary, GloVe vectors and IMDB datasets.

    Args:
        embedding_size (int): Dimension of the GloVe 6B vectors to load; the
            original docstring lists 50, 100, 200 and 300 as the valid values.

    Returns:
        tuple: ``(word2id, embeddings, train, test)`` where

        - ``word2id`` is a dict mapping each word to its row in ``embeddings``;
          the extra ``"__OOV__"`` entry maps to the last row,
        - ``embeddings`` is the GloVe matrix with one all-zero row appended
          for ``"__OOV__"``,
        - ``train`` and ``test`` are ``FolderText`` datasets whose tokenizer
          lower-cases the text, splits it on whitespace and maps unknown
          words to the ``"__OOV__"`` id.
    """
    WORDS = re.compile(r"\S+")

    # Log before the load starts so the message reflects what is happening.
    logging.info("Loading embeddings")
    words, embeddings = prepare_dataset(
        'edu.stanford.glove.6b.%d' % embedding_size).load()
    OOVID = len(words)
    words.append("__OOV__")
    word2id = {word: ix for ix, word in enumerate(words)}
    # Extra zero row so that OOVID is a valid index into the matrix.
    embeddings = np.vstack((embeddings, np.zeros(embedding_size)))

    def tokenizer(t):
        """Map a text to a list of word ids (unknown words -> OOVID)."""
        return [word2id.get(x, OOVID) for x in WORDS.findall(t.lower())]

    logging.info("Get the IMDB dataset")
    ds = prepare_dataset("edu.stanford.aclimdb")

    train = FolderText(ds.train.classes, ds.train.path, tokenizer, load=False)
    test = FolderText(ds.test.classes, ds.test.path, tokenizer, load=False)
    return word2id, embeddings, train, test


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Dimension of the GloVe vectors to load, and number of samples per batch.
emb_size = 100
batch_size = 200

word2id, embeddings, train_data, test_data = get_imdb_data(emb_size)
# Reverse vocabulary: id -> word.
id2word = dict((v, k) for k, v in word2id.items())
# Padding reuses the "__OOV__" id, so padded positions and unknown words share one embedding row.
PAD = word2id["__OOV__"]

# NOTE(review): `embeddings` is converted to a tensor here and wrapped in torch.Tensor again
# on the next line; the second conversion looks redundant — confirm before simplifying.
embeddings = torch.Tensor(embeddings)
# NOTE(review): freeze=False makes the embedding weights trainable, but train() builds its
# optimizer from model.parameters() only, so these weights are never updated — confirm intent.
emb_layer = nn.Embedding.from_pretrained(torch.Tensor(embeddings), freeze=False)


def collate(batch):
    """Collate function for DataLoader: truncate, pad and embed a list of samples.

    Args:
        batch: List of samples; element 0 of each sample is the list of token
            ids and element 1 is the label.

    Returns:
        tuple: ``(embedded, labels, lengths)``, all moved to ``device``:
        the embedded padded sequences (batch first), the labels as a
        LongTensor, and the truncated sequence lengths as a float tensor.
    """
    sequences = []
    for sample in batch:
        # Keep at most MAX_LENGTH tokens per text.
        sequences.append(torch.LongTensor(sample[0][:MAX_LENGTH]))
    lengths = list(map(len, sequences))
    targets = [sample[1] for sample in batch]

    # Shorter sequences are right-padded with the PAD id up to the longest one in the batch.
    padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=PAD)
    embedded = emb_layer(padded).to(device)
    target_tensor = torch.LongTensor(targets).to(device)
    length_tensor = torch.Tensor(lengths).to(device)
    return embedded, target_tensor, length_tensor


# Training batches are reshuffled every epoch; test batches keep the dataset order.
# Both loaders use collate(), so each batch arrives already embedded and on `device`.
train_loader = DataLoader(train_data, shuffle=True,
                      batch_size=batch_size, collate_fn=collate)
test_loader = DataLoader(test_data, batch_size=batch_size,collate_fn=collate,shuffle=False)

cuda


INFO:root:Loading embeddings
INFO:root:Get the IMDB dataset


In [None]:
class attentionLayer(nn.Module):
    """
    Single-head scaled dot-product self-attention block.

    The input is projected to queries, keys and values, mixed with softmax
    attention weights, optionally combined with the input through a residual
    connection plus LayerNorm, then passed through a linear layer and a ReLU.
    """

    def __init__(self, nfeatures, residual):
        """
        Args:
            nfeatures: Size of the last dimension of the input and output.
            residual: If truthy, add the input to the attention output and
                layer-normalize the sum before the final linear layer.
        """
        super().__init__()
        self.nfeatures = nfeatures
        self.residual = residual

        # Q / K / V projections, all nfeatures -> nfeatures.
        self.query = nn.Linear(nfeatures, nfeatures)
        self.key = nn.Linear(nfeatures, nfeatures)
        self.value = nn.Linear(nfeatures, nfeatures)

        # The normalization layer only exists when the residual path is enabled.
        if residual:
            self.norm = nn.LayerNorm(nfeatures)

        self.fc = nn.Linear(nfeatures, nfeatures)
        self.relu = nn.ReLU()

    def forward(self, X):
        """Apply self-attention to a 3-D batch ``X`` (batch, sequence, nfeatures)."""
        queries, keys, values = self.query(X), self.key(X), self.value(X)

        # Pairwise query/key scores, scaled by sqrt(nfeatures).
        scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(self.nfeatures)
        # Normalize over the key positions.
        weights = torch.softmax(scores, dim=2)

        attended = torch.bmm(weights, values)
        if self.residual:
            attended = self.norm(attended + X)

        return self.relu(self.fc(attended))

In [None]:
class selfAttentionModel(nn.Module):
    """
    Stack of self-attention layers followed by a linear classifier.

    The sequence representation fed to the classifier is either the output at
    a prepended CLS position (``CLS=True``) or the mean over all positions.
    The mean is taken over every position of the input, padding included.
    """

    def __init__(self, nfeatures=100, nlayers=3, output_size=2, residual=False,
                 positional_embedding=False, CLS=False):
        """
        Args:
            nfeatures: Size of the token embeddings (last dimension of the input).
            nlayers: Number of stacked ``attentionLayer`` blocks.
            output_size: Number of output classes.
            residual: Forwarded to every ``attentionLayer``.
            positional_embedding: If True, add sinusoidal position embeddings
                to the input before the attention stack.
            CLS: If True, prepend a learned CLS vector and classify from its
                final representation instead of the mean over positions.
        """
        super().__init__()

        # One extra position is needed when a CLS vector is prepended.
        max_len = MAX_LENGTH + 1 if CLS else MAX_LENGTH
        self.positional_embedding = positional_embedding
        self.CLS = CLS
        if CLS:
            # The CLS vector is produced by a linear layer applied to the constant 1.
            self.CLS_embedding = nn.Linear(1, nfeatures)
        if positional_embedding:
            self.pos = PositionalEncoding(nfeatures, max_len)

        # Layers are created on the default device; moving the whole model
        # with .to(...) places them, so no global device is needed here.
        self.attention = nn.ModuleList(
            [attentionLayer(nfeatures, residual) for _ in range(nlayers)])
        self.fc = nn.Linear(nfeatures, output_size)

    def forward(self, X):
        """Return class scores of shape (batch, output_size) for a 3-D batch ``X``."""
        if self.CLS:
            # Build the constant input on the same device/dtype as the layer's
            # weights, so the module works wherever it has been moved.
            one = self.CLS_embedding.weight.new_ones(1, 1)
            CLS_emb = self.CLS_embedding(one)
            # Same CLS vector for every sequence in the batch.
            CLS_emb = CLS_emb.repeat(X.shape[0], 1, 1)
            X = torch.cat((CLS_emb, X), dim=1)

        if self.positional_embedding:
            X = self.pos(X)

        for att in self.attention:
            X = att(X)

        if self.CLS:
            # Position 0 is the CLS vector prepended above.
            return self.fc(X[:, 0])

        return self.fc(torch.mean(X, dim=1))

In [None]:
def train(train_loader, model, n_epochs=None, loss_fn=None, lr=0.003):
    """Train ``model`` with Adam and print the loss and accuracy of every epoch.

    Args:
        train_loader: Iterable yielding ``(inputs, labels, sizes)`` batches;
            ``sizes`` is not used.
        model: Module to train. Batches are moved to the device of its
            parameters.
        n_epochs: Number of epochs. Defaults to the notebook-level ``nepochs``.
        loss_fn: Callable ``(outputs, labels) -> scalar loss``. Defaults to
            the notebook-level ``criterion``.
        lr: Learning rate passed to Adam.

    Returns:
        tuple: ``(losses, acc)``, two lists with one entry per epoch: the mean
        batch loss and the fraction of correctly classified samples.

    Raises:
        ValueError: If ``train_loader`` yields no batch.
    """
    # Fall back to the notebook-level settings when not given explicitly.
    if n_epochs is None:
        n_epochs = nepochs
    if loss_fn is None:
        loss_fn = criterion

    optimizer = torch.optim.Adam(model.parameters(), lr)
    model_device = next(model.parameters()).device

    model.train()

    losses = []
    acc = []
    for epoch in range(n_epochs):
        epoch_loss = 0.0
        right_pred = 0
        n_samples = 0
        n_batches = 0
        for inp, lab, _sizes in train_loader:
            inp = inp.to(model_device)
            lab = lab.to(model_device).long()

            out = model(inp)
            loss = loss_fn(out, lab)
            right_pred += (out.argmax(dim=1) == lab).sum().item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            n_samples += lab.size(0)
            n_batches += 1

        if n_batches == 0:
            raise ValueError("train_loader yielded no batches")

        # Average over the number of batches / samples actually seen, not over
        # the last batch index or a global dataset.
        acc.append(right_pred / n_samples)
        losses.append(epoch_loss / n_batches)
        print("Epoch:", epoch, "loss:", losses[epoch], "accuracy:", acc[epoch])

    return losses, acc

def test(test_loader):
  model.eval()
  right_pred=0
  l=0
  for i, (inp, lab, sizes) in enumerate(test_loader):
      inp = inp.to(device)
      lab = lab.to(device).long()
      
      out = model(inp)
      loss = criterion(out, lab)
      right_pred += sum((out.argmax(axis=1) == lab)).item()
      
      l+=loss.item()
      i+=1
        
  print("Test loss:", l/i, "accuracy:", right_pred/len(test_data))

No residual, no positional embedding and no CLS

In [None]:
# Baseline model: default selfAttentionModel arguments (no residual, no positional embedding, no CLS).
model = selfAttentionModel().to(device)

# `criterion` and `nepochs` are read as notebook-level names by train() and test(),
# and are reused unchanged by the experiment cells below.
criterion = nn.CrossEntropyLoss().to(device)
# Kept for reference: an optimizer that would also update emb_layer. It is not used;
# train() builds its own Adam over model.parameters() only.
#optimizer = torch.optim.Adam(chain(model.parameters(), emb_layer.parameters()), 0.003)

nepochs=5

print("Train")
train(train_loader, model)
print("Evaluation")
# test() is called without a model, so it evaluates the notebook-level `model` assigned above.
test(test_loader)

Train
Epoch: 0 loss: 0.5570387503793163 accuracy: 0.70416
Epoch: 1 loss: 0.42657091564709143 accuracy: 0.80632
Epoch: 2 loss: 0.4082384508463644 accuracy: 0.81548
Epoch: 3 loss: 0.3927518189434082 accuracy: 0.82472
Epoch: 4 loss: 0.40711657438547383 accuracy: 0.8194
Evaluation
Test loss: 0.38565956223011016 accuracy: 0.82828


Residual but no positional embedding and no CLS

In [None]:
# Rebinds the notebook-level `model`; `criterion` and `nepochs` come from the first experiment cell.
model = selfAttentionModel(residual=True).to(device)

print("Train")
train(train_loader, model)
print("Evaluation")
# test() is called without a model, so it evaluates the notebook-level `model` assigned above.
test(test_loader)

Train
Epoch: 0 loss: 0.5690509910064359 accuracy: 0.67844
Epoch: 1 loss: 0.4041598069090997 accuracy: 0.8208
Epoch: 2 loss: 0.36727936289483504 accuracy: 0.8414
Epoch: 3 loss: 0.34002666187382513 accuracy: 0.85404
Epoch: 4 loss: 0.3264164826081645 accuracy: 0.86176
Evaluation
Test loss: 0.33634555411338807 accuracy: 0.8512


Residual and positional embedding but no CLS

In [None]:
# Rebinds the notebook-level `model`; `criterion` and `nepochs` come from the first experiment cell.
model = selfAttentionModel(residual=True, positional_embedding=True).to(device)

print("Train")
train(train_loader, model)
print("Evaluation")
# test() is called without a model, so it evaluates the notebook-level `model` assigned above.
test(test_loader)

Train
Epoch: 0 loss: 0.7026134191020843 accuracy: 0.50384
Epoch: 1 loss: 0.6982528533666365 accuracy: 0.5106
Epoch: 2 loss: 0.6108506235384172 accuracy: 0.67412
Epoch: 3 loss: 0.5765913277864456 accuracy: 0.70804
Epoch: 4 loss: 0.5546919657818733 accuracy: 0.72164
Evaluation
Test loss: 0.5105567407608033 accuracy: 0.74648


Residual, positional embedding and CLS

In [None]:
# Rebinds the notebook-level `model`; `criterion` and `nepochs` come from the first experiment cell.
model = selfAttentionModel(residual=True, positional_embedding=True, CLS=True).to(device)
print("Train")
train(train_loader, model)
print("Evaluation")
# test() is called without a model, so it evaluates the notebook-level `model` assigned above.
test(test_loader)

Train
Epoch: 0 loss: 0.6590775890696433 accuracy: 0.58608
Epoch: 1 loss: 0.5009846516674564 accuracy: 0.76392
Epoch: 2 loss: 0.4636459586120421 accuracy: 0.78372
Epoch: 3 loss: 0.43648768408644584 accuracy: 0.8
Epoch: 4 loss: 0.4221378841227101 accuracy: 0.80884
Evaluation
Test loss: 0.42838320195674895 accuracy: 0.8102
