In [218]:
import re
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

In [219]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()


# reads the content of the file passed as an argument.
# if limit > 0, this function will return only the first "limit" sentences in the file.
def loadTexts(filename, limit=-1):
    dataset=[]
    with open(filename) as f:
        line = f.readline()
        cpt=1
        skip=0
        while line :
            cleanline = clean_str(f.readline()).split()
            if cleanline: 
                dataset.append(cleanline)
            else: 
                line = f.readline()
                skip+=1
                continue
            if limit > 0 and cpt >= limit: 
                break
            line = f.readline()
            cpt+=1        

        print("Load ", cpt, " lines from ", filename , " / ", skip ," lines discarded")
    return dataset


In [220]:
LIM = 5000
txtfile = "imdb/imdb.neg"  # path of the file containing positive reviews
postxt = loadTexts(txtfile,limit=LIM)

txtfile = "imdb/imdb.pos"  # path of the file containing negative reviews
negtxt = loadTexts(txtfile,limit=LIM)


Load  5000  lines from  imdb/imdb.neg  /  1  lines discarded
Load  5000  lines from  imdb/imdb.pos  /  1  lines discarded


In [221]:
#Split the data
from sklearn.model_selection import train_test_split

data = postxt + negtxt
labels = [1] * len(postxt) + [0] * len(negtxt)
txt_train, txt_temp, label_train, label_temp = train_test_split(data, labels, test_size=0.3, stratify=labels, random_state=42)
txt_dev, txt_test, label_dev, label_test = train_test_split(txt_temp, label_temp, test_size=0.5, stratify=label_temp, random_state=42)

print(f"Training set: {len(txt_train)}")
print(f"Dev set: {len(txt_dev)}")
print(f"Test set: {len(txt_test)}")


Training set: 7000
Dev set: 1500
Test set: 1500


In [222]:
#Set up dictionary
def map_word(data):
    dico={}
    i = 1
    for sentence in data : 
        for word in sentence :
            if word not in dico:
                dico[word]= i
                i +=1
    return dico
my_dico= map_word(txt_train)

In [223]:
#Convert data to tensors
def convert_to_tensor(data, label, dico):
    sentences_tensor= []
    labels_tensor=[]
    for sentence, sublabel in zip(data, label) : 
        sentence_int = [dico[x] for x in sentence if x in dico]
        if sentence_int :
            sentences_tensor.append(torch.tensor(sentence_int, dtype=torch.long))
            labels_tensor.append(torch.tensor(sublabel, dtype=torch.long))
    return sentences_tensor, labels_tensor

train_sentence, train_label = convert_to_tensor(txt_train, label_train, my_dico)
test_sentence, test_label=convert_to_tensor(txt_test, label_test, my_dico)
dev_sentence, dev_label=convert_to_tensor(txt_dev, label_dev, my_dico)

In [224]:
# BAG of word classifier without hidden layer
class CBOW_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Optional: hidden layer
        self.output = nn.Linear(embedding_dim, 1)
        
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        bag_of_words = embeddings.mean(dim=1)
        output = self.output(bag_of_words)
        return torch.sigmoid(output)

In [225]:
#BAG of word classifier with 1 hidden layer
class CBOW_classifier2(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(CBOW_classifier2, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden = nn.Linear(embedding_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.output = nn.Linear(hidden_dim, 1)
        
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        bag_of_words = embeddings.mean(dim=1)
        hidden_out = self.activation(self.hidden(bag_of_words)) 
        output = self.output(hidden_out)
        return torch.sigmoid(output)


In [231]:
#CNN classifier
class CNN_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim):
        super(CNN_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = torch.cat(pooled, dim=1)
        output = self.fc(cat)
        return torch.sigmoid(output)


In [227]:
#Training definition for all the models
def train_loop(model, train_sentence, train_label, dev_sentence, dev_label, criterion, optimizer, num_epochs, batch_size):
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        correct = 0
        total = 0
        for i in range(0, len(train_sentence), batch_size):
            #Batching
            batch_sentences = train_sentence[i:i + batch_size]
            batch_labels = train_label[i:i + batch_size]
            batch_sentences = pad_sequence(batch_sentences, batch_first=True, padding_value=0)
            batch_labels = torch.tensor(batch_labels, dtype=torch.float32)
            
            # Forward pass
            outputs = model(batch_sentences)
            outputs = outputs.squeeze(1)
            
            # Calcul de la perte
            loss = criterion(outputs, batch_labels)
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            # Calculer l'accuracy pour le batch
            predictions = (outputs > 0.5).long()
            correct += (predictions == batch_labels.long()).sum().item()
            total += batch_labels.size(0)
        
        # Perte moyenne et précision
        train_loss = epoch_loss / len(train_sentence)
        train_accuracy = 100 * correct / total
        
        # Évaluer la précision sur l'ensemble de validation
        dev_accuracy = evaluate_model(model, dev_sentence, dev_label, batch_size)
        
        print(f"Epoch {epoch+1}/{num_epochs}:")
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Dev Accuracy: {dev_accuracy:.2f}%")

def evaluate_model(model, dev_sentence, dev_label, batch_size):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for i in range(0, len(dev_sentence), batch_size):
            batch_sentences = dev_sentence[i:i + batch_size]
            batch_labels = dev_label[i:i + batch_size]
            
            # Padding
            batch_sentences = pad_sequence(batch_sentences, batch_first=True, padding_value=0)
            batch_labels = torch.tensor(batch_labels, dtype=torch.float32)
            
            # Forward pass
            outputs = model(batch_sentences)
            outputs = outputs.squeeze(1)
            predictions = (outputs > 0.5).long()
            
            # Calculer le nombre de prédictions correctes
            correct += (predictions == batch_labels.long()).sum().item()
            total += batch_labels.size(0)
    
    return 100 * correct / total


In [234]:
#CBOW training
print("CBOW")
vocab_size = len(my_dico) + 1
embedding_dim = 50
model = CBOW_classifier(vocab_size, embedding_dim)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 25
batch_size = 32
train_loop(model, train_sentence, train_label, dev_sentence, dev_label, criterion, optimizer, num_epochs, batch_size)

CBOW
Epoch 1/25:
Train Loss: 0.0215, Train Accuracy: 54.80%, Dev Accuracy: 61.04%
Epoch 2/25:
Train Loss: 0.0209, Train Accuracy: 61.69%, Dev Accuracy: 66.80%
Epoch 3/25:
Train Loss: 0.0202, Train Accuracy: 67.77%, Dev Accuracy: 70.05%
Epoch 4/25:
Train Loss: 0.0192, Train Accuracy: 72.90%, Dev Accuracy: 72.09%
Epoch 5/25:
Train Loss: 0.0180, Train Accuracy: 76.36%, Dev Accuracy: 73.85%
Epoch 6/25:
Train Loss: 0.0167, Train Accuracy: 78.99%, Dev Accuracy: 75.88%
Epoch 7/25:
Train Loss: 0.0155, Train Accuracy: 81.71%, Dev Accuracy: 76.22%
Epoch 8/25:
Train Loss: 0.0143, Train Accuracy: 83.64%, Dev Accuracy: 76.83%
Epoch 9/25:
Train Loss: 0.0133, Train Accuracy: 85.33%, Dev Accuracy: 77.44%
Epoch 10/25:
Train Loss: 0.0124, Train Accuracy: 86.70%, Dev Accuracy: 78.39%
Epoch 11/25:
Train Loss: 0.0116, Train Accuracy: 87.80%, Dev Accuracy: 78.46%
Epoch 12/25:
Train Loss: 0.0109, Train Accuracy: 88.79%, Dev Accuracy: 78.73%
Epoch 13/25:
Train Loss: 0.0102, Train Accuracy: 89.49%, Dev Accurac

In [236]:
#CBOW2 training(one more hidden layer)
print("CBOW 1 hidden layer")
vocab_size = len(my_dico) + 1
embedding_dim = 50
model = CBOW_classifier2(vocab_size, embedding_dim, 25)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 25
batch_size = 32
train_loop(model, train_sentence, train_label, dev_sentence, dev_label, criterion, optimizer, num_epochs, batch_size)

CBOW 1 hidden layer
Epoch 1/25:
Train Loss: 0.0215, Train Accuracy: 53.61%, Dev Accuracy: 58.54%
Epoch 2/25:
Train Loss: 0.0205, Train Accuracy: 63.70%, Dev Accuracy: 68.16%
Epoch 3/25:
Train Loss: 0.0180, Train Accuracy: 72.27%, Dev Accuracy: 72.22%
Epoch 4/25:
Train Loss: 0.0154, Train Accuracy: 77.40%, Dev Accuracy: 76.22%
Epoch 5/25:
Train Loss: 0.0135, Train Accuracy: 81.16%, Dev Accuracy: 77.37%
Epoch 6/25:
Train Loss: 0.0119, Train Accuracy: 84.09%, Dev Accuracy: 77.24%
Epoch 7/25:
Train Loss: 0.0107, Train Accuracy: 86.04%, Dev Accuracy: 77.51%
Epoch 8/25:
Train Loss: 0.0097, Train Accuracy: 87.61%, Dev Accuracy: 77.91%
Epoch 9/25:
Train Loss: 0.0088, Train Accuracy: 88.77%, Dev Accuracy: 77.78%
Epoch 10/25:
Train Loss: 0.0080, Train Accuracy: 90.00%, Dev Accuracy: 78.25%
Epoch 11/25:
Train Loss: 0.0073, Train Accuracy: 91.20%, Dev Accuracy: 78.05%
Epoch 12/25:
Train Loss: 0.0067, Train Accuracy: 92.14%, Dev Accuracy: 78.32%
Epoch 13/25:
Train Loss: 0.0061, Train Accuracy: 93.0

In [247]:
#CNN training
vocab_size = len(my_dico) + 1
embedding_dim = 50
model = CNN_classifier(vocab_size, embedding_dim, 5, [2, 3, 4], 1)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 25
batch_size = 32
train_loop(model, train_sentence, train_label, dev_sentence, dev_label, criterion, optimizer, num_epochs, batch_size)

Epoch 1/25:
Train Loss: 0.0204, Train Accuracy: 61.31%, Dev Accuracy: 67.82%
Epoch 2/25:
Train Loss: 0.0168, Train Accuracy: 74.16%, Dev Accuracy: 72.22%
Epoch 3/25:
Train Loss: 0.0137, Train Accuracy: 80.84%, Dev Accuracy: 74.05%
Epoch 4/25:
Train Loss: 0.0110, Train Accuracy: 86.36%, Dev Accuracy: 75.34%
Epoch 5/25:
Train Loss: 0.0086, Train Accuracy: 90.56%, Dev Accuracy: 75.95%
Epoch 6/25:
Train Loss: 0.0066, Train Accuracy: 93.53%, Dev Accuracy: 76.22%
Epoch 7/25:
Train Loss: 0.0050, Train Accuracy: 95.86%, Dev Accuracy: 76.15%
Epoch 8/25:
Train Loss: 0.0037, Train Accuracy: 97.30%, Dev Accuracy: 76.15%
Epoch 9/25:
Train Loss: 0.0028, Train Accuracy: 98.19%, Dev Accuracy: 76.42%
Epoch 10/25:
Train Loss: 0.0021, Train Accuracy: 98.79%, Dev Accuracy: 76.02%
Epoch 11/25:
Train Loss: 0.0016, Train Accuracy: 99.04%, Dev Accuracy: 75.54%
Epoch 12/25:
Train Loss: 0.0013, Train Accuracy: 99.30%, Dev Accuracy: 75.27%
Epoch 13/25:
Train Loss: 0.0010, Train Accuracy: 99.46%, Dev Accuracy: 75