# Preparation

In [2]:
# Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torchtext.vocab import GloVe
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.model_selection import ParameterGrid
torch.manual_seed(0)

<torch._C.Generator at 0x7fbbcad9cdb0>

In [3]:
# Step 1: Load the Dataset
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")

In [4]:
# Step 2: Load GloVe word embeddings
glove = GloVe(name='6B', dim=50, cache="/tmp/glove/")
word_index = {word: idx + 1 for idx, word in enumerate(glove.itos)}
embedding_matrix = glove.vectors

In [5]:
# Step 3: Split the data
X_train = dataset['train']['text']
y_train = dataset['train']['label']

X_test = dataset['test']['text']
y_test = dataset['test']['label']

In [6]:
# Step 4: Preparing Embedding Indices and tokenization
embedding_index = {word: glove.vectors[glove.stoi[word]] for word in glove.stoi}

def tokenize_and_index(sentences, embedding_index, max_length):
    indexed_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        indices = [glove.stoi.get(token, 0) for token in tokens]  # Uses index 0 for unknown tokens
        indices = indices[:max_length]  # Truncate or pad to max_length
        padded_indices = indices + [0] * (max_length - len(indices))  # To pad with zeros
        indexed_sentences.append(padded_indices)
    return indexed_sentences


# Step 5: Tokenizing the sentences and then taking the index from static embeddings
max_length = 100 
X_train_indices = tokenize_and_index(X_train, embedding_index, max_length)
X_test_indices = tokenize_and_index(X_test, embedding_index, max_length)

# Converting data to tensors that the models can ingest
X_train_tensor = torch.tensor(X_train_indices, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_indices, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [53]:
# Step 5: Defining functions to train and evaluate the model
def train_model(model, X_train, y_train, X_test, y_test, epochs, lr=0.001):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs.squeeze(), y_train.float())
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

def evaluate_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        predictions = torch.round(torch.sigmoid(outputs))
        accuracy = (predictions.squeeze() == y_test).sum().item() / len(y_test)
        f1 = f1_score(y_test.cpu().numpy(), predictions.squeeze().cpu().numpy())
        print(f"Accuracy: {accuracy}, F1 score: {f1}")
        return accuracy, f1

# Final Model: CNN

In [14]:
class StaticEmbeddingCNN(nn.Module):
    def __init__(self, embedding_matrix, num_classes, kernel_sizes=[3, 4, 5], num_filters=100, dropout_prob=0.5):
        super(StaticEmbeddingCNN, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_matrix.shape[1], out_channels=num_filters, kernel_size=ks)
            for ks in kernel_sizes
        ])

        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(0, 2, 1)

        conv_outputs = [F.relu(conv(embedded)) for conv in self.convs] 

        pooled_outputs = [F.max_pool1d(conv_output, conv_output.size(2)).squeeze(2) for conv_output in conv_outputs]  # Pooling

        concat = torch.cat(pooled_outputs, dim=1) 

        output = self.dropout(concat)
        output = self.fc(output)
        return output

In [21]:
# Instantiate model and train
torch.manual_seed(0)
num_classes = 1  
model = StaticEmbeddingCNN(embedding_matrix, num_classes)
train_model(model, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, epochs = 100)

Epoch 1/100, Loss: 0.7160917520523071
Epoch 2/100, Loss: 0.7103981375694275
Epoch 3/100, Loss: 0.7049252390861511
Epoch 4/100, Loss: 0.6942934393882751
Epoch 5/100, Loss: 0.6895110607147217
Epoch 6/100, Loss: 0.6849775314331055
Epoch 7/100, Loss: 0.6795083284378052
Epoch 8/100, Loss: 0.6676356196403503
Epoch 9/100, Loss: 0.6677995324134827
Epoch 10/100, Loss: 0.6604330539703369
Epoch 11/100, Loss: 0.6527548432350159
Epoch 12/100, Loss: 0.6497433185577393
Epoch 13/100, Loss: 0.6440480351448059
Epoch 14/100, Loss: 0.6386942863464355
Epoch 15/100, Loss: 0.6366676092147827
Epoch 16/100, Loss: 0.6334262490272522
Epoch 17/100, Loss: 0.6248083114624023
Epoch 18/100, Loss: 0.623074471950531
Epoch 19/100, Loss: 0.616055965423584
Epoch 20/100, Loss: 0.6153444051742554
Epoch 21/100, Loss: 0.6061391830444336
Epoch 22/100, Loss: 0.607618510723114
Epoch 23/100, Loss: 0.6024330258369446
Epoch 24/100, Loss: 0.5987381935119629
Epoch 25/100, Loss: 0.5954563617706299
Epoch 26/100, Loss: 0.588339447975158

In [22]:
evaluate_model(model, X_test_tensor, y_test_tensor)

Accuracy: 0.7495309568480301, F1 score: 0.7534626038781164


## Tuning

In [58]:
# Tuning

hyperparameters = {
    'num_filters': [50, 100, 150],
    'dropout_prob': [0.3, 0.5, 0.7],
    'kernel_sizes': [[2, 3, 4], [3, 4, 5]],
    'lr': [0.001, 0.01, 0.1]
}

param_grid = list(ParameterGrid(hyperparameters))

best_f1_score = 0
best_params = None

for params in param_grid:
    print("Training with hyperparameters:", params)
    cnn_params = {key: value for key, value in params.items() if key != 'lr'}
    # to tune: kernel_sizes, num_filters, dropout_prob
    model = StaticEmbeddingCNN(embedding_matrix, num_classes, **cnn_params)
    train_model(model, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, epochs=5, lr=params['lr'])
    _, f1 = evaluate_model(model, X_test_tensor, y_test_tensor)
    
    if f1 > best_f1_score:
        best_f1_score = f1
        best_params = params

print("Best F1 score:", best_f1_score)
print("Best hyperparameters:", best_params)


Training with hyperparameters: {'dropout_prob': 0.3, 'kernel_sizes': [2, 3, 4], 'lr': 0.001, 'num_filters': 50}
Epoch 1/5, Loss: 0.7673466205596924
Epoch 2/5, Loss: 0.723188042640686
Epoch 3/5, Loss: 0.6984388828277588
Epoch 4/5, Loss: 0.6962231397628784
Epoch 5/5, Loss: 0.697930097579956
Accuracy: 0.5037523452157598, F1 score: 0.6675047140163419
Training with hyperparameters: {'dropout_prob': 0.3, 'kernel_sizes': [2, 3, 4], 'lr': 0.001, 'num_filters': 100}
Epoch 1/5, Loss: 0.7386230230331421
Epoch 2/5, Loss: 0.6977089643478394
Epoch 3/5, Loss: 0.695817232131958
Epoch 4/5, Loss: 0.701389729976654
Epoch 5/5, Loss: 0.6976956129074097
Accuracy: 0.5178236397748592, F1 score: 0.6683870967741935
Training with hyperparameters: {'dropout_prob': 0.3, 'kernel_sizes': [2, 3, 4], 'lr': 0.001, 'num_filters': 150}
Epoch 1/5, Loss: 0.6975057125091553
Epoch 2/5, Loss: 0.6967837810516357
Epoch 3/5, Loss: 0.6847727298736572
Epoch 4/5, Loss: 0.6827982664108276
Epoch 5/5, Loss: 0.6737740635871887
Accuracy

Accuracy: 0.5, F1 score: 0.6666666666666666
Training with hyperparameters: {'dropout_prob': 0.5, 'kernel_sizes': [2, 3, 4], 'lr': 0.1, 'num_filters': 50}
Epoch 1/5, Loss: 0.8026230335235596
Epoch 2/5, Loss: 20.590822219848633
Epoch 3/5, Loss: 3.93654465675354
Epoch 4/5, Loss: 0.743939995765686
Epoch 5/5, Loss: 0.7349662184715271
Accuracy: 0.49530956848030017, F1 score: 0.039285714285714285
Training with hyperparameters: {'dropout_prob': 0.5, 'kernel_sizes': [2, 3, 4], 'lr': 0.1, 'num_filters': 100}
Epoch 1/5, Loss: 0.7389894723892212
Epoch 2/5, Loss: 41.76652908325195
Epoch 3/5, Loss: 6.372738361358643
Epoch 4/5, Loss: 0.7537696957588196
Epoch 5/5, Loss: 0.7640511393547058
Accuracy: 0.50093808630394, F1 score: 0.06007067137809187
Training with hyperparameters: {'dropout_prob': 0.5, 'kernel_sizes': [2, 3, 4], 'lr': 0.1, 'num_filters': 150}
Epoch 1/5, Loss: 0.7309091687202454
Epoch 2/5, Loss: 61.086822509765625
Epoch 3/5, Loss: 8.831815719604492
Epoch 4/5, Loss: 0.7799745798110962
Epoch 

Epoch 3/5, Loss: 0.7483795881271362
Epoch 4/5, Loss: 0.7572798728942871
Epoch 5/5, Loss: 0.7350382804870605
Accuracy: 0.5300187617260788, F1 score: 0.14065180102915953
Training with hyperparameters: {'dropout_prob': 0.7, 'kernel_sizes': [3, 4, 5], 'lr': 0.01, 'num_filters': 50}
Epoch 1/5, Loss: 0.7480978965759277
Epoch 2/5, Loss: 1.3628031015396118
Epoch 3/5, Loss: 0.8107272982597351
Epoch 4/5, Loss: 0.7079648971557617
Epoch 5/5, Loss: 0.815915584564209
Accuracy: 0.5, F1 score: 0.6666666666666666
Training with hyperparameters: {'dropout_prob': 0.7, 'kernel_sizes': [3, 4, 5], 'lr': 0.01, 'num_filters': 100}
Epoch 1/5, Loss: 0.7418350577354431
Epoch 2/5, Loss: 2.134641408920288
Epoch 3/5, Loss: 0.8060808181762695
Epoch 4/5, Loss: 1.1376022100448608
Epoch 5/5, Loss: 1.126815676689148
Accuracy: 0.50093808630394, F1 score: 0.003745318352059925
Training with hyperparameters: {'dropout_prob': 0.7, 'kernel_sizes': [3, 4, 5], 'lr': 0.01, 'num_filters': 150}
Epoch 1/5, Loss: 0.7424008846282959
E

# Other Tested NNs

## MoreComplexCNN (from class)

In [48]:
class MoreComplexCNN(nn.Module):
    def __init__(self, embedding_matrix, kernel_num, num_classes, kernel_sizes=[3, 4, 5], dropout_prob=0.5):
        super(MoreComplexCNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.conv_block = nn.ModuleList(
            [
                nn.Conv1d(in_channels=embedding_matrix.shape[1], out_channels=kernel_num, kernel_size=k, stride=1) 
                for k in kernel_sizes
            ]
        )
        self.dropout = nn.Dropout(dropout_prob)
        self.linear = nn.Linear(len(kernel_sizes) * kernel_num, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)  
        embedded = embedded.permute(0, 2, 1)

        conv_outputs = [F.relu(conv(embedded)) for conv in self.conv_block] 

        pooled_outputs = [F.max_pool1d(conv_output, conv_output.size(2)).squeeze(2) for conv_output in conv_outputs]  # Pooling

        concat = torch.cat(pooled_outputs, dim=1)

        output = self.dropout(concat)
        output = self.linear(output)
        return output


In [59]:
# Instantiate model and train
torch.manual_seed(0)
num_classes = 1  
model = MoreComplexCNN(embedding_matrix=embedding_matrix, num_classes=num_classes, kernel_num=1)
train_model(model, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, epochs = 100)

Epoch 1/100, Loss: 0.7030304670333862
Epoch 2/100, Loss: 0.6919169425964355
Epoch 3/100, Loss: 0.6889169216156006
Epoch 4/100, Loss: 0.6857153177261353
Epoch 5/100, Loss: 0.6766760945320129
Epoch 6/100, Loss: 0.6786852478981018
Epoch 7/100, Loss: 0.6756353974342346
Epoch 8/100, Loss: 0.6680417060852051
Epoch 9/100, Loss: 0.6662741303443909
Epoch 10/100, Loss: 0.6678453087806702
Epoch 11/100, Loss: 0.6645626425743103
Epoch 12/100, Loss: 0.6608788967132568
Epoch 13/100, Loss: 0.6608502864837646
Epoch 14/100, Loss: 0.658575713634491
Epoch 15/100, Loss: 0.6522707343101501
Epoch 16/100, Loss: 0.6535550951957703
Epoch 17/100, Loss: 0.6519927382469177
Epoch 18/100, Loss: 0.6485595703125
Epoch 19/100, Loss: 0.6507163643836975
Epoch 20/100, Loss: 0.6473495960235596
Epoch 21/100, Loss: 0.6444655060768127
Epoch 22/100, Loss: 0.6443787813186646
Epoch 23/100, Loss: 0.6461484432220459
Epoch 24/100, Loss: 0.6411661505699158
Epoch 25/100, Loss: 0.641717255115509
Epoch 26/100, Loss: 0.6416459083557129


In [60]:
evaluate_model(model, X_test_tensor, y_test_tensor)

Accuracy: 0.7073170731707317, F1 score: 0.7051039697542533


## LSTM

In [15]:
class StaticEmbeddingLSTM(nn.Module):
    def __init__(self, embedding_matrix, num_classes, hidden_size=100, num_layers=1, dropout_prob=0.5):
        super(StaticEmbeddingLSTM, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)

        self.lstm = nn.LSTM(embedding_matrix.shape[1], hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout_prob, bidirectional=True)

        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # Multiply by 2 for bidirectional LSTM

    def forward(self, x):
        embedded = self.embedding(x)

        lstm_output, _ = self.lstm(embedded)

        # Take the last output of the forward and backward LSTM
        last_output_forward = lstm_output[:, -1, :lstm_output.size(2)//2]
        last_output_backward = lstm_output[:, 0, lstm_output.size(2)//2:]

        concatenated_output = torch.cat((last_output_forward, last_output_backward), dim=1)

        output = self.dropout(concatenated_output)
        output = self.fc(output)
        return output


In [16]:
# Instantiate model and train
torch.manual_seed(0)
num_classes = 1  
model = StaticEmbeddingLSTM(embedding_matrix, num_classes)
train_model(model, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, epochs = 10)

Epoch 1/10, Loss: 0.6975608468055725
Epoch 2/10, Loss: 0.6935566067695618
Epoch 3/10, Loss: 0.6915398240089417
Epoch 4/10, Loss: 0.6925466060638428
Epoch 5/10, Loss: 0.6916471123695374
Epoch 6/10, Loss: 0.6903289556503296
Epoch 7/10, Loss: 0.6892625093460083
Epoch 8/10, Loss: 0.6874939799308777
Epoch 9/10, Loss: 0.6868316531181335
Epoch 10/10, Loss: 0.6867664456367493


In [17]:
evaluate_model(model, X_test_tensor, y_test_tensor)

Accuracy: 0.5581613508442776, F1 score: 0.642369020501139


## RNN

In [41]:
class StaticEmbeddingRNN(nn.Module):
    def __init__(self, embedding_matrix, num_classes, hidden_size=100, num_layers=1, dropout_prob=0.5):
        super(StaticEmbeddingRNN, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.rnn = nn.RNN(input_size=embedding_matrix.shape[1], hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x) 
        
        rnn_output, _ = self.rnn(embedded)
        
        last_output = rnn_output[:, -1, :]
        
        output = self.dropout(last_output)
        output = self.fc(output)
        return output

In [42]:
model = StaticEmbeddingRNN(embedding_matrix=embedding_matrix,
                           num_classes=num_classes,
                           hidden_size=128,  
                           num_layers=1,     
                           dropout_prob=0.5)

In [43]:
# Instantiate model and train
torch.manual_seed(0)
train_model(model, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, epochs = 10)

Epoch 1/10, Loss: 0.7040256857872009
Epoch 2/10, Loss: 0.6959689855575562
Epoch 3/10, Loss: 0.6990081667900085
Epoch 4/10, Loss: 0.6974896192550659
Epoch 5/10, Loss: 0.6958703398704529
Epoch 6/10, Loss: 0.6945583820343018
Epoch 7/10, Loss: 0.6959335803985596
Epoch 8/10, Loss: 0.6951942443847656
Epoch 9/10, Loss: 0.6954545974731445
Epoch 10/10, Loss: 0.6959737539291382


In [44]:
evaluate_model(model, X_test_tensor, y_test_tensor)

Accuracy: 0.5, F1 score: 0.6666666666666666


## GRU

In [37]:
class StaticEmbeddingGRU(nn.Module):
    def __init__(self, embedding_matrix, num_classes, hidden_size=30, num_layers=2, dropout_prob=0.5):
        super(StaticEmbeddingGRU, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.gru = nn.GRU(input_size=embedding_matrix.shape[1], hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)  
    
        gru_output, _ = self.gru(embedded)
        
        last_output = gru_output[:, -1, :]
        
        output = self.dropout(last_output)
        output = self.fc(output)
        return output

In [38]:
model = StaticEmbeddingGRU(embedding_matrix=embedding_matrix,
                           num_classes=num_classes,
                           hidden_size=128,  
                           num_layers=1,     
                           dropout_prob=0.5)

In [39]:
# Instantiate model and train
torch.manual_seed(0)
train_model(model, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, epochs = 20)

Epoch 1/20, Loss: 0.699726402759552
Epoch 2/20, Loss: 0.6957173943519592
Epoch 3/20, Loss: 0.6962710022926331
Epoch 4/20, Loss: 0.6977770924568176
Epoch 5/20, Loss: 0.6956571936607361
Epoch 6/20, Loss: 0.6954718232154846
Epoch 7/20, Loss: 0.6953843832015991
Epoch 8/20, Loss: 0.6956498622894287
Epoch 9/20, Loss: 0.696336030960083
Epoch 10/20, Loss: 0.6949511766433716
Epoch 11/20, Loss: 0.6947197318077087
Epoch 12/20, Loss: 0.6957190632820129
Epoch 13/20, Loss: 0.6948350667953491
Epoch 14/20, Loss: 0.6961090564727783
Epoch 15/20, Loss: 0.6948580145835876
Epoch 16/20, Loss: 0.6947367191314697
Epoch 17/20, Loss: 0.694600522518158
Epoch 18/20, Loss: 0.6949266195297241
Epoch 19/20, Loss: 0.6932806372642517
Epoch 20/20, Loss: 0.6943645477294922


In [40]:
evaluate_model(model, X_test_tensor, y_test_tensor)

Accuracy: 0.5, F1 score: 0.6666666666666666


# Saving  Our Best Model

In [33]:
model

StaticEmbeddingCNN(
  (embedding): Embedding(400000, 50)
  (convs): ModuleList(
    (0): Conv1d(50, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(50, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(50, 100, kernel_size=(5,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)

In [34]:
torch.save(model, 'saved_models/CNN.pth') 

# Loading 

In [61]:
loaded_model = torch.load('saved_models/CNN.pth')

In [62]:
loaded_model

StaticEmbeddingCNN(
  (embedding): Embedding(400000, 50)
  (convs): ModuleList(
    (0): Conv1d(50, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(50, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(50, 100, kernel_size=(5,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)

In [63]:
evaluate_model(loaded_model, X_test_tensor, y_test_tensor)

Accuracy: 0.7495309568480301, F1 score: 0.7534626038781164


(0.7495309568480301, 0.7534626038781164)

# Making Predictions

In [98]:
def create_preds(model, X_test):
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        predictions = torch.round(torch.sigmoid(outputs))
        predictions = predictions.squeeze()
        
        pred_df = pd.DataFrame({'pred': predictions})
        index_df = pd.DataFrame(list(range(0, len(predictions))), columns=['index'])
        pred_df = pd.concat([index_df, pred_df], axis = 1)
        return pred_df

In [99]:
temp = create_preds(model, X_test_tensor)
temp

Unnamed: 0,index,pred
0,0,0.0
1,1,1.0
2,2,0.0
3,3,1.0
4,4,1.0
...,...,...
1061,1061,1.0
1062,1062,0.0
1063,1063,0.0
1064,1064,0.0


# Library Versions

In [105]:
!pip show torch torchtext nltk datasets pandas scikit-learn

Name: torch
Version: 2.2.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /Users/franco/opt/anaconda3/lib/python3.9/site-packages
Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions
Required-by: torchaudio, torchdata, torchtext, torchvision
---
Name: torchtext
Version: 0.17.1
Summary: Text utilities, models, transforms, and datasets for PyTorch.
Home-page: https://github.com/pytorch/text
Author: PyTorch Text Team
Author-email: packages@pytorch.org
License: BSD
Location: /Users/franco/opt/anaconda3/lib/python3.9/site-packages
Requires: numpy, requests, torch, torchdata, tqdm
Required-by: 
---
Name: nltk
Version: 3.8.1
Summary: Natural Language Toolkit
Home-page: https://www.nltk.org/
Author: NLTK Team
Author-email: nltk.team@gmail.com
License: Apache License, Version 2.0
Location: /Users/franco/opt/anaconda3/lib/python3.