In [2]:
import numpy as np
import sklearn
import torch
import os
import pandas as pd
import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torchmetrics
import urllib.request

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [4]:
opSys = os.name
print("Operating System:", opSys)

if opSys == 'nt':
    print("Windows")
    url = "https://www.dropbox.com/scl/fi/0c7zc2adk1mgwgut5w80w/IMDB-Dataset.csv?rlkey=1drfg4zw36mhu32ndy2ihnygw&dl=1"
    if not os.path.exists('IMDB-Dataset.csv'):
        urllib.request.urlretrieve(url, 'IMDB-Dataset.csv')

elif opSys == 'posix':
    print("MacOS")
    if not os.path.exists('IMDB-Dataset.csv'):
      !wget -O IMDB-Dataset.csv -q "https://www.dropbox.com/scl/fi/0c7zc2adk1mgwgut5w80w/IMDB-Dataset.csv?rlkey=1drfg4zw36mhu32ndy2ihnygw&dl=1"


Operating System: nt
Windows


In [5]:
df = pd.read_csv('IMDB-Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
text = list(df['review'].str.replace('<br />',''))
labels = np.array(df['sentiment'].map({'negative':0,'positive':1}))

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
seq = text[0][:10]
seq

'One of the'

In [9]:
token_ids = tokenizer(seq)['input_ids']
token_ids

[101, 1448, 1104, 1103, 102]

In [10]:
tokenizer.decode(token_ids+[0,0,0])

'[CLS] One of the [SEP] [PAD] [PAD] [PAD]'

In [11]:
#Create TF-IDF weighted histograms (using TfidfVectorizer) using the top 1000 words and train an MLP model (MLPClassifier) to classify them

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(text).toarray()
y = labels


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#load the data
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the model
model = nn.Sequential(
    nn.Linear(1000, 512),
    nn.ReLU(),
    nn.Linear(512, 512),
    nn.ReLU(),
    nn.Linear(512, 512),
    nn.ReLU(),
    nn.Linear(512, 2)
)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 10

model.to(device)

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm.tqdm(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')
model.eval()

# I realize at this point I am supposed to use the SciKit learn model to do this, however, I already used tensors, soo...

# Evaluate the model

y_pred = []

for inputs, labels in tqdm.tqdm(test_loader):
    inputs = inputs.to(device)
    outputs = model(inputs)
    _, predicted = torch.max(outputs, 1)
    y_pred.extend(predicted.cpu().numpy())

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(len(y_test))

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')



100%|██████████| 1407/1407 [00:05<00:00, 277.44it/s]


Epoch 1/10, Loss: 0.3426311409528491


100%|██████████| 1407/1407 [00:02<00:00, 504.34it/s]


Epoch 2/10, Loss: 0.2962006989074838


100%|██████████| 1407/1407 [00:02<00:00, 509.76it/s]


Epoch 3/10, Loss: 0.2383115427249509


100%|██████████| 1407/1407 [00:02<00:00, 514.63it/s]


Epoch 4/10, Loss: 0.12453452044205678


100%|██████████| 1407/1407 [00:02<00:00, 513.00it/s]


Epoch 5/10, Loss: 0.05198218222765759


100%|██████████| 1407/1407 [00:02<00:00, 495.40it/s]


Epoch 6/10, Loss: 0.030754934459507357


100%|██████████| 1407/1407 [00:02<00:00, 507.10it/s]


Epoch 7/10, Loss: 0.02413634446867219


100%|██████████| 1407/1407 [00:02<00:00, 509.42it/s]


Epoch 8/10, Loss: 0.01842017680443912


100%|██████████| 1407/1407 [00:02<00:00, 507.05it/s]


Epoch 9/10, Loss: 0.017388384285417825


100%|██████████| 1407/1407 [00:02<00:00, 512.80it/s]


Epoch 10/10, Loss: 0.015082337522846705


100%|██████████| 157/157 [00:00<00:00, 1172.14it/s]

5000
Accuracy: 0.8564
Precision: 0.8609
Recall: 0.8527
F1 Score: 0.8568





In [12]:
import random
# Tokenize the entire dataset
tokenized = tokenizer(text)

X = tokenized['input_ids']
labels = np.array(df['sentiment'].map({'negative':0,'positive':1}))
y = labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors


In [13]:
print("this still works")

this still works


In [14]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, vocab_size, max_len=100):
        self.texts = texts
        self.labels = labels
        self.vocab_size = vocab_size
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Pad or truncate the sequence
        if len(text) > self.max_len:
            start = random.randint(0, len(text) - self.max_len)
            text = text[start : start + self.max_len]
        else:
            text = text + [0] * (self.max_len - len(text))
        
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.float)
    
    

In [15]:
# Create the dataset
train_dataset = SentimentDataset(X_train, y_train, vocab_size=tokenizer.vocab_size)
print(train_dataset[35])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

#Create Test Dataset
test_dataset = SentimentDataset(X_test, y_test, vocab_size=tokenizer.vocab_size)
print(test_dataset[35])
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

(tensor([  119,  1438,   117,  1122,   112,   188,  1145, 20844,  5815,  4179,
          119, 13197,  1137,  1136,  1122,   112,   188, 16841,  6276,   146,
         1274,   112,   189,  1221,   119,  9800, 27788,  1107,  1451,  7631,
          117,  1122,  1145,  1110,  1103,  1178,  2523,   146,  1221,  1104,
         1115,  1144,   122,   114,   170,  7930,  5102,  1217,  1307,  1118,
          170, 17393,  2811,  1107,   170,   113,  1304,  5119,   114,  7930,
         4228,   117,   123,   114,  1126,  8394,  1476,   118,  1380,  3647,
         1773,   170,  1959,  1150,   112,   188,  3155,  1106,  1129,  1107,
         1123,  1523,  2539,   112,   188,   117,  1105,   124,   114,  1103,
         1211, 21215,  1116,  1111,  5828,  3723,  5358, 26868,  1279,  1518]), tensor(0.))
(tensor([  131,  1109,  1148,  1169,  1129,  1562,  1114,   170,  3613,  1440,
         1120,  1122,   112,   188,   146, 18219,  1830,  3674,  2349,  1424,
         1874,   131,  6605,   120,  8909,   120

In [16]:
#define the model
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=100, num_layers=3):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.gru(x)
        x = x[:, -1, :]
        x = self.fc(x)
        return self.sigmoid(x)

In [19]:
# Initialize the model
model = SentimentRNN(vocab_size=tokenizer.vocab_size, embedding_dim=100, hidden_dim=100, num_layers=3)
model.to(device)
criterion = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# Train the model
epochs = 20

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm.tqdm(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

  0%|          | 0/1407 [00:00<?, ?it/s]

100%|██████████| 1407/1407 [00:06<00:00, 219.54it/s]


Epoch 1/20, Loss: 0.6280866518839082


100%|██████████| 1407/1407 [00:06<00:00, 233.93it/s]


Epoch 2/20, Loss: 0.47048803533784195


100%|██████████| 1407/1407 [00:05<00:00, 237.72it/s]


Epoch 3/20, Loss: 0.40683270422126183


100%|██████████| 1407/1407 [00:05<00:00, 243.20it/s]


Epoch 4/20, Loss: 0.3794479732532067


100%|██████████| 1407/1407 [00:05<00:00, 241.40it/s]


Epoch 5/20, Loss: 0.35285911018024885


100%|██████████| 1407/1407 [00:05<00:00, 241.77it/s]


Epoch 6/20, Loss: 0.334679246721915


100%|██████████| 1407/1407 [00:05<00:00, 242.34it/s]


Epoch 7/20, Loss: 0.3165627109904872


100%|██████████| 1407/1407 [00:05<00:00, 243.50it/s]


Epoch 8/20, Loss: 0.30114702142208455


100%|██████████| 1407/1407 [00:05<00:00, 241.38it/s]


Epoch 9/20, Loss: 0.28677403510104027


100%|██████████| 1407/1407 [00:05<00:00, 243.15it/s]


Epoch 10/20, Loss: 0.27467467981127874


100%|██████████| 1407/1407 [00:05<00:00, 242.45it/s]


Epoch 11/20, Loss: 0.26680938443850777


100%|██████████| 1407/1407 [00:05<00:00, 241.54it/s]


Epoch 12/20, Loss: 0.2513256392599889


100%|██████████| 1407/1407 [00:05<00:00, 240.39it/s]


Epoch 13/20, Loss: 0.24389778962722888


100%|██████████| 1407/1407 [00:05<00:00, 241.29it/s]


Epoch 14/20, Loss: 0.2298967211739595


100%|██████████| 1407/1407 [00:05<00:00, 243.78it/s]


Epoch 15/20, Loss: 0.22151721863959092


100%|██████████| 1407/1407 [00:05<00:00, 241.75it/s]


Epoch 16/20, Loss: 0.21294897841057966


100%|██████████| 1407/1407 [00:06<00:00, 230.87it/s]


Epoch 17/20, Loss: 0.20589269092802936


100%|██████████| 1407/1407 [00:06<00:00, 231.04it/s]


Epoch 18/20, Loss: 0.1971828638719608


100%|██████████| 1407/1407 [00:05<00:00, 239.12it/s]


Epoch 19/20, Loss: 0.18731500235384207


100%|██████████| 1407/1407 [00:05<00:00, 237.48it/s]

Epoch 20/20, Loss: 0.18284214007474253





In [20]:
# Evaluate the model
model.eval()
y_pred = []
y_true = []
for inputs, labels in tqdm.tqdm(test_loader):
    inputs = inputs.to(device)
    outputs = model(inputs)
    predicted = (outputs.squeeze() > 0.5).float()
    y_pred.extend(predicted.cpu().numpy())
    y_true.extend(labels.numpy())


# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)


print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

100%|██████████| 157/157 [00:00<00:00, 652.48it/s]

Accuracy: 0.8372
Precision: 0.8209
Recall: 0.8658
F1 Score: 0.8427





##### For 10 Epochs
Accuracy: 0.8352
Precision: 0.8397
Recall: 0.8317
F1 Score: 0.8357

##### For 20 Epochs
Accuracy: 0.8372
Precision: 0.8209
Recall: 0.8658
F1 Score: 0.8427

##### For 30 Epochs
Accuracy: 0.8240
Precision: 0.8061
Recall: 0.8567
F1 Score: 0.8306