In [24]:
#Loading necessary libraries

import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import random_split
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm

In [25]:
#Model definition

class BertMovieModel(nn.Module):
    def __init__(self):
        super(BertMovieModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(768, 1)
        self.output = nn.Sigmoid()
    def forward(self, input_embeddings, input_mask, token_type):
        bert_output, pooled_output = self.bert(input_embeddings, attention_mask=input_mask, token_type_ids=token_type).to_tuple()
        linear_output = self.linear(pooled_output)
        final_output = self.output(linear_output)
        return final_output

In [26]:
#Hyper parameters

lr = 0.0001
max_len = 100
num_epochs = 5
batch_size = 32

In [27]:
#Initializing the tokenizer for bert

tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

In [28]:
#Define a Dataset object for our data

class BertDataset(Dataset):
    def __init__(self, path, tokenizer, max_len, limit=-1):
        tdf = pd.read_csv(path)
        self.df = tdf.sample(frac=1)[:limit]
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['text']
        label = self.df.iloc[idx]['label']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].flatten()
        token_type_ids = encoding['token_type_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        
        return {
            'input_ids': torch.as_tensor(input_ids, dtype=torch.long),
            'token_type_ids': torch.as_tensor(token_type_ids, dtype=torch.long),
            'attention_mask': torch.as_tensor(attention_mask, dtype=torch.long),
            'labels': torch.as_tensor(label, dtype=torch.long)
        }

In [29]:
#Create our dataset instance and our dataloaders

dataset = BertDataset('movie_reviews.csv', tokenizer, max_len)

train_dataset, val_dataset, test_dataset = random_split(dataset, [0.8, 0.1, 0.1])

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [30]:
#Check some of the examples from our datasets

def visualize_text_data(dataset, tokenizer, num_samples=5):
    for i in range(min(len(dataset), num_samples)):
        sample = dataset[i]
        input_ids = sample['input_ids']
        label = sample['labels'].item()

        text = tokenizer.decode(input_ids)

        print(f"Review {i+1}:")
        print(text)
        print("Label:", "Positive" if label == 0 else "Negative")
        print("\n" + "-"*50 + "\n")

visualize_text_data(dataset, tokenizer)

Review 1:
[CLS] this is a comedy based on national stereotypes, no doubt. if you leave away pretending you know or you care what communism was about and how real russians or brits are, if you accept and are not hurt by the conventions, you can have fun with this film. nicole kidman is at her best, sexy, moving and funny. ben chaplin succeeds to avoid being completely out - shadowed by nicole, and the rest of the cast does good work as well. the final is moving, [SEP]
Label: Positive

--------------------------------------------------

Review 2:
[CLS] i read the book in 5th grade and now a few years later i saw the movie. there are a few differences : < br / > < br / > 1. billy was oringinally suppose to eat 15 worms in 15 days, not 10 worms in one day by 7 : 00pm. < br / > < br / > 2. billy is suppose to get 30 dollars after he's eaten all the worms. in the movie after billy eats all the worms, joe has [SEP]
Label: Positive

--------------------------------------------------

Review 3:

In [31]:
#Training loop

model = BertMovieModel()

loss_fn = nn.BCELoss()

optimizer= optim.Adam(model.parameters(),lr=lr)

for param in model.bert.parameters():
    param.requires_grad = False
    
for e in range(num_epochs):
    
    print('Begin epoch: ', str(e) + '/' + str(num_epochs))
    
    model.train()
    
    train_loss = 0
    val_loss = 0
    
    for batch in tqdm(train_dataloader, desc=f"Epoch {e + 1}/{num_epochs} - Training"):
        ids = batch['input_ids']
        mask = batch['attention_mask']
        label = batch['labels']
        token_type = batch['token_type_ids']

        optimizer.zero_grad()
        
        pred = model(ids, mask, token_type).squeeze(1)
        
        loss = loss_fn(pred, label.float())
        train_loss += loss.item()
        
        loss.backward()
        
        optimizer.step()

    print(f"Training Loss: {train_loss / len(train_dataloader)}")
        
    model.eval()
    with torch.no_grad():
        for batch in val_dataloader:
            ids = batch['input_ids']
            mask = batch['attention_mask']
            label = batch['labels']
            token_type = batch['token_type_ids']

            pred = model(ids, mask, token_type).squeeze(1)

            loss=loss_fn(pred, label.float())

            val_loss += loss.item()
    print(f"Validation Loss: {val_loss / len(val_dataloader)}")

Begin epoch:  0/5


Epoch 1/5 - Training:  80%|████████  | 503/625 [1:10:46<08:41,  4.27s/it]

In [23]:
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        ids = batch['input_ids']
        mask = batch['attention_mask']
        labels = batch['labels']
        token_type = batch['token_type_ids']

        outputs = model(ids, mask, token_type)

        predicted_labels = np.where(outputs >= 0.5, 1, 0)
        
        for prediction, label in zip(predicted_labels.tolist(), labels.tolist()):
            if prediction[0] == label:
                correct_predictions += 1
            total_predictions += 1
            
print('Accuracy:', correct_predictions/total_predictions)

Testing: 100%|███████████████████████████| 79/79 [03:59<00:00,  3.03s/it]

Accuracy: 0.5418167266906763



