
My approach involves the following steps:

1) Data Preparation
2) Model Selection and Tokenization
3) Dataset Class Implementation
4) Fine-Tuning the Model
5) Evaluation

In [None]:
# Install Dependencies 
#pip install -U transformers accelerate scikit-learn

In [57]:
import os
import glob
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [58]:
# Data Preparation
def load_imdb_data(data_dir):
    """Read the IMDB reviews under ``data_dir`` into parallel text/label lists.

    ``data_dir`` is expected to contain ``pos`` and ``neg`` sub-directories
    holding ``*.txt`` files, one review per file.

    Args:
        data_dir: Path to a split directory such as ``aclImdb/train``.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. Labels are 1 for
        files under ``pos`` and 0 for files under ``neg``. All ``pos`` reviews
        come first, and each group is ordered by file path.
    """
    texts = []
    labels = []
    for label_type, label in (('pos', 1), ('neg', 0)):
        pattern = os.path.join(data_dir, label_type, '*.txt')
        # glob yields files in an arbitrary, filesystem-dependent order; sorting
        # makes the returned lists (and any seeded split of them) reproducible.
        for path in sorted(glob.glob(pattern)):
            with open(path, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

In [59]:
# Load the raw review texts and their 0/1 labels from the train and test
# split directories (paths are relative to the notebook's working directory).
train_texts, train_labels = load_imdb_data('aclImdb/train')
test_texts, test_labels = load_imdb_data('aclImdb/test')


In [60]:
# Split training data into training and validation sets
# NOTE: this rebinds train_texts / train_labels to the 80% training portion, so
# re-running this cell without re-loading the data splits that portion again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# bert-base-uncased was chosen over alternatives such as DistilBERT and RoBERTa for its strong performance and suitability for this task.
# DistilBERT is a smaller model with fewer parameters, which typically comes at the cost of somewhat lower accuracy.
# RoBERTa, which uses an optimized training procedure, does outperform BERT on several tasks.
# Nevertheless, bert-base-uncased has consistently delivered high accuracy, precision, recall, and F1-score on sentiment analysis benchmarks.
# Its pre-training on a large text corpus lets it pick up subtle nuances in wording and distinguish between emotions,
# which makes it very effective at recognizing the sentiment of movie reviews. Furthermore, the "uncased" variant suits sentiment analysis because it lowercases
# all input text, so differences in capitalization (e.g. "Great" vs. "great") do not affect the classification.
# Together, these properties make bert-base-uncased a good fit for fine-tuning on the IMDB dataset.

In [61]:
# Model Selection and Tokenization
# model_name is also used later to load the classification model, so the
# tokenizer and the model come from the same pre-trained checkpoint.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [62]:
# Maximum number of tokens kept per review; longer reviews are truncated.
MAX_SEQ_LENGTH = 128


def encode_texts(texts, max_length=MAX_SEQ_LENGTH):
    """Tokenize a list of raw review strings with the notebook's ``tokenizer``.

    Args:
        texts: List of review strings to encode.
        max_length: Truncation limit, in tokens, applied to every review.

    Returns:
        The tokenizer output with PyTorch tensors (``return_tensors='pt'``),
        truncated to ``max_length`` and padded to a common length.
    """
    return tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')


# Encode every split through the same helper so the settings cannot drift apart.
train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
test_encodings = encode_texts(test_texts)

In [63]:
# A custom dataset class IMDbDataset is implemented to handle the tokenized data and labels.

class IMDbDataset(torch.utils.data.Dataset):
    """Index-addressable view over tokenizer encodings and their labels.

    ``encodings`` is a mapping from field name to an indexable collection
    (one entry per example); ``labels`` is an indexable collection of labels
    aligned with it.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the aligned labels."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: every encoding field at ``idx`` plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # The label is wrapped in a tensor under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [64]:
# Pair each split's encodings with its labels in an indexable dataset.
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [9]:
# Fine-Tuning the model with 'Trainer' class from the Hugging Face Transformers library

# Pre-trained checkpoint with a 2-label classification head.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    # Output and logging locations
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Training length and batch sizes
    num_train_epochs=15,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Optimizer settings
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Early stopping callback
# Training stops once the monitored metric has gone 3 evaluations without
# improving (evaluation runs once per epoch with the arguments above).
# NOTE(review): presumably the monitored metric is the validation loss — confirm.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # validation split used for the per-epoch evaluation
    callbacks=[early_stopping_callback],
)

# Launch fine-tuning; as the cell's last expression, the result is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.224,0.209648
2,0.1898,0.185109
3,0.1095,0.219948
4,0.0934,0.283959
5,0.0359,0.318399


TrainOutput(global_step=1565, training_loss=0.15628240988277398, metrics={'train_runtime': 4154.0215, 'train_samples_per_second': 72.219, 'train_steps_per_second': 1.13, 'total_flos': 2.6311105536e+16, 'train_loss': 0.15628240988277398, 'epoch': 5.0})

Training used early stopping with a patience of 3 epochs. Validation loss reached its minimum at epoch 2 (0.185109) and did not improve over the following 3 epochs, so training stopped after epoch 5.

In [34]:
# Save the best model
# The model and the tokenizer are written to the same directory so both can
# be reloaded from a single path.
# NOTE(review): `model` holds the best checkpoint only if the Trainer restored
# it at the end of training (load_best_model_at_end=True was set) — confirm.
model.save_pretrained('./imdb_model')
tokenizer.save_pretrained('./imdb_model')

('./imdb_model/tokenizer_config.json',
 './imdb_model/special_tokens_map.json',
 './imdb_model/vocab.txt',
 './imdb_model/added_tokens.json')

In [74]:
# Directory the fine-tuned model and tokenizer are loaded from in the next cell.
model_path = 'imdb_model' 

In [75]:
# Let's evaluate the model

# Reload the saved tokenizer and model from model_path.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [76]:
# Create a DataLoader for the test dataset
# Batches of 16 examples; shuffle=False keeps them in dataset order.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [77]:
# Make predictions
model.eval()  # switch to evaluation mode before inference
all_preds, all_labels = [], []

with torch.no_grad():  # gradients are not needed for prediction
    for batch in test_loader:
        # Move this batch's tensors onto the device the model was moved to.
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'labels')
        )

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit along the last axis.
        preds = logits.argmax(dim=-1)

        # Collect predictions and labels on the CPU for the metric functions.
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

In [78]:
# Compute metrics
accuracy = accuracy_score(all_labels, all_preds)
# Report precision / recall / F1 for the positive class (label 1).
# With average='weighted', recall is mathematically identical to accuracy
# (sum over classes of support_c/N * TP_c/support_c = correct/N), so it would
# add no information next to the accuracy score above.
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

In [79]:
# Build one line per metric, then emit them together in a single print call.
metric_lines = [
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
]
print("\n".join(metric_lines))

Accuracy: 0.93244
Precision: 0.9498959633791094
Recall: 0.91304
F1-score: 0.9311034060779114


Let's test it with new text


In [None]:
# Directory the tokenizer and model for the demo below are loaded from.
model_path = 'imdb_model' 

In [19]:
# Reload the tokenizer and the fine-tuned model from model_path for the
# single-review prediction demo that follows.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

In [20]:
# Test with new review from the user
def test_new_review(review_text):
    """Classify a single raw review string as 'positive' or 'negative'.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        review_text: The review to classify, as plain text.

    Returns:
        'positive' if the predicted class index is 1, otherwise 'negative'.
    """
    model.eval()  # make sure dropout is off even if the model was left in train mode
    inputs = tokenizer(review_text, return_tensors='pt', truncation=True, padding=True)
    # Another cell may have moved the model to the GPU; the inputs must live on
    # the same device as the model or the forward pass fails.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    with torch.no_grad():  # inference only: skip building the autograd graph
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()
    return 'positive' if predicted_label == 1 else 'negative'

In [21]:
# Example usage
# Classify one hand-written review and print the predicted sentiment.
new_review = "I loved this movie! It was amazing and the acting was superb."
print(f"The review is: {test_new_review(new_review)}")

The review is: positive
