# Sentiment Analysis with Transfer Learning and Fine-tuning

This notebook demonstrates how to fine-tune a pre-trained model (DistilBERT) for a binary sentiment analysis task.

## 1. Setup and Imports

In [1]:
%pip install torch transformers pandas scikit-learn datasets
%pip install --upgrade ipywidgets
%pip install sweetviz
%pip install ydata-profiling

Looking in indexes: https://pypi.python.org/simple
Collecting torch
  Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
# NOTE(review): `AdamW` exported by transformers is deprecated in favor of
# `torch.optim.AdamW` and may be missing from newer transformers releases —
# confirm against the installed version.
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np

# Report the runtime so the reader knows whether a GPU is visible to PyTorch.
print(f"PyTorch version: {torch.__version__}")
print(f"GPU available: {torch.cuda.is_available()}")

## 2. Data Loading and Preparation

In [None]:
import os
# Ask the Hugging Face Hub client not to emit its symlinks warning.
# NOTE(review): transformers is imported in an earlier cell, so the hub client may
# already have read this variable before it is set here — confirm, and consider
# setting it before the first Hugging Face import.
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

from datasets import load_dataset


In [None]:
# Load the "imdb" dataset by name via the `datasets` library.
dataset = load_dataset("imdb")

In [None]:
# Print the loaded dataset object to inspect what it contains.
print(dataset)

In [None]:
# Materialize the train/test splits as DataFrames and persist them as CSV files.
train_df = pd.DataFrame(dataset['train'])
test_df  = pd.DataFrame(dataset['test'])

# `DataFrame.to_csv` does not create missing directories, so make sure the
# target folder exists before writing.
data_dir = '../data'
os.makedirs(data_dir, exist_ok=True)

train_csv_path = f"{data_dir}/imdb_train.csv"
test_csv_path = f"{data_dir}/imdb_test.csv"
train_df.to_csv(train_csv_path, index=False, encoding='utf-8')
test_df.to_csv(test_csv_path, index=False, encoding='utf-8')

# Report the paths that were actually written.
print(f"Train data saved in {train_csv_path}: {len(train_df)} samples")
print(f"Test data saved in  {test_csv_path}:  {len(test_df)} samples")

In [None]:
import sweetviz as sv

# Make sure the report directory exists before sweetviz writes the HTML file into it.
os.makedirs("../reports", exist_ok=True)

# Build the exploratory report for the training frame and open it in the browser.
analysis = sv.analyze(train_df)
analysis.show_html(filepath="../reports/sweetviz_report.html", open_browser=True)

In [None]:
from pathlib import Path
from ydata_profiling import ProfileReport
import webbrowser

# Profile the training frame and render the report inline in the notebook.
profile = ProfileReport(train_df, title="Dataset IMDB")
profile.to_notebook_iframe()

# Also save a standalone HTML copy; create the target directory if it is missing.
report_path = "../reports/ydata_report.html"
os.makedirs(os.path.dirname(report_path), exist_ok=True)
profile.to_file(report_path)

# `Path.as_uri()` produces a well-formed, percent-encoded file:// URI on every
# platform, which plain string concatenation with an OS path does not guarantee.
webbrowser.open(Path(report_path).resolve().as_uri())


## 3. Custom Dataset Creation

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of raw text strings (list, pandas Series, ...).
        labels: Sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer accepting the Hugging Face ``__call__``
            keyword arguments used in ``__getitem__``.
        max_length: Fixed token length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Snapshot both inputs as plain lists so that `__getitem__` always indexes
        # by position. A pandas Series whose index is shuffled or filtered would
        # otherwise be looked up by label, raising KeyError or returning the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return model-ready tensors.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a scalar
            ``labels`` tensor of dtype ``torch.long``.
        """
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # flatten() drops the leading batch axis of the single-example encoding so
        # that the DataLoader can stack items into (batch, max_length) tensors.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

## 4. Model and Tokenizer Initialization

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Carve a validation subset out of the training frame (test_df is left for a final,
# held-out evaluation). The split is stratified so both subsets keep the same class
# balance, and seeded so reruns produce the same partition.
# Assumes the IMDB frame exposes 'text' and 'label' columns — verify with print(dataset).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'].tolist(),
    train_df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=train_df['label'],
)

max_length = 128  # Adjust based on your needs and computational constraints
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, max_length)

batch_size = 16  # Adjust based on your GPU memory
# Only the training loader is shuffled; validation order does not affect the metrics.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

print(f"Number of training batches: {len(train_dataloader)}")
print(f"Number of validation batches: {len(val_dataloader)}")

## 5. Optimizer Configuration

In [None]:
# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")

# Use PyTorch's own AdamW; the implementation exported by transformers is deprecated.
# eps and weight_decay are set explicitly to the defaults of the deprecated
# transformers optimizer so the training hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

## 6. Training and Evaluation Functions

In [None]:
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimization pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module whose forward call accepts the batch entries as keyword
            arguments and returns an object exposing ``.loss``.
        dataloader: Sized iterable of dict batches mapping names to tensors.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for raw_batch in dataloader:
        # Gradients accumulate across backward calls, so reset them for each batch.
        optimizer.zero_grad()

        on_device = {}
        for name, tensor in raw_batch.items():
            on_device[name] = tensor.to(device)

        batch_loss = model(**on_device).loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    n_batches = len(dataloader)
    return running_loss / n_batches

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` and return ``(accuracy, f1)``.

    Args:
        model: Module whose forward call accepts the batch entries as keyword
            arguments and returns an object exposing ``.logits``.
        dataloader: Iterable of dict batches; each batch must contain ``'labels'``.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Tuple of accuracy and binary F1 computed over all batches.
    """
    model.eval()
    predicted, expected = [], []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for raw_batch in dataloader:
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**on_device).logits
            # The predicted class is the index of the largest logit per row.
            predicted += torch.argmax(logits, dim=1).cpu().tolist()
            expected += on_device['labels'].cpu().tolist()

    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='binary'),
    )

## 7. Main Training Loop

In [None]:
num_epochs = 3  # Adjust based on your needs and time constraints

# Each epoch runs one full training pass, then scores the model on the validation
# loader so that progress can be followed epoch by epoch.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    avg_train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Average training loss: {avg_train_loss:.4f}")
    
    # Validation metrics are only reported here; they do not influence training.
    accuracy, f1 = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")
    print("----")

print("Training completed!")

## 8. Save the Model

In [None]:
# Save the fine-tuned weights and the tokenizer into the same directory so that
# both can be restored together from that single path.
model_dir = './models/sentiment_model'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
# Build the message from the path that was used, so the two cannot drift apart.
print(f"Model saved in '{model_dir}' directory")

## 9. Test the Model

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text as ``"Positive"`` or ``"Negative"``.

    Args:
        text: Raw text to classify.
        model: Classifier called as ``model(input_ids, attention_mask=...)`` and
            returning an object exposing ``.logits`` with one row per input.
        tokenizer: Callable tokenizer accepting the Hugging Face ``__call__``
            keyword arguments used below.
        device: Device the encoded tensors are moved to before inference.
        max_length: Token length the text is padded/truncated to. Defaults to 128;
            pass the value used during training if it differs.

    Returns:
        ``"Positive"`` when the highest logit is at class index 1, otherwise
        ``"Negative"``.
    """
    model.eval()
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1)

    return "Positive" if prediction.item() == 1 else "Negative"

# Test the model with some example reviews
# The third review is deliberately lukewarm: predict_sentiment only ever returns
# "Positive" or "Negative", so it shows how a neutral text gets forced into a class.
test_reviews = [
    "This movie was fantastic! I loved every minute of it.",
    "Absolutely terrible. Waste of time and money.",
    "It was okay, nothing special but not bad either."
]

# Print each review next to its predicted label.
for review in test_reviews:
    sentiment = predict_sentiment(review, model, tokenizer, device)
    print(f"Review: {review}")
    print(f"Predicted sentiment: {sentiment}")
    print("----")