# Sentiment Analysis with Transfer Learning and Fine-tuning

This notebook demonstrates how to fine-tune a pre-trained model (DistilBERT) for binary sentiment analysis (positive vs. negative) of IMDB movie reviews.

## 1. Setup and Imports

In [4]:
# Install the third-party packages used below into the kernel's environment.
# NOTE(review): `datasets` is not imported by any cell here; it appears to be needed
# only by the external loader script run in section 2 - confirm.
%pip install torch transformers pandas scikit-learn datasets


Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl (30 kB)
Installing collected packages: xxhash, multiprocess, datasets
Successfully installed datasets-3.0.0 multiprocess-0.70.16 xxhash-3.5.0
Note: you may need to restart the kernel to use updated packages.


In [5]:

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np

# Report the installed PyTorch build and whether a CUDA device is visible to it.
print(f"PyTorch version: {torch.__version__}")
print(f"GPU available: {torch.cuda.is_available()}")

PyTorch version: 2.4.1+cpu
GPU available: False


## 2. Data Loading and Preparation

In [9]:
# Run the stand-alone IMDB loading script in a separate Python process.
# NOTE(review): names defined by that script are not visible to this kernel; the data
# used by the rest of the notebook comes from load_imdb_dataset() below.
!python ../src/huggingface-imdb-loading.py

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


Traceback (most recent call last):
  File "c:\Users\Diego Gerwig\OneDrive\code_dgerwig\42_Cursus_Outer_Core\LLM_sentiment_analysis\42_LLM_sentiment_analysis\src\huggingface-imdb-loading.py", line 33, in <module>
    train_texts, val_texts, train_labels, val_labels = load_imdb_dataset()
                                                       ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Diego Gerwig\OneDrive\code_dgerwig\42_Cursus_Outer_Core\LLM_sentiment_analysis\42_LLM_sentiment_analysis\src\huggingface-imdb-loading.py", line 18, in load_imdb_dataset
    all_data = dataset['train'].shuffle(seed=42)
               ~~~~~~~^^^^^^^^^
  File "c:\Users\Diego Gerwig\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\arrow_dataset.py", line 2742, in __getitem__
    return self._getitem(key)
           ^^^^^^^^^^^^^^^^^^
  File "c:\Users\Diego Gerwig\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\arrow_dataset.py", line 2726, in _getitem
    pa_subtable = query_table(

In [7]:
def load_imdb_dataset(max_samples=10000, csv_path='IMDB_dataset.csv'):
    """Load IMDB reviews from a CSV file and split them into train/validation sets.

    Args:
        max_samples: Upper bound on the number of rows sampled from the CSV
            (every row is used when the file has fewer).
        csv_path: Location of the CSV file. It must provide a ``review`` column
            holding the text and a ``sentiment`` column whose value is
            ``'positive'`` for positive reviews.

    Returns:
        ``[train_texts, val_texts, train_labels, val_labels]`` as produced by
        ``train_test_split`` with ``test_size=0.2``. Labels are 1 where
        ``sentiment == 'positive'`` and 0 otherwise.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        KeyError: If a required column is missing from the CSV.
    """
    df = pd.read_csv(csv_path)  # Make sure you have this file

    # Fail early with a message that names the file and the missing columns,
    # instead of a bare KeyError from the column lookup further down.
    missing = [col for col in ('review', 'sentiment') if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path!r} is missing required column(s): {missing}")

    # Fixed random_state keeps both the subsample and the split reproducible.
    df = df.sample(n=min(max_samples, len(df)), random_state=42)
    texts = df['review'].tolist()
    labels = (df['sentiment'] == 'positive').astype(int).tolist()
    return train_test_split(texts, labels, test_size=0.2, random_state=42)

train_texts, val_texts, train_labels, val_labels = load_imdb_dataset()

print(f"Training set size: {len(train_texts)}")
print(f"Validation set size: {len(val_texts)}")

FileNotFoundError: [Errno 2] No such file or directory: 'IMDB Dataset.csv'

## 3. Custom Dataset Creation

In [None]:
class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per access and pairs it with its label.

    Args:
        texts: Sequence of raw review strings.
        labels: Sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__``.
        max_length: Number of tokens every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # A mismatch would otherwise surface later as an IndexError mid-epoch,
        # or silently ignore trailing labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output for a single text carries a leading batch dimension
        # of size 1; drop it so the DataLoader can stack examples itself.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

## 4. Model, Tokenizer, and DataLoader Initialization

In [None]:
# Seed PyTorch's global RNG before anything random happens: the freshly added
# classification head is randomly initialised and the training DataLoader shuffles,
# so without a seed every run produces different results. 42 matches the
# random_state used for the data split.
torch.manual_seed(42)

# Pre-trained DistilBERT weights plus a 2-class classification head.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

max_length = 128  # Adjust based on your needs and computational constraints
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, max_length)

batch_size = 16  # Adjust based on your GPU memory
# Only the training data is shuffled; validation order does not affect the metrics.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

print(f"Number of training batches: {len(train_dataloader)}")
print(f"Number of validation batches: {len(val_dataloader)}")

## 5. Optimizer and Device Configuration

In [None]:
# Pick the device and move the model first: PyTorch recommends constructing the
# optimizer only after the parameters live on their final device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")

# Use PyTorch's AdamW; the AdamW class shipped with transformers is deprecated.
# eps and weight_decay are spelled out with the values transformers.AdamW used by
# default, so switching implementations does not silently change the training
# hyper-parameters (torch.optim.AdamW defaults to weight_decay=0.01, eps=1e-8).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

## 6. Training and Evaluation Functions

In [None]:
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch is a mapping of tensors; every tensor is moved to ``device`` and the
    mapping is passed to ``model`` as keyword arguments. The model output must expose
    a ``loss`` attribute.

    Returns:
        Sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        batch_loss = model(**on_device).loss
        running_loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()
    n_batches = len(dataloader)
    return running_loss / n_batches

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` and return ``(accuracy, f1)``.

    The model is put in eval mode and run without gradient tracking. The predicted
    class of each example is the argmax over dimension 1 of the output logits; it is
    compared against the batch's ``'labels'`` entry. F1 is computed with
    ``average='binary'``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in dataloader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**on_device).logits
            predicted.extend(torch.argmax(logits, dim=1).cpu().tolist())
            expected.extend(on_device['labels'].cpu().tolist())

    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='binary'),
    )

## 7. Main Training Loop

In [None]:
# Fine-tune for a fixed number of epochs. After each pass over the training data,
# print the mean training loss, then the accuracy and F1 on the validation loader.
num_epochs = 3  # Adjust based on your needs and time constraints

for epoch in range(num_epochs):
    # `epoch` is 0-based; it is shown 1-based in the progress line.
    print(f"Epoch {epoch + 1}/{num_epochs}")
    avg_train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Average training loss: {avg_train_loss:.4f}")
    
    # Validation metrics are recomputed every epoch so progress can be followed.
    accuracy, f1 = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")
    print("----")

print("Training completed!")

## 8. Save the Model

In [None]:
# Keep the output directory in one place so the confirmation message cannot drift
# from the path actually written (the message previously said './modles/...').
save_dir = './models/sentiment_model'

# Model weights/config and tokenizer files go to the same directory so both can be
# reloaded from it together.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Model saved in '{save_dir}' directory")

## 9. Test the Model

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single piece of text as ``"Positive"`` or ``"Negative"``.

    Args:
        text: Raw text to classify.
        model: Sequence-classification model whose output exposes ``logits``.
        tokenizer: Callable tokenizer matching ``model``.
        device: Device the input tensors are moved to; ``model`` is expected to
            already be on it.
        max_length: Token length the text is padded/truncated to. Defaults to 128,
            the value used when building the training datasets; pass the training
            value explicitly if that setting is changed.

    Returns:
        ``"Positive"`` when the argmax of the logits is class 1, otherwise
        ``"Negative"``.
    """
    model.eval()

    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which accepts the same arguments.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1)

    return "Positive" if prediction.item() == 1 else "Negative"

# Smoke-test the fine-tuned model on a few hand-written reviews:
# one clearly positive, one clearly negative, one ambivalent.
test_reviews = [
    "This movie was fantastic! I loved every minute of it.",
    "Absolutely terrible. Waste of time and money.",
    "It was okay, nothing special but not bad either."
]

for review in test_reviews:
    sentiment = predict_sentiment(review, model, tokenizer, device)
    # One call, three output lines: the review, its predicted label, a separator.
    print(f"Review: {review}", f"Predicted sentiment: {sentiment}", "----", sep="\n")