# Sentiment Analysis with Transfer Learning and Fine-tuning

This notebook demonstrates how to fine-tune a pre-trained model for a binary sentiment analysis task.

## 1. Setup and Imports

In [1]:
# Install the third-party packages used by this notebook.
# NOTE(review): versions are not pinned here — consider pinning them for reproducibility.
%pip install torch transformers pandas scikit-learn datasets
%pip install --upgrade ipywidgets
%pip install ydata-profiling

Collecting torch
  Using cached torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting transformers
  Using cached transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting datasets
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting filelock (from torch)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cu

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import (
    DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification,
    BertTokenizer, BertModel,
    RobertaTokenizer, RobertaModel
)
import time
import psutil
from typing import Dict, List

In [12]:

# Report the host's CPU, RAM and GPU so later timings can be read in context.
print("=== System Info ===")
cpu_cores = psutil.cpu_count()
print(f"CPU cores: {cpu_cores}")
ram_gb = psutil.virtual_memory().total / (1024 ** 3)  # bytes -> GiB
print(f"RAM: {ram_gb:.2f} GB")

print("\n=== GPU Info ===")
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    # Only the first CUDA device (index 0) is reported.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

=== System Info ===
CPU cores: 8
RAM: 31.35 GB

=== GPU Info ===
CUDA available: False


## 2. Data Loading and Preparation

In [3]:
# Load the local sample of labelled sentences and take a first look at it.
df_sample = pd.read_parquet('../data/sample_reviews.parquet')

print(df_sample.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
df_sample.info()

                                            sentence  label  idx
0  at least one scene is so disgusting that viewe...      0  413
1  even the finest chef ca n't make a hotdog into...      0  701
2  collateral damage finally delivers the goods f...      1  834
3  exciting and direct , with ghost imagery that ...      1  821
4  and when you 're talking about a slapstick com...      0  748
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  150 non-null    object
 1   label     150 non-null    int64 
 2   idx       150 non-null    int32 
dtypes: int32(1), int64(1), object(1)
memory usage: 3.1+ KB
None


In [4]:
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

from datasets import load_dataset


Loading the review datasets from Hugging Face:

https://huggingface.co/datasets/stanfordnlp/imdb

https://huggingface.co/datasets/SetFit/amazon_reviews_multi_en

In [5]:
# Hugging Face Hub identifier of the dataset to load. The commented-out line below
# is an alternative dataset id that can be swapped in.
data_name = "stanfordnlp/imdb"
# data_name = "SetFit/amazon_reviews_multi_en"

# `data_name` is reused further down to build the output file names.
dataset = load_dataset(data_name)

In [6]:
# Show the loaded dataset object (its splits, features and row counts).
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [7]:
import pandas as pd

# Total number of rows kept per split; it is divided evenly across the label classes.
SAMPLE_SIZE = 300

train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])

train_samples_per_class = SAMPLE_SIZE // len(train_df['label'].unique())
test_samples_per_class = SAMPLE_SIZE // len(test_df['label'].unique())

# Draw the same number of rows from every class, with a fixed seed so the
# sample is reproducible.
train_balanced = pd.concat([
    train_df[train_df['label'] == label].sample(n=train_samples_per_class, random_state=42)
    for label in train_df['label'].unique()
])

test_balanced = pd.concat([
    test_df[test_df['label'] == label].sample(n=test_samples_per_class, random_state=42)
    for label in test_df['label'].unique()
])

# Dataset ids look like "<owner>/<name>"; only the name part goes into file names.
name = data_name.split('/')[1]

# Build each output path once so the location written to and the location
# reported to the user cannot drift apart (the messages used to say "./data/"
# while the files were written to "../data/").
train_csv_path = f'../data/{name}_train_balanced.csv'
test_csv_path = f'../data/{name}_test_balanced.csv'

train_balanced.to_csv(train_csv_path, index=False, encoding='utf-8')
test_balanced.to_csv(test_csv_path, index=False, encoding='utf-8')

print(f"Balanced train data saved in {train_csv_path}: {len(train_balanced)} samples")
print(f"Balanced test data saved in {test_csv_path}: {len(test_balanced)} samples")

print("\nClass distribution in Train:")
print(train_balanced['label'].value_counts())
print("\nClass distribution in Test:")
print(test_balanced['label'].value_counts())

Balanced train data saved in ./data/imdb_train_balanced.csv: 300 samples
Balanced test data saved in ./data/imdb_test_balanced.csv: 300 samples

Class distribution in Train:
label
0    150
1    150
Name: count, dtype: int64

Class distribution in Test:
label
0    150
1    150
Name: count, dtype: int64


In [8]:
# Preview the first rows of the balanced training sample.
train_balanced.head()


Unnamed: 0,text,label
1766,"Wow, what a total let down! The fact people th...",0
11919,"If Bob Ludlum was to see this mini series, he ...",0
8909,To call a film about a crippled ghost taking r...,0
4963,they have sex with melons in Asia.<br /><br />...,0
10099,Although the production and Jerry Jameson's di...,0


In [9]:
# Print the column dtypes and non-null counts of the balanced training sample.
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 1766 to 19219
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    300 non-null    object
 1   label   300 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ KB


In [10]:
# Count the rows per label to confirm the training sample is balanced.
train_balanced['label'].value_counts()

label
0    150
1    150
Name: count, dtype: int64

In [15]:
# Character length of every review (this adds a `text_length` column to train_balanced).
train_balanced['text_length'] = train_balanced['text'].str.len()

# Per-class min / max / mean / std of the text length.
length_stats = train_balanced.groupby('label').agg({
    'text_length': [
        ('min_length', 'min'),
        ('max_length', 'max'),
        ('mean_length', 'mean'),
        ('std_length', 'std')
    ]
}).round(2)

# The same four statistics over the whole training sample.
overall_stats = pd.DataFrame({
    'min_length': [train_balanced['text_length'].min()],
    'max_length': [train_balanced['text_length'].max()],
    'mean_length': [train_balanced['text_length'].mean()],
    'std_length': [train_balanced['text_length'].std()]
}, index=['Overall']).round(2)

print("Text length statistics per class:")
print(length_stats)
print("\nOverall text length statistics:")
print(overall_stats)

# Create the axes explicitly and hand them to pandas. With `by=` and no `ax=`,
# DataFrame.boxplot opens a figure of its own, so the previous plt.figure(...)
# call left an empty extra figure behind and its figsize was never applied.
fig, ax = plt.subplots(figsize=(10, 6))
train_balanced.boxplot(column='text_length', by='label', ax=ax)
ax.set_title('Text Length Distribution by Class')
ax.set_ylabel('Text Length')
plt.show()
plt.close(fig)

Text length statistics per class:
      text_length                                  
       min_length max_length mean_length std_length
label                                              
0             210       4737     1291.05     866.78
1             157       7068     1404.53    1171.32

Overall text length statistics:
         min_length  max_length  mean_length  std_length
Overall         157        7068      1347.79     1030.21


NameError: name 'plt' is not defined

In [None]:
from pathlib import Path
from ydata_profiling import ProfileReport
import webbrowser

# Build the profiling report for the balanced training sample and show it inline.
profile = ProfileReport(train_balanced, title="Dataset IMDB")
profile.to_notebook_iframe()

report_path = f"../reports/ydata_report_{name}.html"
# Make sure the target directory exists before writing; nothing earlier in the
# notebook creates ../reports.
os.makedirs(os.path.dirname(report_path), exist_ok=True)
profile.to_file(report_path)
# Path.as_uri() yields a well-formed, percent-encoded file:// URI on every
# platform, which plain 'file://' + path concatenation does not guarantee.
webbrowser.open(Path(report_path).resolve().as_uri())


## 3. Model and Platform Research and Selection

Models:
- BERT
- RoBERTa
- DistilBERT
- GPT-2
- Electra
- XLNet

**DistilBERT** offers a good balance between performance and computational efficiency. It is a lighter and faster version of BERT.

Computing platforms:
- Amazon SageMaker Studio Lab
- qBraid
- Google Colab

In [None]:
class ModelComparator:
    def __init__(self):
        self.models: Dict = {}
        self.tokenizers: Dict = {}
        self.results: List = []
        
    def load_model(self, model_name: str):
        """Loads a model and its tokenizer."""
        print(f"Loading {model_name}...")
        
        model_configs = {
            'distilbert': ('distilbert-base-uncased', DistilBertTokenizer, DistilBertModel),
            'bert': ('bert-base-uncased', BertTokenizer, BertModel),
            'roberta': ('roberta-base', RobertaTokenizer, RobertaModel),
        }
        
        if model_name in model_configs:
            model_path, TokenizerClass, ModelClass = model_configs[model_name]
            try:
                tokenizer = TokenizerClass.from_pretrained(model_path)
                model = ModelClass.from_pretrained(model_path)
                
                # Special handling for GPT2 tokenizer
                if model_name == 'gpt2':
                    tokenizer.pad_token = tokenizer.eos_token
                    
                self.models[model_name] = model
                self.tokenizers[model_name] = tokenizer
                print(f"Successfully loaded {model_name}")
            except Exception as e:
                print(f"Error loading {model_name}: {str(e)}")
        else:
            raise ValueError(f"Unsupported model: {model_name}")

    def measure_performance(self, model_name: str, text: str, num_runs: int = 5):
        """Measures model performance for a given text."""
        model = self.models[model_name]
        tokenizer = self.tokenizers[model_name]
        
        # Performance measurements
        total_time = 0
        memory_usage = []
        
        # Prepare input
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        
        # Special handling for XLNet
        if model_name == 'xlnet':
            inputs['token_type_ids'] = torch.zeros_like(inputs['input_ids'])
        
        # Multiple runs for averaging
        for _ in range(num_runs):
            start_time = time.time()
            with torch.no_grad():
                outputs = model(**inputs)
            end_time = time.time()
            
            total_time += (end_time - start_time)
            memory_usage.append(psutil.Process().memory_info().rss / 1024 / 1024)  # MB
        
        # Calculate averages
        avg_time = total_time / num_runs
        avg_memory = np.mean(memory_usage)
        
        # Save results
        self.results.append({
            'model': model_name,
            'avg_time_ms': round(avg_time * 1000, 2),
            'avg_memory_mb': round(avg_memory, 2),
            'parameters': sum(p.numel() for p in model.parameters()),
            'input_length': len(inputs['input_ids'][0])
        })
    
    def show_results(self):
        """Shows results in a DataFrame."""
        df = pd.DataFrame(self.results)
        # Add relative speed comparison (normalized to BERT)
        if 'bert' in df['model'].values:
            bert_time = df[df['model'] == 'bert']['avg_time_ms'].values[0]
            df['relative_speed'] = bert_time / df['avg_time_ms']
        return df

def main():
    """Benchmark every supported model on the sample text and print a comparison.

    Returns:
        The table produced by ``ModelComparator.show_results()``.
    """
    # Input used for the benchmark
    texts = [
        """This is a longer text that allows us to see how different models behave
        with more extensive content. We want to analyze the differences in processing
        time and memory usage across various transformer architectures."""
    ]

    benchmark = ModelComparator()

    # Try each candidate; one that fails to load is reported and skipped.
    for candidate in ('distilbert', 'bert', 'roberta'):
        try:
            benchmark.load_model(candidate)
        except Exception as err:
            print(f"Skipping {candidate} due to error: {str(err)}")

    # Time every model that actually loaded on every sample text.
    print("\nRunning performance tests...")
    for sample in texts:
        print(f"\nTesting with text of {len(sample)} characters")
        for loaded_name in benchmark.models:
            print(f"Testing {loaded_name}...")
            benchmark.measure_performance(loaded_name, sample)

    summary = benchmark.show_results()
    print("\nComparison Results:")
    print(summary.to_string(index=False))

    return summary

# Run the model comparison only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

## 4. Model and Tokenizer Initialization

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes every text at construction and serves tensors per item."""

    def __init__(self, texts, labels, tokenizer, max_length=512):
        """Tokenize ``texts`` once, truncating/padding each to ``max_length``.

        Args:
            texts: Texts passed to ``tokenizer`` in a single call.
            labels: One label per text, indexable by position.
            tokenizer: Callable returning a mapping of field name to per-text values.
            max_length: Length that every sequence is truncated or padded to.
        """
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_length,
        )
        self.labels = labels

    def __len__(self):
        """Return the number of examples, which is the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields and the label of example ``idx`` as tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


def train_model(model, train_loader, val_loader, optimizer, num_epochs=3):
    """Fine-tune ``model`` and print the validation accuracy after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors) used for training.
        val_loader: Iterable of batches of the same form used for validation.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, after training.
    """
    # Take the device from the model itself instead of relying on a global
    # `device` variable that only exists if another cell has been run first.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        model.eval()
        # Count correct predictions over all samples. Averaging per-batch
        # accuracies gives a short final batch the same weight as a full one
        # and so skews the reported accuracy.
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                predictions = torch.argmax(outputs.logits, dim=1)
                correct += (predictions == labels).sum().item()
                total += labels.numel()

        # An empty validation set has no defined accuracy.
        val_accuracy = correct / total if total else float('nan')
        print(f"Epoch {epoch + 1}, Val Accuracy: {val_accuracy}")
    return model


In [None]:
model_name = 'distilbert'
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Raw texts and labels of the balanced train / test samples
train_texts = train_balanced['text'].tolist()
train_labels = train_balanced['label'].tolist()
test_texts = test_balanced['text'].tolist()
test_labels = test_balanced['label'].tolist()

# Create PyTorch datasets. TextDataset tokenizes the texts itself, so the
# separate tokenizer pass that used to precede this (and whose result was
# never used) has been dropped.
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)

# Create PyTorch data loaders (the test sample doubles as the validation set)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Move model to GPU when one is available, otherwise stay on CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer (no learning-rate scheduler is configured)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Train the model. train_model returns the model it was given; binding the
# result keeps the cell from echoing the full model repr as its output.
model = train_model(model, train_loader, val_loader, optimizer, num_epochs=3)

## 5. Optimizer Configuration

In [None]:
# `AdamW` is not imported as a bare name anywhere in this notebook, so it is
# referenced through torch.optim, matching the earlier training cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Use the GPU when available and make sure the model lives on that device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")

## 6. Training and Evaluation Functions

In [None]:
def train_epoch(model, dataloader, optimizer, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Every tensor in a batch is moved to ``device`` and passed to the model as
    keyword arguments; the model output must expose ``.loss``.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        optimizer.zero_grad()
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**on_device).loss
        batch_losses.append(loss.item())
        loss.backward()
        optimizer.step()
    # Average over the number of batches, not the number of samples.
    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` and return ``(accuracy, f1)``.

    Each batch must contain a ``'labels'`` entry. Predictions are the argmax of
    the model's logits along dimension 1, and F1 uses ``average='binary'``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in dataloader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**on_device).logits
            predicted += torch.argmax(logits, dim=1).cpu().tolist()
            expected += on_device['labels'].cpu().tolist()

    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='binary'),
    )

## 7. Main Training Loop

In [None]:
num_epochs = 3  # Adjust based on your needs and time constraints

# The data loaders created earlier in this notebook are named `train_loader`
# and `val_loader`; the `train_dataloader` / `val_dataloader` names used here
# before were never defined and raised a NameError.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    avg_train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f"Average training loss: {avg_train_loss:.4f}")

    accuracy, f1 = evaluate(model, val_loader, device)
    print(f"Validation Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")
    print("----")

print("Training completed!")

## 8. Save the Model

In [None]:
# Keep the target directory in one place so the save calls and the confirmation
# message cannot disagree (the message used to misspell it as './modles/...').
MODEL_DIR = './models/sentiment_model'

# Save the fine-tuned model and its tokenizer side by side in the same directory.
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
print(f"Model saved in '{MODEL_DIR}' directory")

## 9. Test the Model

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single review as ``"Positive"`` or ``"Negative"``.

    Args:
        text: The review text to classify.
        model: Classifier called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``.
        tokenizer: Callable tokenizer that returns ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded inputs are moved to; the model is expected
            to be on the same device already.
        max_length: Token length the input is truncated/padded to. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        ``"Positive"`` when the highest-scoring class index is 1, otherwise
        ``"Negative"``.
    """
    model.eval()
    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which accepts the same arguments.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1)

    return "Positive" if prediction.item() == 1 else "Negative"

# Test the model with some example reviews
# Spot-check the fine-tuned model on a few hand-written reviews
test_reviews = [
    "This movie was fantastic! I loved every minute of it.",
    "Absolutely terrible. Waste of time and money.",
    "It was okay, nothing special but not bad either.",
]

for review in test_reviews:
    sentiment = predict_sentiment(review, model, tokenizer, device)
    # One print call emits the same three lines as before, one item per line.
    print(
        f"Review: {review}",
        f"Predicted sentiment: {sentiment}",
        "----",
        sep="\n",
    )