<a href="https://colab.research.google.com/github/dattali18/IR_Assignments/blob/main/Assignment.04/IR_04_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.3.1


In [3]:
!pip install pandas numpy torch transformers datasets scikit-learn tqdm

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Installing collected packages: xxhash, fsspec, dill, multiprocess, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 202

In [4]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [21]:
link = "https://github.com/dattali18/IR_Assignments/blob/main/Assignment.04/sentences.csv?raw=true"
df = pd.read_csv(link)
texts = df['sentence'].values
labels = df['label'].values

In [22]:
# take a subset of the df 100 from each of the 5 classes
df = df.groupby('label').apply(lambda x: x.sample(n=500, random_state=42)).reset_index(drop=True)
texts = df['sentence'].values
labels = df['label'].values

In [23]:
# check if the df contains 500 instances
df.groupby('label').count()

Unnamed: 0_level_0,id,sentence,type
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,500,476,500
1,500,500,500
2,500,500,500
3,500,500,500
4,500,500,500


In [24]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.15, random_state=42
)

In [10]:
# Create dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [11]:
# Create model class
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes=5):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.drop = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(output[0][:, 0, :])
        return self.fc(output)

In [25]:
# Initialize tokenizer and create datasets
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

In [26]:
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [27]:
# Initialize model and move to GPU
model = SentimentClassifier()
model = model.to(device)

In [28]:
# Training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
n_epochs = 5

In [29]:
# Training loop
def train_model():
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc='Training'):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

In [30]:
# Evaluation loop
def evaluate_model():
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            _, predictions = torch.max(outputs, dim=1)

            total_loss += loss.item()
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = np.mean(np.array(all_predictions) == np.array(all_labels))
    return total_loss / len(val_loader), accuracy

In [32]:
# Training
for epoch in range(n_epochs):
    print(f'\nEpoch {epoch + 1}/{n_epochs}')
    train_loss = train_model()
    val_loss, val_accuracy = evaluate_model()

    print(f'Training Loss: {train_loss:.4f}')
    print(f'Validation Loss: {val_loss:.4f}')
    print(f'Validation Accuracy: {val_accuracy:.4f}')


Epoch 1/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.44it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.35it/s]


Training Loss: 0.0031
Validation Loss: 0.1000
Validation Accuracy: 0.9733

Epoch 2/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.43it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.54it/s]


Training Loss: 0.0020
Validation Loss: 0.1054
Validation Accuracy: 0.9760

Epoch 3/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.52it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.69it/s]


Training Loss: 0.0015
Validation Loss: 0.1095
Validation Accuracy: 0.9787

Epoch 4/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.53it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.49it/s]


Training Loss: 0.0012
Validation Loss: 0.1156
Validation Accuracy: 0.9787

Epoch 5/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.48it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.55it/s]


Training Loss: 0.0011
Validation Loss: 0.1167
Validation Accuracy: 0.9787

Epoch 6/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.44it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.71it/s]


Training Loss: 0.0009
Validation Loss: 0.1186
Validation Accuracy: 0.9787

Epoch 7/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.50it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.49it/s]


Training Loss: 0.0008
Validation Loss: 0.1199
Validation Accuracy: 0.9787

Epoch 8/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.51it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 16.53it/s]


Training Loss: 0.0011
Validation Loss: 0.1606
Validation Accuracy: 0.9707

Epoch 9/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.51it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 14.75it/s]


Training Loss: 0.0042
Validation Loss: 0.1117
Validation Accuracy: 0.9787

Epoch 10/10


Training: 100%|██████████| 133/133 [00:24<00:00,  5.53it/s]
Evaluating: 100%|██████████| 24/24 [00:01<00:00, 14.55it/s]

Training Loss: 0.0007
Validation Loss: 0.1146
Validation Accuracy: 0.9787





In [34]:
# Save the model
torch.save(model.state_dict(), 'sentiment_model_v2.pth')