In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd

# Configurations
# Everything a reader might want to tune lives here.
MODEL_NAME = "distilbert-base-uncased"  # checkpoint name handed to from_pretrained
NUM_LABELS = 2           # Adjust if you have more classes
BATCH_SIZE = 16
NUM_EPOCHS = 3           # Use fewer epochs for debugging
LEARNING_RATE = 2e-5
MAX_SEQ_LEN = 128        # every tokenized review is truncated/padded to this length
SEED = 42

# Seed torch's global RNG so that shuffling and the randomly initialised
# classifier weights are repeatable across "Restart & Run All".
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prepare Dataset
class AmazonReviewsDataset(Dataset):
    """Map-style dataset that tokenizes one review per requested item.

    Tokenization happens lazily in ``__getitem__``; nothing is pre-computed
    in the constructor.

    Args:
        texts: Indexable collection of review texts. Each entry is passed
            through ``str()`` before being tokenized.
        labels: Indexable collection of class labels, indexed in parallel
            with ``texts``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns a mapping of tensors carrying a
            leading batch dimension (presumably a Hugging Face tokenizer).
        max_length: Length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=MAX_SEQ_LEN):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it together with its label.

        Returns:
            dict: every entry produced by the tokenizer with the batch
            dimension squeezed away, plus ``"labels"`` as a ``torch.long``
            tensor.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        # return_tensors='pt' yields a leading batch dimension; drop it so the
        # DataLoader can stack individual items itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item



# Training function
def train_epoch(model, data_loader, criterion, optimizer, scheduler, device):
    """Run one optimization pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose ``.logits``. When ``criterion``
            is None the model is additionally given ``labels`` and its output
            must expose ``.loss``.
        data_loader: Sized iterable of batches. Each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        criterion: Callable ``criterion(logits, labels)`` returning the scalar
            loss to back-propagate. Pass None to use the model's own loss.
        optimizer: Optimizer; zeroed and stepped once per batch.
        scheduler: LR scheduler; stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy)`` where ``avg_loss`` is the float mean
        of the per-batch loss values and ``accuracy`` is a 0-dim float64
        tensor holding the fraction of correctly predicted samples.

    Raises:
        ValueError: If ``data_loader`` contains no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        # Fail with a clear message instead of a ZeroDivisionError at the end.
        raise ValueError("data_loader is empty; nothing to train on")

    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        if criterion is None:
            # No criterion supplied: let the model compute its built-in loss.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        else:
            # Honour the caller's criterion rather than silently ignoring it.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_samples += labels.size(0)

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions.double() / total_samples
    return avg_loss, accuracy

# Evaluation function
def evaluate(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose ``.logits``. When ``criterion``
            is None the model is additionally given ``labels`` and its output
            must expose ``.loss``.
        data_loader: Sized iterable of batches. Each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        criterion: Callable ``criterion(logits, labels)`` returning the scalar
            loss to report. Pass None to use the model's own loss.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy)`` where ``avg_loss`` is the float mean
        of the per-batch loss values and ``accuracy`` is a 0-dim float64
        tensor holding the fraction of correctly predicted samples.

    Raises:
        ValueError: If ``data_loader`` contains no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        # Fail with a clear message instead of a ZeroDivisionError at the end.
        raise ValueError("data_loader is empty; nothing to evaluate")

    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    # Gradients are not needed for scoring, so skip building the autograd graph.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            if criterion is None:
                # No criterion supplied: let the model compute its built-in loss.
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            else:
                # Honour the caller's criterion rather than silently ignoring it.
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
            logits = outputs.logits

            total_loss += loss.item()
            preds = logits.argmax(dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_samples += labels.size(0)

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions.double() / total_samples
    return avg_loss, accuracy

In [5]:
# Load data from CSV
# The file has no header row; its three columns are named here as
# "sentiment", "title" and "review".
train_val = pd.read_csv(
    'amazon_review_polarity_csv/train.csv',
    header=None,
    names=['sentiment', 'title', 'review'],
    nrows=1000,  # parse only the rows that are used instead of loading the whole file and slicing
)
# Binarize the label: 2 becomes 1, every other value becomes 0
# (presumably 2 = positive, 1 = negative -- confirm against the dataset README).
train_val['sentiment'] = (train_val['sentiment'] == 2).astype('int64')
# Empty title/review cells are read as NaN, and NaN + str stays NaN, which
# would later be tokenized as the literal text "nan". Treat them as empty strings.
train_val['review'] = (
    train_val['title'].fillna('') + ' ' + train_val['review'].fillna('')
).str.strip()
train_val = train_val.drop(columns=['title'])
train_df, val_df = train_test_split(train_val, test_size=0.2, random_state=42)

# Tokenizer and datasets (tokenization itself happens lazily, per item)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_dataset = AmazonReviewsDataset(
    texts=train_df["review"].tolist(),
    labels=train_df["sentiment"].tolist(),
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN
)
val_dataset = AmazonReviewsDataset(
    texts=val_df["review"].tolist(),
    labels=val_df["sentiment"].tolist(),
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN
)

# Only the training loader is shuffled; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Model initialization
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
model = model.to(device)

# Loss, optimizer, and scheduler setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Training loop
for epoch in range(NUM_EPOCHS):
    # One optimization pass over the training data, then a validation pass
    # with the freshly updated weights.
    train_loss, train_acc = train_epoch(
        model, train_loader, criterion, optimizer, scheduler, device
    )
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)

    # Emit the three-line epoch summary in a single write.
    epoch_report = [
        f"Epoch {epoch+1}/{NUM_EPOCHS}",
        f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}",
        f"Val Loss:   {val_loss:.4f} | Val Accuracy:   {val_acc:.4f}",
    ]
    print("\n".join(epoch_report))

Epoch 1/3
Train Loss: 0.5766 | Train Accuracy: 0.6987
Val Loss:   0.2812 | Val Accuracy:   0.9250
Epoch 2/3
Train Loss: 0.2279 | Train Accuracy: 0.9250
Val Loss:   0.1993 | Val Accuracy:   0.9100
Epoch 3/3
Train Loss: 0.1223 | Train Accuracy: 0.9738
Val Loss:   0.1869 | Val Accuracy:   0.9300
