<a href="https://colab.research.google.com/github/github-ashwin/DeepLearning-Lab/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sentiment Analysis on SST-2 dataset using a Basic Transformer Encoder
=====================================================================

This program:
1. Loads the SST-2 dataset from HuggingFace Datasets
2. Tokenizes the text data
3. Defines a Transformer Encoder model from scratch using PyTorch
4. Trains the model for binary sentiment classification (positive/negative)
5. Evaluates the model on the validation set
6. Allows the user to test sentiment on custom input text

Dependencies:
- torch
- transformers
- datasets

In [1]:
!pip install transformers datasets



In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from datasets import load_dataset
import torch.optim as optim

In [3]:
# Load dataset

dataset = load_dataset("glue", "sst2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

MAX_LEN = 50

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
def encode_batch(batch):
    """
    Tokenizes and encodes a batch of sentences with padding/truncation.
    """
    encodings = tokenizer(
        batch["sentence"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": batch["label"],
    }

Apply Encoding

In [7]:
dataset = dataset.map(encode_batch, batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [8]:
# Format for PyTorch
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

train_loader = DataLoader(dataset["train"], batch_size=32, shuffle=True)
val_loader = DataLoader(dataset["validation"], batch_size=32)

### Transformer Encoder Model

In [9]:
class TransformerSentiment(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, num_classes=2):
        super(TransformerSentiment, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_embedding = nn.Embedding(MAX_LEN, d_model)

        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """
        Forward pass:
        1. Embed tokens and add positional encoding
        2. Pass through Transformer Encoder
        3. Apply pooling (take first token as [CLS])
        4. Classification layer
        """
        seq_len = input_ids.size(1)
        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0)

        x = self.embedding(input_ids) + self.pos_embedding(positions)

        # Transformer expects [seq_len, batch, hidden_dim]
        x = x.permute(1, 0, 2)

        out = self.transformer(x)

        # Take first token (similar to [CLS])
        out = out[0]

        return self.fc(out)

In [10]:
# Training Setup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = TransformerSentiment(vocab_size=tokenizer.vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-4)

Using device: cuda




In [11]:
EPOCHS = 2  # small for demo (increase for better accuracy)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Training Loss: {avg_loss:.4f}")

Epoch 1/2, Training Loss: 0.6156
Epoch 2/2, Training Loss: 0.4774


In [14]:
# Validation
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids)
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

acc = correct / total
print(f"Validation Accuracy: {acc:.4f}")

Validation Accuracy: 0.7672


In [15]:
# User Input

def predict_sentiment(sentence):
    model.eval()
    enc = tokenizer(
        sentence,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_tensors="pt",
    )
    input_ids = enc["input_ids"].to(device)

    with torch.no_grad():
        outputs = model(input_ids)
        pred = torch.argmax(outputs, dim=1).item()

    return "Positive" if pred == 1 else "Negative"

In [16]:
# Custom Input

print("\nSample Predictions:")
print("Sentence: I really loved the movie! →", predict_sentiment("I really loved the movie!"))
print("Sentence: This film was terrible and boring. →", predict_sentiment("This film was terrible and boring."))


Sample Predictions:
Sentence: I really loved the movie! → Positive
Sentence: This film was terrible and boring. → Negative
