## Imports

In [1]:
from data_trf import load_imdb, load_imdb_synth, load_xor

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import warnings
warnings.filterwarnings("ignore")

  cpu = _conversion_method_template(device=torch.device("cpu"))


## Loading and Inspecting the data

In [10]:
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=False)

In [12]:
print(type(x_train), type(y_train), type(i2w), type(w2i))

<class 'list'> <class 'list'> <class 'list'> <class 'dict'>


In [16]:
x_train[141]

[880,
 4,
 122,
 19,
 127,
 1035,
 12,
 4,
 481,
 7,
 4372,
 7913,
 618,
 5,
 32,
 15,
 146,
 33,
 585,
 678,
 37,
 34,
 623,
 12,
 348,
 20,
 30,
 14253,
 3414,
 5,
 13112,
 28,
 969,
 147,
 30,
 105,
 5,
 30,
 1816,
 7,
 118]

In [13]:
print(i2w[141], w2i['film'])

ve 23


In [15]:
print([i2w[w] for w in x_train[141]])

['possibly', 'the', 'best', 'movie', 'ever', 'created', 'in', 'the', 'history', 'of', 'jeffrey', 'combs', 'career', 'and', 'one', 'that', 'should', 'be', 'looked', 'upon', 'by', 'all', 'talent', 'in', 'hollywood', 'for', 'his', 'versatility', 'charisma', 'and', 'uniqueness', 'he', 'brings', 'through', 'his', 'characters', 'and', 'his', 'knowledge', 'of', 'acting']


In [21]:
w2i['.pad']

0

## Question 1:  Padding & Converting to torch tensors

In [11]:
def pad_and_convert(seqs, pad_idx):
    # 1. Find max sequence length
    max_len = max(len(s) for s in seqs)

    padded = []
    for s in seqs:
        # 2. Pad with pad_idx to max_len
        padded_seq = s + [pad_idx] * (max_len - len(s))
        padded.append(padded_seq)

    # 3. Convert to tensor (long dtype)
    return torch.tensor(padded, dtype=torch.long)


def prepare_data(x_train, y_train, x_val, y_val, w2i):
    pad_idx = w2i[".pad"]

    x_train_t = pad_and_convert(x_train, pad_idx)
    x_val_t   = pad_and_convert(x_val, pad_idx)

    y_train_t = torch.tensor(y_train, dtype=torch.long)
    y_val_t   = torch.tensor(y_val, dtype=torch.long)

    return x_train_t, y_train_t, x_val_t, y_val_t

x_train_t, y_train_t, x_val_t, y_val_t = prepare_data(x_train, y_train, x_val, y_val, w2i)

print(x_train_t.shape)
print(y_train_t.shape)

torch.Size([20000, 2514])
torch.Size([20000])


In [28]:
x_train_t[2]

tensor([10721,     4, 10956,  ...,     0,     0,     0])

In [29]:
y_train_t[2]

tensor(1)

## Question 2: Building Model

In [31]:
class BaselineClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim, num_classes):
        super().__init__()
        
        self.emb = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim
        )
        
        self.classifier = nn.Linear(emb_dim, num_classes)

    def forward(self, x):
        """
        x: (batch, time), dtype long
        """
        # 1. Embedding
        emb = self.emb(x)  # → (batch, time, emb)

        # 2. Global average pool over time dimension
        pooled = emb.mean(dim=1)  # → (batch, emb)

        # 3. Linear projection to classes
        logits = self.classifier(pooled)  # → (batch, num_classes)

        return logits

model = BaselineClassifier(
    vocab_size=len(w2i),
    emb_dim=300,
    num_classes=numcls
)

# logits = model(x_train_t[:32])
# print(logits.shape)

torch.Size([32, 2])


## Question 3: 3 different global pools

In [2]:
# Model with 3 global pooling options
class BaselinePool(nn.Module):
    def __init__(self, vocab_size, emb_dim, num_classes, pool_type="mean"):
        super().__init__()
        self.pool_type = pool_type 
        self.emb = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim
        )
        self.classifier = nn.Linear(emb_dim, num_classes)

    def forward(self, x):
        emb = self.emb(x) 

        if self.pool_type == "mean":
            pooled = emb.mean(dim=1)
        elif self.pool_type == "max":
            pooled = emb.max(dim=1).values
        elif self.pool_type == "first":
            pooled = emb[:, 0, :]

        logits = self.classifier(pooled)
        return logits

In [3]:
def accuracy(logits, y):
    preds = logits.argmax(dim=1)
    return (preds == y).float().mean().item()

def train_one_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0

    for xb, yb in loader: # iterate through batches
        xb, yb = xb.to(device), yb.to(device) # move batch to either CPU or GPU 

        optimizer.zero_grad() # clear gradients

        logits = model(xb) # feed the input batch through the model
        loss = F.cross_entropy(logits, yb) # compute loss
        loss.backward() # compute gradients
        optimizer.step() # update model parameters using computed gradients 

        total_loss += loss.item() * xb.size(0)

    return total_loss / len(loader.dataset)

def evaluate(model, loader, device):
    model.eval()
    total_acc = 0

    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            total_acc += accuracy(logits, yb) * xb.size(0)

    return total_acc / len(loader.dataset)

def make_loaders(x_train, y_train, x_val, y_val, batch_size):
    train_ds = TensorDataset(x_train, y_train)
    val_ds = TensorDataset(x_val, y_val)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size)
    return train_loader, val_loader


def run_experiment(dataset_name, x_train, y_train, x_val, y_val, w2i, num_classes,
                   pool_type="mean", emb_dim=300, batch_size=256, epochs=5, lr=1e-3):

    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    vocab_size = len(w2i)

    model = BaselinePool(
        vocab_size=vocab_size,
        emb_dim=emb_dim,
        num_classes=num_classes,
        pool_type=pool_type
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    x_train_t, y_train_t, x_val_t, y_val_t = prepare_data(x_train, y_train, x_val, y_val, w2i)
    train_loader, val_loader = make_loaders(x_train_t, y_train_t, x_val_t, y_val_t, batch_size)

    print(f"\n=== {dataset_name.upper()} — Pool: {pool_type} ===")
    for epoch in range(1, epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, device)
        val_acc = evaluate(model, val_loader, device)
        print(f"Epoch {epoch}: train_loss={train_loss:.4f}, val_acc={val_acc:.4f}")

    return model


### Train on the imdb dataset

In [42]:

(x_tr1, y_tr1), (x_va1, y_va1), (i2w1, w2i1), num_cls1 = load_imdb()

In [43]:
# batch_size=256, epochs=5, lr=1e-3
for pool in ["mean", "max", "first"]:
    run_experiment(
        "imdb",
        x_tr1, y_tr1,
        x_va1, y_va1,
        w2i1, num_cls1,
        pool_type=pool
    )

Device: cpu

=== IMDB — Pool: mean ===
Epoch 1: train_loss=0.6946, val_acc=0.5064
Epoch 2: train_loss=0.6930, val_acc=0.5762
Epoch 3: train_loss=0.6887, val_acc=0.5362
Epoch 4: train_loss=0.6801, val_acc=0.5218
Epoch 5: train_loss=0.6720, val_acc=0.6184
Device: cpu

=== IMDB — Pool: max ===
Epoch 1: train_loss=0.7418, val_acc=0.5320
Epoch 2: train_loss=0.6763, val_acc=0.5950
Epoch 3: train_loss=0.6406, val_acc=0.6584
Epoch 4: train_loss=0.5974, val_acc=0.6288
Epoch 5: train_loss=0.5665, val_acc=0.7454
Device: cpu

=== IMDB — Pool: first ===
Epoch 1: train_loss=0.7096, val_acc=0.5152
Epoch 2: train_loss=0.6858, val_acc=0.5266
Epoch 3: train_loss=0.6754, val_acc=0.5212
Epoch 4: train_loss=0.6662, val_acc=0.5416
Epoch 5: train_loss=0.6582, val_acc=0.5374


In [46]:
run_experiment(
        "imdb",
        x_tr1, y_tr1,
        x_va1, y_va1,
        w2i1, num_cls1,
        pool_type="max",
        batch_size=256,
        lr=1e-2,
)

Device: cpu

=== IMDB — Pool: max ===
Epoch 1: train_loss=1.1391, val_acc=0.7210
Epoch 2: train_loss=0.4263, val_acc=0.7656
Epoch 3: train_loss=0.2624, val_acc=0.8760
Epoch 4: train_loss=0.1702, val_acc=0.8778
Epoch 5: train_loss=0.1124, val_acc=0.8852


BaselinePool(
  (emb): Embedding(99430, 300)
  (classifier): Linear(in_features=300, out_features=2, bias=True)
)

### Train on the imdb synth dataset 

In [4]:
(x_tr2, y_tr2), (x_va2, y_va2), (i2w2, w2i2), num_cls2 = load_imdb_synth()

In [13]:
# Inspecting data (because it threw an index out of bound error, I had to remove duplicate strings from i2w in load_imdb_synth())
print(len(i2w2))
print(len(w2i2))

max_token = max(max(seq) for seq in x_tr2)
vocab_size = len(w2i2)
print("max token:", max_token)
print("vocab size:", vocab_size)
assert max_token < vocab_size, "ERROR: token index exceeds vocabulary size!"

invalid_tokens = set(
    idx
    for seq in x_tr2
    for idx in seq
    if idx >= vocab_size
)
print("Invalid token IDs:", invalid_tokens)
for bad in invalid_tokens:
    print(bad, "→", i2w2[bad] if bad < len(i2w2) else "(not in i2w2)")

69
69
max token: 68
vocab size: 69
Invalid token IDs: set()


In [14]:
run_experiment("imdb_synth",
               x_tr2, y_tr2,
               x_va2, y_va2,
               w2i2, num_cls2,
               pool_type="max",
               batch_size=256,
               epochs=5,
               lr=1e-2)


=== IMDB_SYNTH — Pool: max ===
Epoch 1: train_loss=0.5434, val_acc=1.0000
Epoch 2: train_loss=0.0186, val_acc=1.0000
Epoch 3: train_loss=0.0045, val_acc=1.0000
Epoch 4: train_loss=0.0021, val_acc=1.0000
Epoch 5: train_loss=0.0013, val_acc=1.0000


BaselinePool(
  (emb): Embedding(69, 300)
  (classifier): Linear(in_features=300, out_features=2, bias=True)
)

### Train on the xor dataset

In [15]:
(x_tr3, y_tr3), (x_va3, y_va3), (i2w3, w2i3), num_cls3 = load_xor()
run_experiment(
        "xor",
        x_tr3, y_tr3,
        x_va3, y_va3,
        w2i3, num_cls3,
        pool_type="max",
        batch_size=256,
        lr=1e-2,
)


=== XOR — Pool: max ===
Epoch 1: train_loss=0.0159, val_acc=1.0000
Epoch 2: train_loss=0.0000, val_acc=1.0000
Epoch 3: train_loss=0.0000, val_acc=1.0000
Epoch 4: train_loss=0.0000, val_acc=1.0000
Epoch 5: train_loss=0.0000, val_acc=1.0000


BaselinePool(
  (emb): Embedding(6, 300)
  (classifier): Linear(in_features=300, out_features=2, bias=True)
)

## Question 4: adding self attention layer

In [16]:
class SelfAttention(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        # x = (batch, time, emb)
        # 1. Compute attention scores: (batch, time, time)
        scores = torch.matmul(x, x.transpose(1, 2))

        # 2. Softmax over the time dimension
        attn = F.softmax(scores, dim=-1)

        # 3. Weighted sum of values
        out = torch.matmul(attn, x)  # (batch, time, emb)
        return out

class BaselineWithAttention(nn.Module):
    def __init__(self, vocab_size, emb_dim, num_classes):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.attn = SelfAttention()
        self.classifier = nn.Linear(emb_dim, num_classes)

    def forward(self, x):
        # x: (batch, time)
        x = self.emb(x)              # (batch, time, emb)
        x = self.attn(x)             # (batch, time, emb)
        x = x.max(dim=1).values      # global max pooling over time
        logits = self.classifier(x)  # (batch, num_classes)
        return logits


## Question 5

In [17]:
class AttnSelectModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, num_classes):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.attn = SelfAttention()          # from Q4
        self.classifier = nn.Linear(emb_dim, num_classes)

    def forward(self, x):
        x = self.emb(x)                      # (batch, time, emb)
        x = self.attn(x)                     # (batch, time, emb)

        # SELECT POOLING: take first position only
        x = x[:, 0, :]                       # (batch, emb)

        logits = self.classifier(x)          # (batch, num_classes)
        return logits

In [22]:
def run_experiment_q5(dataset_name, x_train, y_train, x_val, y_val, w2i, num_classes,
                   pool_type="mean", emb_dim=300, batch_size=256, epochs=5, lr=1e-3):

    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    vocab_size = len(w2i)

    model = BaselineWithAttention(
        vocab_size=vocab_size,
        emb_dim=emb_dim,
        num_classes=num_classes
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    x_train_t, y_train_t, x_val_t, y_val_t = prepare_data(x_train, y_train, x_val, y_val, w2i)
    train_loader, val_loader = make_loaders(x_train_t, y_train_t, x_val_t, y_val_t, batch_size)

    print(f"\n=== {dataset_name.upper()} — Pool: {pool_type} ===")
    for epoch in range(1, epochs + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, device)
        val_acc = evaluate(model, val_loader, device)
        print(f"Epoch {epoch}: train_loss={train_loss:.4f}, val_acc={val_acc:.4f}")

    return model

In [23]:
# Train on imdb
MAX_LEN = 256

(x_tr1, y_tr1), (x_va1, y_va1), (i2w1, w2i1), num_cls1 = load_imdb()
x_tr1 = [seq[:MAX_LEN] for seq in x_tr1]
x_va1   = [seq[:MAX_LEN] for seq in x_va1]

run_experiment_q5(
    "imdb_attn_select",
    x_tr1, y_tr1,
    x_va1, y_va1,
    w2i1, num_cls1,
    emb_dim=300,
    batch_size=256,
    epochs=5,
    lr=1e-2
)


=== IMDB_ATTN_SELECT — Pool: mean ===
Epoch 1: train_loss=1.1021, val_acc=0.5554
Epoch 2: train_loss=0.4529, val_acc=0.8468
Epoch 3: train_loss=0.2644, val_acc=0.8688
Epoch 4: train_loss=0.1708, val_acc=0.8662
Epoch 5: train_loss=0.1118, val_acc=0.8800


BaselineWithAttention(
  (emb): Embedding(99430, 300)
  (attn): SelfAttention()
  (classifier): Linear(in_features=300, out_features=2, bias=True)
)

In [25]:
# Train on imdb synth
(x_tr2, y_tr2), (x_va2, y_va2), (i2w2, w2i2), num_cls2 = load_imdb_synth()
run_experiment_q5(
    "imdb_synth_attn_select",
    x_tr2, y_tr2,
    x_va2, y_va2,
    w2i2, num_cls2,
    emb_dim=300,
    batch_size=32,
    epochs=5,
    lr=1e-3
)


=== IMDB_SYNTH_ATTN_SELECT — Pool: mean ===
Epoch 1: train_loss=0.2466, val_acc=1.0000
Epoch 2: train_loss=0.0139, val_acc=1.0000
Epoch 3: train_loss=0.0034, val_acc=1.0000
Epoch 4: train_loss=0.0014, val_acc=1.0000
Epoch 5: train_loss=0.0007, val_acc=1.0000


BaselineWithAttention(
  (emb): Embedding(69, 300)
  (attn): SelfAttention()
  (classifier): Linear(in_features=300, out_features=2, bias=True)
)

In [27]:
# Train on xor
(x_tr3, y_tr3), (x_va3, y_va3), (i2w3, w2i3), num_cls3 = load_xor()
run_experiment_q5(
    "xor_attn_select",
    x_tr3, y_tr3,
    x_va3, y_va3,
    w2i3, num_cls3,
    emb_dim=300,
    batch_size=32,
    epochs=5,
    lr=1e-3
)


=== XOR_ATTN_SELECT — Pool: mean ===
Epoch 1: train_loss=0.0126, val_acc=1.0000
Epoch 2: train_loss=0.0002, val_acc=1.0000
Epoch 3: train_loss=0.0001, val_acc=1.0000
Epoch 4: train_loss=0.0000, val_acc=1.0000
Epoch 5: train_loss=0.0000, val_acc=1.0000


BaselineWithAttention(
  (emb): Embedding(6, 300)
  (attn): SelfAttention()
  (classifier): Linear(in_features=300, out_features=2, bias=True)
)