# Train SST-2 Sentiment Classifier

### Inspect Raw Data

In [1]:
import pandas as pd

df = pd.read_csv(".data/sst2/train.tsv", sep="\t")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Label distribution:\n{df['label'].value_counts()}")
df.head()

Shape: (67349, 2)
Columns: ['sentence', 'label']
Label distribution:
label
1    37569
0    29780
Name: count, dtype: int64


Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0


### Load as SequenceDataset

In [2]:
from ml_project_template.data import SequenceDataset

dataset = SequenceDataset.from_csv(
    ".data/sst2/train.tsv",
    text_column="sentence",
    label_column="label",
    max_vocab_size=10000,
)

print(f"Samples: {len(dataset)}")
print(f"Vocab size: {len(dataset.vocab)}")
print(f"Classes: {dataset.class_names}")

Samples: 67349
Vocab size: 10002
Classes: ['0', '1']


In [3]:
import numpy as np

# Sequence length distribution
lengths = [len(s) for s in dataset.sequences]
print(
    f"Sequence length — min: {min(lengths)}, "\
    f"max: {max(lengths)}, mean: {np.mean(lengths):.1f}, "\
    f"p95: {np.percentile(lengths, 95):.0f}"
)

# Sample encoding
idx = 0
inv_vocab = {v: k for k, v in dataset.vocab.items()}
print(f"\nRaw text: {df['sentence'].iloc[idx]}")
print(f"Token IDs: {dataset.sequences[idx]}")
print(f"Decoded: {[inv_vocab[i] for i in dataset.sequences[idx]]}")

Sequence length — min: 0, max: 48, mean: 8.9, p95: 24

Raw text: hide new secretions from the parental units 
Token IDs: [4564, 87, 1, 33, 2, 7150, 8684]
Decoded: ['hide', 'new', '<UNK>', 'from', 'the', 'parental', 'units']


### Split Dataset

In [4]:
# Time-aware split — no shuffle, splits by position
train_data, val_data = dataset.split(test_size=0.1)

print(f"Train: {len(train_data)} samples")
print(f"Val:   {len(val_data)} samples")

# Both splits share the same vocabulary
assert train_data.vocab is val_data.vocab

Train: 60614 samples
Val:   6735 samples


### Create DataLoaders

In [5]:
# Variable-length mode: sequences padded to longest in each batch
train_loader = train_data.to_pytorch(batch_size=16, shuffle=True)

X_batch, y_batch = next(iter(train_loader))
print(f"Variable-length batch — X: {X_batch.shape}, y: {y_batch.shape}")
print(f"(batch_size=32, seq_len=max length in this batch)")

Variable-length batch — X: torch.Size([16, 28]), y: torch.Size([16])
(batch_size=32, seq_len=max length in this batch)


In [6]:
# Fixed-length mode: all sequences truncated/padded to seq_length
train_loader_fixed = train_data.to_pytorch(batch_size=16, shuffle=True, seq_length=32)

X_batch, y_batch = next(iter(train_loader_fixed))
print(f"Fixed-length batch — X: {X_batch.shape}, y: {y_batch.shape}")
print(f"(batch_size=32, seq_len=64 always)")

Fixed-length batch — X: torch.Size([16, 32]), y: torch.Size([16])
(batch_size=32, seq_len=64 always)


### List Models

In [7]:
from ml_project_template.models import ModelRegistry

print(ModelRegistry.list())

  from .autonotebook import tqdm as notebook_tqdm


['gb_classifier', 'mlp_classifier']


In [8]:
from ml_project_template.modules import SequenceCNN

model = SequenceCNN(
    embed_dims=[len(train_data.vocab), 64],
    kernel_spec=[
        [3, 16, 2],
        [3, 32, 1],
        [3, 64, 1]
    ],
    seq_length=32,
    output_dim=3,
    padding_idx=train_data.vocab['<PAD>'],
    hidden_activation="Tanh",
    output_activation="Identity",
    use_bias=True
)

for batch in train_loader_fixed:
    inputs, labels = batch
    break

outputs = model(inputs)

In [9]:
model

SequenceCNN(
  (embedding): Embedding(10002, 64, padding_idx=0)
  (cnn): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 64), stride=(2, 1))
    (1): Transpose()
    (2): Tanh()
    (3): Conv2d(1, 32, kernel_size=(3, 16), stride=(1, 1))
    (4): Transpose()
    (5): Tanh()
    (6): Conv2d(1, 64, kernel_size=(3, 32), stride=(1, 1))
    (7): Transpose()
    (8): Identity()
  )
  (linear): Linear(in_features=704, out_features=3, bias=True)
)

In [10]:
inputs.shape

torch.Size([16, 32])

In [11]:
outputs.shape

torch.Size([16, 3])