<a href="https://colab.research.google.com/github/dhananjaneyulu/Assignment-2-Neural-Language-Model-Training-PyTorch-/blob/main/Neural_Language_Model_Training_(PyTorch)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q tqdm pandas matplotlib

import os, time, math, json, random, re
from pathlib import Path
from collections import Counter
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [2]:
# %%
# 2) Reproducibility & device
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print('Device ->', DEVICE)

# %%
# 3) Paths and helper functions
DATA_FILE = '/content/Pride_and_Prejudice-Jane_Austen.txt'  # upload this file to Colab
RESULTS_DIR = 'results'
os.makedirs('data', exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

Device -> cpu


In [3]:
def clean_text(in_path, out_path):
    txt = open(in_path, encoding='utf-8').read()
    s = txt.find('*** START OF')
    e = txt.find('*** END OF')
    if s!=-1 and e!=-1:
        txt = txt[s:e]
    txt = txt.replace('\r','\n')
    txt = re.sub('\n{2,}','\n', txt)
    txt = re.sub(' +',' ', txt)
    txt = txt.strip()
    open(out_path,'w',encoding='utf-8').write(txt)
    return txt


In [6]:

CLEAN_FILE = 'data/pride_clean.txt'
if not os.path.exists(CLEAN_FILE):
    print('Cleaning text...')
    clean_text(DATA_FILE, CLEAN_FILE)
else:
    print('Clean text exists.')


Cleaning text...


In [7]:
# %%
# 4) Tokenization & vocab
def tokenize(text):
    return re.findall(r"\w+|[^\w\s]", text)

text = open(CLEAN_FILE, encoding='utf-8').read()
tokens = tokenize(text)
print('Total tokens:', len(tokens))

MIN_FREQ = 2
counter = Counter(tokens)
vocab_list = [tok for tok,cnt in counter.items() if cnt>=MIN_FREQ]
special = ['<pad>','<unk>','<bos>','<eos>']
itos = special + sorted(vocab_list)
stoi = {w:i for i,w in enumerate(itos)}
VOCAB_SIZE = len(itos)
print('Vocab size:', VOCAB_SIZE)

# encode
ids = [stoi.get(t, stoi['<unk>']) for t in tokens]

# %%

Total tokens: 151819
Vocab size: 4417


In [8]:
# %%
# 5) Train/Val/Test split (contiguous)
N = len(ids)
train_ids = ids[:int(0.8*N)]
val_ids = ids[int(0.8*N):int(0.9*N)]
test_ids = ids[int(0.9*N):]
print(len(train_ids), len(val_ids), len(test_ids))

# %%

121455 15182 15182


In [9]:
class LMSeqDataset(Dataset):
    def __init__(self, ids, seq_len, stride=1):
        self.ids = ids
        self.seq_len = seq_len
        self.stride = stride
        self.starts = list(range(0, max(0, len(ids)-seq_len), stride))
    def __len__(self):
        return len(self.starts)
    def __getitem__(self, idx):
        s = self.starts[idx]
        x = torch.tensor(self.ids[s:s+self.seq_len], dtype=torch.long)
        y = torch.tensor(self.ids[s+1:s+self.seq_len+1], dtype=torch.long)
        return x, y

In [10]:
class LSTMLM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    def forward(self, x, hidden=None):
        emb = self.embed(x)
        out, hidden = self.lstm(emb, hidden)
        logits = self.fc(out)
        return logits, hidden

In [11]:
underfit_cfg = {
    'seq_len': 30, 'batch_size': 64, 'stride':5,
    'emb_dim': 50, 'hidden': 32, 'layers':1, 'dropout':0.5,
    'lr':1e-3, 'epochs':8, 'clip':1.0
}

train_seq_len = underfit_cfg['seq_len']
train_stride = underfit_cfg['stride']

train_dataset = LMSeqDataset(train_ids, train_seq_len, stride=train_stride)

print(f"Number of sequences in train_dataset: {len(train_dataset)}")

# You can also access individual items (e.g., the first one)
x_sample, y_sample = train_dataset[0]
print(f"\nSample input sequence (x_sample) shape: {x_sample.shape}")
print(f"Sample target sequence (y_sample) shape: {y_sample.shape}")
print(f"Sample input sequence (first 5 tokens): {x_sample[:5]}")
print(f"Sample target sequence (first 5 tokens): {y_sample[:5]}")

Number of sequences in train_dataset: 24285

Sample input sequence (x_sample) shape: torch.Size([30])
Sample target sequence (y_sample) shape: torch.Size([30])
Sample input sequence (first 5 tokens): tensor([ 347,  297,  155, 1619,   13])
Sample target sequence (first 5 tokens): tensor([ 297,  155, 1619,   13,  295])


In [12]:
underfit_cfg = {
    'seq_len': 30, 'batch_size': 64, 'stride':5,
    'emb_dim': 50, 'hidden': 32, 'layers':1, 'dropout':0.5,
    'lr':1e-3, 'epochs':8, 'clip':1.0
}

In [13]:

# %%
# 8) Training & evaluation functions
def evaluate(model, loader, device):
    model.eval()
    total_loss = 0.0
    total_toks = 0
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='mean')
    with torch.no_grad():
        for x,y in loader:
            x=x.to(device); y=y.to(device)
            logits,_ = model(x)
            loss = criterion(logits.view(-1, VOCAB_SIZE), y.view(-1))
            total_loss += loss.item() * x.size(0) * x.size(1)
            total_toks += x.size(0) * x.size(1)
    return total_loss / total_toks


def train_one_epoch(model, loader, opt, device, clip=1.0):
    model.train()
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='mean')
    total_loss = 0.0
    total_toks = 0
    t0 = time.time()
    for x,y in loader:
        x=x.to(device); y=y.to(device)
        opt.zero_grad()
        logits,_ = model(x)
        loss = criterion(logits.view(-1, VOCAB_SIZE), y.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        opt.step()
        total_loss += loss.item() * x.size(0) * x.size(1)
        total_toks += x.size(0) * x.size(1)
    t1 = time.time()
    return total_loss/total_toks, (t1-t0)

# %%

In [14]:

# %%
# 9) Experiment runner (runs until epochs or early stop)
def run_experiment(name, config, early_stopping=None):
    print('\n=== Running:', name, '===')
    seq_len = config['seq_len']
    batch = config['batch_size']
    stride = config.get('stride',1)
    train_ds = LMSeqDataset(train_ids, seq_len, stride=stride)
    val_ds = LMSeqDataset(val_ids, seq_len, stride=seq_len)  # val with non-overlapping windows
    train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, drop_last=False)

    model = LSTMLM(VOCAB_SIZE, config['emb_dim'], config['hidden'], config['layers'], config['dropout']).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config.get('wd',0.0))

    best_val = float('inf'); best_epoch = -1
    history = {'train_loss':[], 'val_loss':[], 'epoch_time':[]}
    for epoch in range(1, config['epochs']+1):
        train_loss, epoch_time = train_one_epoch(model, train_loader, optimizer, DEVICE, clip=config.get('clip',1.0))
        val_loss = evaluate(model, val_loader, DEVICE)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['epoch_time'].append(epoch_time)
        print(f"{name} Epoch {epoch}/{config['epochs']}  train={train_loss:.4f}  val={val_loss:.4f}  time={epoch_time:.1f}s")
        # save best
        if val_loss < best_val:
            best_val = val_loss
            best_epoch = epoch
            torch.save({'model':model.state_dict(), 'config':config}, os.path.join(RESULTS_DIR, f'{name}_best.pt'))
        # early stopping
        if early_stopping is not None and epoch - best_epoch >= early_stopping:
            print('Early stopping triggered at epoch', epoch)
            break
    # evaluate test
    test_loader = DataLoader(LMSeqDataset(test_ids, seq_len, stride=seq_len), batch_size=batch, shuffle=False)
    test_loss = evaluate(model, test_loader, DEVICE)
    best_state = torch.load(os.path.join(RESULTS_DIR, f'{name}_best.pt'), map_location=DEVICE)
    result = {
        'name': name,
        'config': config,
        'epochs_ran': len(history['train_loss']),
        'train_loss_final': history['train_loss'][-1],
        'val_loss_final': history['val_loss'][-1],
        'val_ppl': math.exp(history['val_loss'][-1]) if history['val_loss'][-1] < 100 else float('inf'),
        'test_loss': test_loss,
        'test_ppl': math.exp(test_loss) if test_loss < 100 else float('inf'),
        'best_val_loss': best_val,
        'best_epoch': best_epoch,
        'history': history
    }
    # save history
    with open(os.path.join(RESULTS_DIR, f'{name}_history.json'),'w') as f:
        json.dump(result, f)
    return result

# %%

In [15]:
# %%
# 10) Experiment configurations
underfit_cfg = {
    'seq_len': 30, 'batch_size': 64, 'stride':5,
    'emb_dim': 50, 'hidden': 32, 'layers':1, 'dropout':0.5,
    'lr':1e-3, 'epochs':8, 'clip':1.0
}

overfit_cfg = {
    'seq_len': 50, 'batch_size':64, 'stride':1,
    'emb_dim': 400, 'hidden': 1024, 'layers':3, 'dropout':0.0,
    'lr':5e-4, 'epochs':60, 'clip':1.0, 'wd':0.0
}

bestfit_cfg = {
    'seq_len':50, 'batch_size':64, 'stride':1,
    'emb_dim':200, 'hidden':256, 'layers':2, 'dropout':0.2,
    'lr':1e-3, 'epochs':40, 'clip':1.0, 'wd':1e-5
}


In [16]:
model = LSTMLM(VOCAB_SIZE, underfit_cfg['emb_dim'], underfit_cfg['hidden'], underfit_cfg['layers'], underfit_cfg['dropout']).to(DEVICE)
print(model)

LSTMLM(
  (embed): Embedding(4417, 50, padding_idx=0)
  (lstm): LSTM(50, 32, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=32, out_features=4417, bias=True)
)




In [None]:
# 11) Run the 3 experiments sequentially
results = []
# 1) Underfit
res_u = run_experiment('underfit', underfit_cfg, early_stopping=5)
results.append(res_u)
# 2) Overfit
res_o = run_experiment('overfit', overfit_cfg, early_stopping=None)
results.append(res_o)
# 3) Best-fit (with early stopping patience=6)
res_b = run_experiment('bestfit', bestfit_cfg, early_stopping=6)
results.append(res_b)



=== Running: underfit ===
underfit Epoch 1/8  train=6.2727  val=5.7538  time=67.3s
underfit Epoch 2/8  train=5.5598  val=5.3675  time=66.2s
underfit Epoch 3/8  train=5.2160  val=5.1171  time=67.4s
underfit Epoch 4/8  train=4.9883  val=4.9648  time=67.0s
underfit Epoch 5/8  train=4.8313  val=4.8617  time=67.8s
underfit Epoch 6/8  train=4.7091  val=4.7846  time=67.1s
underfit Epoch 7/8  train=4.6082  val=4.7288  time=67.2s
underfit Epoch 8/8  train=4.5244  val=4.6841  time=67.8s

=== Running: overfit ===


In [None]:
# %%
# 12) Summarize results and plot curves
summary_rows = []
for r in results:
    h = r['history']
    epochs = len(h['train_loss'])
    x = list(range(1, epochs+1))
    plt.figure(figsize=(6,4))
    plt.plot(x, h['train_loss'], label='train')
    plt.plot(x, h['val_loss'], label='val')
    plt.title(r['name'] + ' Loss curves')
    plt.xlabel('Epoch'); plt.ylabel('Loss'); plt.legend(); plt.grid(True)
    plt.savefig(os.path.join(RESULTS_DIR, f"{r['name']}_loss.png"))
    plt.show()
    summary_rows.append({
        'name': r['name'],
        'epochs_ran': r['epochs_ran'],
        'best_epoch': r['best_epoch'],
        'best_val_loss': r['best_val_loss'],
        'val_ppl': r['val_ppl'],
        'test_ppl': r['test_ppl']
    })

summary_df = pd.DataFrame(summary_rows)
print('\nSummary:')
display(summary_df)

# %%

In [None]:
# 13) Save summary table
summary_df.to_csv(os.path.join(RESULTS_DIR,'summary.csv'), index=False)
print('All artifacts saved to', RESULTS_DIR)


In [None]:
# 14) Quick sampling function using best model from bestfit

def sample_from_checkpoint(ckpt_path, seed_text='It is a truth', length=100, temp=1.0, top_k=50):
    ckpt = torch.load(ckpt_path, map_location=DEVICE)
    cfg = ckpt['config']
    model = LSTMLM(VOCAB_SIZE, cfg['emb_dim'], cfg['hidden'], cfg['layers'], cfg['dropout']).to(DEVICE)
    model.load_state_dict(ckpt['model'])
    model.eval()
    toks = tokenize(seed_text)
    idxs = [stoi.get(t, stoi['<unk>']) for t in toks]
    input_ids = torch.tensor([idxs], dtype=torch.long).to(DEVICE)
    hidden=None
    out = toks.copy()
    for _ in range(length):
        logits, hidden = model(input_ids[:, -cfg['seq_len']:], hidden)
        logits = logits[:, -1, :]/max(1e-8, temp)
        if top_k is not None:
            v, ix = torch.topk(logits, top_k)
            probs = torch.zeros_like(logits).scatter(1, ix, F.softmax(v, dim=-1))
        else:
            probs = F.softmax(logits, dim=-1)
        nxt = torch.multinomial(probs, num_samples=1).item()
        out.append(itos[nxt])
        input_ids = torch.cat([input_ids, torch.tensor([[nxt]], device=DEVICE)], dim=1)
    return ' '.join(out)

# Try sampling
bestfit_ckpt = os.path.join(RESULTS_DIR, 'bestfit_best.pt')
if os.path.exists(bestfit_ckpt):
    print('\nSample from best-fit model:')
    print(sample_from_checkpoint(bestfit_ckpt, seed_text='It is a truth universally acknowledged', length=60, temp=1.0, top_k=40))
else:
    print('Best-fit checkpoint not found yet.')

# Notebook end
print('\nFinished. You can download the results folder for submission.')