# Google Colab (Skip locally)

In [None]:
# Mount your Google Drive
%matplotlib inline
%load_ext autoreload
%autoreload 2

from google.colab import drive
import sys
import os
import shutil
import warnings

drive.mount('/content/gdrive')

folder = "/content/gdrive/MyDrive/IFT6135/HW1_2025/assignment1_release" #@param {type:"string"}
!ln -Ts "$folder" /content/assignment 2> /dev/null

# Add the assignment folder to Python path
if '/content/assignment' not in sys.path:
  sys.path.insert(0, '/content/assignment')

# Check if CUDA is available
import torch
if not torch.cuda.is_available():
  warnings.warn('CUDA is not available.')

# Imports & Setup

In [1]:
%matplotlib inline
import warnings

from main import train_models
from arguments import Arguments
import os
from plotter import generate_plots

models = ['lstm', 'gpt']

# Experiments

## Experiment 1 (Sanity Check)

In [None]:
args = Arguments()
args.log_dir = "logs/experiment1"
args.n_steps = 500

results = {}
for model in models:
    args.model = model
    
    args.exp_name = model
    _, results[model], _ = train_models(args)

generate_plots(results, save_path=args.log_dir, mode="std")

Model 1/2
log_dir : logs/experiment1
n_steps : 500
model : lstm
exp_name : lstm
exp_id : 0
seed : 0
checkpoint_path: logs/experiment1\lstm\0
dataset_size: 480
Number of training epochs (500) & steps (500)


Training:   1% |          | 3/500 [ 7.10it/s, step=3, loss=3.44396, lr=0.001]


Train loss : 3.49893 | Train accuracy : 0.00000 | Test loss : 3.50019 | Test accuracy : 0.00000 | lr = 0.00100


Training:  20% |██        | 101/500 [ 9.96it/s, step=102, loss=1.70876, lr=0.001]


Train loss : 1.70979 | Train accuracy : 0.04583 | Test loss : 1.76863 | Test accuracy : 0.02079 | lr = 0.00100


Training:  40% |████      | 201/500 [ 7.33it/s, step=202, loss=0.89884, lr=0.001]


Train loss : 0.91950 | Train accuracy : 0.66042 | Test loss : 2.69184 | Test accuracy : 0.00000 | lr = 0.00100


Training:  60% |██████    | 301/500 [ 8.18it/s, step=301, loss=0.06859, lr=0.001]


Train loss : 0.06859 | Train accuracy : 1.00000 | Test loss : 3.61786 | Test accuracy : 0.00208 | lr = 0.00100


Training:  80% |████████  | 402/500 [ 8.02it/s, step=402, loss=0.02783, lr=0.001]


Train loss : 0.02796 | Train accuracy : 1.00000 | Test loss : 3.69963 | Test accuracy : 0.00416 | lr = 0.00100


Training: 100% |██████████| 500/500 [ 8.39it/s, step=500, loss=0.01980, lr=0.001]


Train loss : 0.01975 | Train accuracy : 1.00000 | Test loss : 3.64438 | Test accuracy : 0.00416 | lr = 0.00100





Model 2/2
log_dir : logs/experiment1
n_steps : 500
model : lstm
exp_name : lstm
exp_id : 1
seed : 42
checkpoint_path: logs/experiment1\lstm\1
dataset_size: 480
Number of training epochs (500) & steps (500)


Training:   0% |          | 2/500 [ 4.81it/s, step=2, loss=3.54585, lr=0.001]


Train loss : 3.54585 | Train accuracy : 0.01250 | Test loss : 3.54366 | Test accuracy : 0.01663 | lr = 0.00100


Training:  20% |██        | 102/500 [ 8.85it/s, step=102, loss=1.69919, lr=0.001]


Train loss : 1.70050 | Train accuracy : 0.07083 | Test loss : 1.76654 | Test accuracy : 0.00832 | lr = 0.00100


Training:  40% |████      | 200/500 [ 7.25it/s, step=201, loss=0.93105, lr=0.001]


Train loss : 0.93105 | Train accuracy : 0.65208 | Test loss : 2.59708 | Test accuracy : 0.00000 | lr = 0.00100


Training:  60% |██████    | 300/500 [ 8.46it/s, step=301, loss=0.07089, lr=0.001]


Train loss : 0.07089 | Train accuracy : 1.00000 | Test loss : 3.44532 | Test accuracy : 0.00208 | lr = 0.00100


Training:  80% |████████  | 400/500 [ 4.57it/s, step=400, loss=0.02697, lr=0.001]


Train loss : 0.02683 | Train accuracy : 1.00000 | Test loss : 3.51803 | Test accuracy : 0.00416 | lr = 0.00100


Training: 100% |██████████| 500/500 [ 7.81it/s, step=500, loss=0.01865, lr=0.001]


Train loss : 0.01860 | Train accuracy : 1.00000 | Test loss : 3.46558 | Test accuracy : 0.00624 | lr = 0.00100
Model 1/2
logs/experiment1\lstm\0
Model 2/2
logs/experiment1\lstm\1





ValueError: too many values to unpack (expected 3)

<Figure size 640x480 with 0 Axes>

In [3]:
results

{}

## Experiment 2 (Scaling Data Size : Training Ratio)

In [None]:
args = Arguments()
args.log_dir = "logs/experiment2"

results = {}
for model in models:
    args.model = model
    args.log_dir = os.path.join(args.log_dir, model)
    for r in [0.1, 0.3, 0.5, 0.7, 0.9]:
        args.r_train = r
        
        args.exp_name = f"{model}_r_{r}"
        results[model] = train_models(args)

## Experiment 3 (Scaling Data Size : P)

In [None]:
args = Arguments()
args.log_dir = "logs/experiment3"

args.p = 11
# todo: add the rest of the arguments

results = {}
for model in models:
    args.model = model
    
    args.exp_name = model
    results[model] = train_models(args)

## Experiment 4 (Scaling Model Size)

In [None]:
args = Arguments()
args.log_dir = "logs/experiment4"

for model in models:
    args.model = model
    args.log_dir = os.path.join(args.log_dir, model)
    for L in [1, 2, 3]:
        args.num_layers = L
        for d in [2**6, 2**7, 2**8]:
            args.embedding_size = d
            args.hidden_size = d

            args.exp_name = f"{model}_L_{L}_d_{d}"
            train_models(args)

## Experiment 5 (Scaling Compute)

In [None]:
args = Arguments()
args.log_dir = "logs/experiment5"

for model in models:
    args.model = model
    args.log_dir = os.path.join(args.log_dir, model)
    for B in [2**5, 2**6, 2**7, 2**8, 2**9]:
        args.batch_size = B

        args.exp_name = f"{model}_B_{B}"
        train_models(args)

## Experiment 6 (Regularisation)

In [None]:
args = Arguments()
args.log_dir = "logs/experiment6"

for model in models:
    args.model = model
    args.log_dir = os.path.join(args.log_dir, model)
    for wd in [0.25, 0.5, 0.75, 1.0]:
        args.weight_decay = wd

        args.exp_name = f"{model}_wd_{wd}"
        train_models(args)