<a href="https://colab.research.google.com/github/ericaong97/DeepLearningLLM/blob/main/weightdecay_0_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/tuanla074/DeepLearningLLM.git
%cd DeepLearningLLM

Cloning into 'DeepLearningLLM'...
remote: Enumerating objects: 62, done.[K
remote: Counting objects: 100% (62/62), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 62 (delta 20), reused 46 (delta 10), pack-reused 0 (from 0)[K
Receiving objects: 100% (62/62), 636.56 KiB | 4.90 MiB/s, done.
Resolving deltas: 100% (20/20), done.
/content/DeepLearningLLM


In [2]:
!pip install -r requirements.txt  # If there's a requirements file
# Otherwise install dependencies like
!pip install transformers datasets torch accelerate peft

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x8

In [3]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e6af789a05c26612f561fe78ebff94ea808fb732a45dd9da099b870ef82c7326
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [4]:
!python data_utils.py

In [5]:
# optimizer_and_scheduler.py
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

# 1. Optimizer setup
def get_optimizer(model):
    return AdamW(
        model.parameters(),
        lr=2e-4,
        eps=1e-6,
        weight_decay=0.05
    )

# 2. Plateau scheduler
def get_plateau_scheduler(optimizer):
    return ReduceLROnPlateau(optimizer, mode='min',
                            patience=2,
                            factor=0.5,
                            threshold=0.01,
                            threshold_mode='rel',
                            min_lr=1e-5)

# 3. Teacher Forcing Ratio Class
class TeacherForcingScheduler:
    def __init__(self, initial_ratio=0.9, min_ratio=0.1,
                decay_type='exp', decay_steps=4487*8,
                decay_rate=0.9998, staircase=False):
        """
        Args:
            decay_type: 'exp' (exponential) or 'linear'
            decay_steps: Steps to decay from initial→min_ratio
            staircase: If True, decay at discrete intervals
        """
        self.initial_ratio = initial_ratio
        self.min_ratio = min_ratio
        self.decay_type = decay_type
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase
        self._step = 0  # Critical for resuming

    def step(self):
        """Call this EVERY batch update"""
        self._step += 1

        if self.decay_type == 'linear':
            ratio = self.initial_ratio - (self.initial_ratio - self.min_ratio) * (min(1.0, self._step / self.decay_steps))
        else:  # exponential
            if self.staircase:
                ratio = self.initial_ratio * (self.decay_rate ** (self._step // self.decay_steps))
            else:
                ratio = self.initial_ratio * (self.decay_rate**self._step)

        self.current_ratio = max(ratio, self.min_ratio)
        return self.current_ratio

    def state_dict(self):
        return {k: v for k, v in self.__dict__.items()}

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)


# 4. Exponential decay teacher forcing
exp_teacher_scheduler = TeacherForcingScheduler(
    initial_ratio=0.9,
    min_ratio=0.1,
    decay_rate=0.9998,  # Reaches 0.1 after ~2 epochs
    decay_steps=4487*8)

# 5. Linear decay teacher forcing
linear_teacher_scheduler = TeacherForcingScheduler(
    decay_type='linear',
    decay_steps=4487*6  # Full decay after 8 epochs
)

In [6]:
!python baseline_transformer_architecture.py

In [7]:
!python modeling_functions.py

In [None]:
# import libraries and modules
import torch
from data_utils import (
    get_train_loader,
    get_val_loader,
    set_seed
)
from baseline_transformer_architecture import create_small_transformer

from modeling_functions import (
    train_transformer_teacher_forcing
)

from optimizer_scheduler import (
    get_optimizer, get_plateau_scheduler, linear_teacher_scheduler
)
from tokenizers import Tokenizer

# ============================================================================

# 1. Setting a global seed and device to use
set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Define model configuration
config = {
    "vocab_size": 20000,
    "dropout": 0.1,
    "max_len": 512,
    "d_model": 512,
    "nhead": 8,
    "num_encoder_layers": 4,
    "num_decoder_layers": 4,
    "dim_feedforward": 2048
}


# 2. load the tokenizer
tokenizer_20 = Tokenizer.from_file("cnn_bpe_tokenizer_20k.json")


# 3. Initialize model
base_model = create_small_transformer(d_model=config['d_model'],
                                nhead=config['nhead'],
                                num_decoder_layers=config['num_decoder_layers'],
                                num_encoder_layers=config['num_encoder_layers'],
                                dim_feedforward=config['dim_feedforward'],
                                dropout=config['dropout'],
                                vocab_size=config['vocab_size']).to(device)

# 4. Data loading
train_loader = get_train_loader(tokenizer=tokenizer_20)
val_loader = get_val_loader(tokenizer=tokenizer_20)

# 5. Define criterion
# It ignores the padding index
transformer_criterion = torch.nn.CrossEntropyLoss(ignore_index=1)

# 6. Training Loop
# please change the filename for your experiment
history = train_transformer_teacher_forcing(
    model=base_model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=get_optimizer(base_model),
    criterion=transformer_criterion,
    plateau_scheduler=get_plateau_scheduler(get_optimizer(base_model)),
    teacher_forcing_scheduler=linear_teacher_scheduler,
    tokenizer=tokenizer_20,
    device=device,
    pad_idx=tokenizer_20.token_to_id("[PAD]"),
    clip_norm=2.0,
    num_epochs=5,
    max_length_generate=40,
    filename='test_weightdecay'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Epoch 1/5:   0%|          | 0/4487 [00:00<?, ?it/s]

In [None]:
# libraries for inference
import torch
from baseline_transformer_architecture import create_small_transformer
from modeling_functions import generate_with_beam_search,calculate_and_save_rouge
from tokenizers import Tokenizer
from data_utils import get_test_loader,set_seed

# 1. Configuration setup
set_seed(42)
config = {
    "vocab_size": 20000,
    "dropout": 0.1,
    "max_len": 512,
    "d_model": 512,
    "nhead": 8,
    "num_encoder_layers": 4,
    "num_decoder_layers": 4,
    "dim_feedforward": 2048
}

# 2. Load the tokenizer
tokenizer_20 = Tokenizer.from_file("cnn_bpe_tokenizer_20k.json")  # Make sure this is the correct path

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pad_idx = tokenizer_20.token_to_id("[PAD]")

# 3. Recreate the model architecture
loaded_model = create_small_transformer(d_model=config['d_model'],
                                nhead=config['nhead'],
                                num_decoder_layers=config['num_decoder_layers'],
                                num_encoder_layers=config['num_encoder_layers'],
                                dim_feedforward=config['dim_feedforward'],
                                dropout=config['dropout'],
                                vocab_size=config['vocab_size']).to(device)

# 4. Load the saved state dictionary
# change the file based on your model name
loaded_model.load_state_dict(torch.load('updated_test_weightdecay_0.1_final_model.pt'))
loaded_model = loaded_model.to(device)

# 5. Set the model to evaluation mode
loaded_model.eval()

# 6. Load dataset
test_loader = get_test_loader(tokenizer_20)  #  Use the tokenizer_20 instance

# 7. Get a batch of data based on number of samples selected
num_examples_to_show = 5
generated_summaries = []
reference_summaries = []
with torch.no_grad():
    for i, batch in enumerate(test_loader):
        # 1. Move data to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # 2. Verify special tokens (do this once before the loop)
        if i == 0:  # Only print once
            print("[DEBUG] Special tokens:")
            print(f"[PAD]: {tokenizer_20.token_to_id('[PAD]')} -> '{tokenizer_20.decode([tokenizer_20.token_to_id('[PAD]')])}'")
            print(f"[SOS]: {tokenizer_20.token_to_id('[SOS]')}")
            print(f"[EOS]: {tokenizer_20.token_to_id('[EOS]')}")
            print(f"Dot token: {tokenizer_20.token_to_id('.')} -> '{tokenizer_20.decode([tokenizer_20.token_to_id('.')])}'")

        generated_summary = generate_with_beam_search(
                            model=loaded_model,
                            input_ids=input_ids[0],  # Single sequence
                            tokenizer=tokenizer_20,
                            device=device,
                            beam_width=3,
                            max_length=40
                )

        # 4. Decode references
        actual_summary = tokenizer_20.decode(
            labels[0].tolist(),
            skip_special_tokens=True
        )
        input_article = tokenizer_20.decode(
            input_ids[0].tolist(),
            skip_special_tokens=True
        )

        # Store for ROUGE calculation
        generated_summaries.append(generated_summary)
        reference_summaries.append(actual_summary)

        # 5. Print results
        # print(f"\n--- Example {i+1} ---")
        # print(f"Input: {input_article[:200]}...")  # Truncate long inputs
        # print(f"Generated Summary: {generated_summary}")
        # print(f"Actual Summary: {actual_summary}")
        # print("-" * 50)

        if i + 1 >= num_examples_to_show:
            break

# 8. Generating final rouge scores
# change the output_path for your own experiments
rouge_results = calculate_and_save_rouge(
    generated_summaries=generated_summaries,
    reference_summaries=reference_summaries,
    output_path="inf_rouge_scores.json"
)

In [None]:
!python visualization.py