# Assignment 4

In this assignment, you will refactor the entire code to PyTorch, making it more modular and efficient.

## Importing Libraries

In [1]:
import os
from dataclasses import dataclass
from typing import List, Tuple
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import wandb
import random
import numpy as np

def set_seed(seed: int):
    """
    Set the random seed for reproducibility.

    Args:
        seed (int): The seed value to set.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"Random seed set to {seed}")


def configure_device() -> torch.device:
    """
    Configure the device for training.

    Returns:
        torch.device: The device to use for training.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
        num_gpu = torch.cuda.device_count()
        print(f"Running on {num_gpu} {torch.cuda.get_device_name()} GPU(s)")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print(f"Running on {device}")
    else:
        device = torch.device("cpu")
        print(f"Running on {device}")
    return device


def load_text(file_path: str, encoding: str = 'utf-8') -> str:
    """
    Load and read text data from a file.

    Args:
        file_path (str): Path to the text file.
        encoding (str, optional): File encoding. Defaults to 'utf-8'.

    Returns:
        str: The content of the text file.
    """
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, 'r', encoding=encoding) as f:
        text = f.read()

    print(f"Loaded text data from {file_path} (length: {len(text)} characters).")
    return text
%matplotlib inline

## Configuration

In [2]:
@dataclass
class MLPConfig:
    root_dir: str = os.getcwd() + "/../../"
    dataset_path: str = "data/names.txt"
    device: torch.device = torch.device('cpu')  # Automatic device configuration

    # Tokenizer
    vocab_size: int = 0  # Set later
    
    # Model
    context_size: int = 3
    d_embed: int = 8
    d_hidden: int = 64
    
    # Training
    val_size: float = 0.1
    batch_size: int = 32
    max_steps: int = 6000  # Max of max_steps = 6421
    lr: float = 0.01
    val_interval: int = 100
    log_interval: int = 100

    seed: int = 101

## Reproducibility

In [3]:
set_seed(MLPConfig.seed)

Random seed set to 101


## Device

In [4]:
MLPConfig.device = configure_device()

Running on 1 NVIDIA GeForce RTX 3060 Ti GPU(s)


## Tokenizer

In [5]:
chars = [chr(i) for i in range(97, 123)]  # all alphabet characters
chars.insert(0, ".")  # Add special token
MLPConfig.vocab_size = len(chars)
str2idx = {char: idx for idx, char in enumerate(chars)}
idx2str = {idx: char for char, idx in str2idx.items()}

## Dataset

In [6]:
names = load_text(MLPConfig.root_dir + MLPConfig.dataset_path).splitlines()

Loaded text data from /mnt/c/Users/danie/NLP/LLM101n/notebooks/Assignments/../../data/names.txt (length: 228145 characters).


## Preprocessing

In [7]:
# Train-Val Split
train_names, val_names = train_test_split(names, test_size=MLPConfig.val_size, random_state=MLPConfig.seed)

In [8]:
print(f"Train Size: {len(train_names)}")
print(f"Validation Size: {len(val_names)}")
print(f"Train Example: {train_names[0]}")
print(f"Validation Example: {val_names[0]}")

Train Size: 28829
Validation Size: 3204
Train Example: keyler
Validation Example: jessamae


In [9]:
def prepare_dataset(_names):
    _inputs, _targets = [], []

    for name in _names:
        context = [0] * MLPConfig.context_size

        for char in name + ".":
            idx = str2idx[char]
            _inputs.append(context)
            _targets.append(idx)
            context = context[1:] + [idx]  # Shift the context by 1 character

    _inputs = torch.tensor(_inputs)
    _targets = torch.tensor(_targets)

    return _inputs, _targets

### Task 1: PyTorch DataLoader

We have been using plain Python lists to and then converted them to PyTorch tensors. This is not efficient since it is loading the entire dataset into memory.

PyTorch provides `Dataset` and `DataLoader` class to load the data in memory on the fly. [PyTorch Documentation](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

Refactor the `prepare_dataset` function into a PyTorch `Dataset` class and use the `DataLoader` to efficiently load the data in batches.

In [10]:
# Dataset
class NamesDataset(Dataset):
    ################################################################################
    # TODO:                                                                        #
    # PyTorch Dataset requires 3 methods:                                          #
    # __init__ method to initialize the dataset                                    #
    # __len__ method to return the size of the dataset                             #
    # __getitem__ method to return a sample from the dataset                       #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    def __init__(self, _names: List[str], context_size: int):
        """
        Initialize the dataset

        Args:
            _names (List[str]): List of names
            context_size (int): Context size of the model
        """
        self.inputs, self.targets = [], []
        for name in _names:
            context = [0] * context_size
            for char in name + ".":
                idx = str2idx[char]
                self.inputs.append(context)
                self.targets.append(idx)
                context = context[1:] + [idx]
                
    def __len__(self) -> int:
        """
        Return the number of samples in the dataset

        Returns:
            (int): Number of samples
        """

        return len(self.inputs)
    
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return a sample from the dataset

        Args:
            idx (int): Index of the sample

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Input and target tensors
        """
        input_idx = torch.tensor(self.inputs[idx])
        target_idx = torch.tensor(self.targets[idx])
        return input_idx, target_idx
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

In [11]:
# Initialize the dataset
train_dataset = NamesDataset(train_names, MLPConfig.context_size)
val_dataset = NamesDataset(val_names, MLPConfig.context_size)

In [12]:
print(f"Number of Train Samples: {len(train_dataset)}")
print(f"Number of Validation Samples: {len(val_dataset)}")
print(f"First train (input, target): {train_dataset[0]}")
print(f"First validation (input, target): {val_dataset[0]}")
print(f"Second train (input, target): {train_dataset[1]}")
print(f"Second validation (input, target): {val_dataset[1]}")

Number of Train Samples: 205456
Number of Validation Samples: 22690
First train (input, target): (tensor([0, 0, 0]), tensor(11))
First validation (input, target): (tensor([0, 0, 0]), tensor(10))
Second train (input, target): (tensor([ 0,  0, 11]), tensor(5))
Second validation (input, target): (tensor([ 0,  0, 10]), tensor(5))


In [13]:
# DataLoader
################################################################################
# TODO:                                                                        #
# Initialize the DataLoader for the training and validation datasets.          #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
train_loader = DataLoader(train_dataset,batch_size=MLPConfig.batch_size,shuffle=True)
val_loader = DataLoader(val_dataset,batch_size=MLPConfig.batch_size,shuffle=False)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

In [14]:
# Example batch
_x, _y = next(iter(train_loader))
print(f"Input Shape: {_x.shape}")   # (batch_size, context_size)
print(f"Target Shape: {_y.shape}")  # (batch_size)
print(f"Input: {_x[0]}")
print(f"Target: {_y[0]}")

Input Shape: torch.Size([32, 3])
Target Shape: torch.Size([32])
Input: tensor([14,  1, 18])
Target: 15


## Model

### Task 2: MLP Model

Initialize the weights of the model using the `Kaiming` initialization.

What are other activation functions that can be used instead of `tanh`? What are the advantages and disadvantages? Use different activation functions and compare the results.


In [20]:
class MLP(nn.Module):
    ################################################################################
    # TODO:                                                                        #
    # Define the __init__ and forward methods for the MLP model.                   #
    # Use the Kaiming initialization for the weights.                              #
    # Use other activation functions instead of tanh and compare the results.      #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    def __init__(self, vocab_size, context_size, d_embed, d_hidden):
        super().__init__()
        
        self.linear_gain = 1
        self.tanh_gain = 5.0/3.0
        self.ReLU_gain = 2**0.5
        self.embedding_scale = self.linear_gain/(vocab_size*d_embed)**0.5
        self.hidden_scale = self.ReLU_gain / (context_size*d_embed)**0.5
        self.output_scale = self.linear_gain/(d_hidden)**0.5
        
        self.C = nn.Parameter(torch.randn(vocab_size, d_embed))*self.embedding_scale
        self.W1 = nn.Parameter(torch.randn(context_size*d_embed, d_hidden))*self.hidden_scale
        self.b1 = nn.Parameter(torch.randn(d_hidden))
        self.W2 = nn.Parameter(torch.randn(d_hidden, vocab_size))*self.output_scale
        self.b2 = nn.Parameter(torch.randn(vocab_size))
    
    def forward(self, x):
        x_embed = self.C[x]
        x = x_embed.vocab_sizeview(x.size(0),-1)
        # 딴걸로 바꾸고 위 가중치도 바꾸기
        h = F.relu(x@self.W1+self.b1)
        logits = h@self.W2+self.b2
        return logits
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

In [21]:
# Initialize the model
mlp = MLP(MLPConfig.vocab_size, MLPConfig.context_size, MLPConfig.d_embed, MLPConfig.d_hidden)
mlp.to(MLPConfig.device) # Move the model to the device
print(mlp)
print("Number of parameters:", sum(p.numel() for p in mlp.parameters()))

MLP()
Number of parameters: 91


## Training

### Task 3: Wandb Integration

[Weights and Biases](https://wandb.ai/site) is a platform to track your machine learning experiments. It is very useful to log the hyperparameters, metrics, and weights of the model. (We can't use matplotlib every time to visualize the results)

Create a free account on Wandb. Initialize the wandb run and log the hyperparameters and metrics.

**How to set up WANDB API KEY**
- Create an account on Wandb
- Go to `wandb.ai` -> `Settings` -> `API Keys` -> `Copy API Key`
- Set the API key as an environment variable `WANDB_API_KEY`
    - What is an environment variable? How to set it? Google `.env`

Note: Do not hardcode the API key in the script. Use environment variables.



In [22]:
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init(
    project="assignment-04",
    config={
        "d_embed": MLPConfig.d_embed,
        "d_hidden": MLPConfig.d_hidden,
        "lr": MLPConfig.lr,
    },
    dir=MLPConfig.root_dir
)



### Task 4: Training

Train the model. Change the hyperparameters and configurations. Log the results and analyze it.

In [23]:
def train(
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        max_steps: int,
        lr: float,
        val_interval: int,
        log_interval: int,
        device: torch.device,
):
    """
    Train the model for a fixed number of steps.

    Args:
        model (nn.Module): The model to train.
        train_loader (DataLoader): DataLoader for the training data.
        val_loader (DataLoader): DataLoader for the validation data.
        max_steps (int): Maximum number of steps to train.
        lr (float): Learning rate.
        val_interval (int): Interval for validation.
        log_interval (int): Interval for logging.
        device (torch.device): Device to run the model on.
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    wandb.watch(model, log="all", log_freq=log_interval)
    running_loss = 0.0
    progress_bar = tqdm(enumerate(train_loader), total=max_steps, desc="Training")

    for step in range(1, max_steps + 1):
        model.train()
        for batch_idx, (train_inputs, train_targets) in progress_bar:
            train_inputs, train_targets = train_inputs.to(device), train_targets.to(device)
            optimizer.zero_grad()
            logits = model(train_inputs)
            loss = F.cross_entropy(logits, train_targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            progress_bar.set_postfix(loss=f"{running_loss / step:.4f}")

            if step % val_interval == 0:
                model.eval()
                val_loss = 0.0
                total_samples = 0
                with torch.no_grad():
                    for val_inputs, val_targets in val_loader:
                        val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
                        val_logits = model(val_inputs)
                        batch_loss = F.cross_entropy(val_logits, val_targets)
                        val_loss += batch_loss.item() * val_inputs.size(0)
                        total_samples += val_inputs.size(0)
                wandb.log({"Val Loss": val_loss / total_samples}, step=step)

            if step % log_interval == 0:
                wandb.log({"Train Loss": running_loss / step}, step=step)

            step += 1

    progress_bar.close()
    wandb.finish()

Note: Unfortunatley PyTorch does not support infinite DataLoader. The train will stop when it reaches the end of the DataLoader. (max_steps=6421)

In [24]:
train(
    model=mlp,
    train_loader=train_loader,
    val_loader=val_loader,
    max_steps=MLPConfig.max_steps,
    lr=MLPConfig.lr,
    val_interval=MLPConfig.val_interval,
    log_interval=MLPConfig.log_interval,
    device=MLPConfig.device
)

Training:   0%|                                                                                | 0/6000 [00:00<?, ?it/s]


RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)

In [None]:
################################################################################
# TODO:                                                                        #
# Analyze the results                                                          #
# What hyperparameters worked well? What activation did you use? etc.          #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

## Inference

In [None]:
def generate_name(model: nn.Module, context_size: int, decoder: dict, end_id: int, device: torch.device) -> str:
    """
    Generate a name using the model.

    Args:
        model (nn.Module): Model to generate the name.
        context_size (int): Context size of the model.
        decoder (dict): Decoder dictionary to convert indices to characters.
        end_id (int): End token id.
        device (torch.device): Device to run the model on

    Returns:
        (str): Generated name
    """
    new_name = []
    context = [end_id] * context_size

    while True:
        x = torch.tensor(context).unsqueeze(0).to(device)
        logits = model(x)
        probs = F.softmax(logits, dim=-1)
        idx = torch.multinomial(probs, num_samples=1).item()
        new_name.append(decoder[idx])
        context = context[1:] + [idx]
        if idx == end_id:
            break

    return "".join(new_name)

In [None]:
for _ in range(5):
    print(generate_name(
        model=mlp,
        context_size=MLPConfig.context_size,
        decoder=idx2str,
        end_id=str2idx["."],
        device=MLPConfig.device
    ))