In [1]:
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from pathlib import Path
import json
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW

import numpy as np

import re
import os

  from .autonotebook import tqdm as notebook_tqdm


# Creating Tokenizer

In [2]:
class CharacterTokenizer(PreTrainedTokenizer):
    def __init__(self, characters, context_length, **kwargs):

        self.characters  = characters
        self.model_max_length = context_length

        # Let's Add Custom Tokens to Tokenizer
        bos_token = AddedToken("[BOS]", lstrip=False, rstrip=False)
        eos_token = AddedToken("[EOS]", lstrip=False, rstrip=False)
        sep_token = AddedToken("[SEP]", lstrip=False, rstrip=False)
        cls_token = AddedToken("[CLS]", lstrip=False, rstrip=False)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        unk_token = AddedToken("[UNK]", lstrip=False, rstrip=False)
        
        mask_token = AddedToken("[MASK]", lstrip=True, rstrip=False)

        self._vocab_str_to_int = {
            "[BOS]": 0,
            "[EOS]": 1,
            "[SEP]": 2,
            "[CLS]": 3,
            "[PAD]": 4,
            "[RESERVED]": 5,
            "[UNK]" : 6,
            **{ch: i + 7 for i, ch in enumerate(characters)} 
        }

        self._vocab_int_to_str = {
            v: k for k, v in self._vocab_str_to_int.items()
        }

        super().__init__(
        bos_token = bos_token, 
        eos_token = eos_token ,
        sep_token = sep_token, 
        cls_token = cls_token, 
        pad_token = pad_token, 
        unk_token = unk_token,
        mask_token = mask_token,
        add_prefix_space=False,
        model_max_length=context_length,
        **kwargs
        )

    @property
    # Get the vocabulary size
    def vocab_size(self):
        return len(self._vocab_str_to_int)
    
    # Tokenize the characters 
    def _tokenize(self, text):
        return list(text)
    
    # Convert token to token_id
    # If token doesn't exist in vocab it returns id for UNK token
    def _convert_token_to_id(self, token):
        return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])
    
    # Convert token_id to token
    def _convert_id_to_token(self, index):
        return self._vocab_int_to_str[index]
    
    # Convert tokens to string
    # The "" part means there’s no separator between tokens, so it will simply join them in the order they appear.
    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)
    
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1 = None):

        # token_ods_0 : Means First list of tokens
        # token_ids_1 : Means Second list of tokens 
        
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        result = cls + token_ids_0 + sep

        if token_ids_1 is not None:
            result += token_ids_1 + sep
        return result
    
    def get_special_tokens_mask(self, token_ids_0, token_ids_1 = None, already_has_special_tokens = False):
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
     
        result = [1] + ([0] * len(token_ids_0)) + [1]

        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1) + [1])

        return result
    
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1 = None):
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]


        result = len(cls + token_ids_0 + sep) * [0]
        if token_ids_1 is not None:
            result += len(token_ids_1 + sep) * [1]

        return result
    
    # Get the tokenzier configuration
    
    def get_config(self):
        return {
            "char_ords": [ord(ch) for ch in self.characters],
            "model_max_length": self.model_max_length,
        }
    
    @classmethod
    def from_config(cls, config):
        cfg = {}
        cfg["characters"] = [chr(o) for o in config["char_ords"]]
        cfg["context_length"] = config["model_max_length"]
        return cls(**cfg)
    
    def save_pretrained(self, save_directory, **kwargs):
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        cfg = self.get_config()
        with open(cfg_file, "w") as f:
            json.dump(cfg, f, indent=4)
    
    @classmethod
    def from_pretrained(cls, save_directory, **kwargs):
        cfg_file = Path(save_directory)/"tokenizer_config.json"
        with open(cfg_file) as f:
            cfg = json.load(f)
        return cls.from_config(cfg)

    def get_vocab(self):
        """Returns the vocabulary dictionary: token (str) -> id (int)."""
        return self._vocab_str_to_int

### **1. Initialization (`__init__`)**
```python
characters = ['a', 'b', 'c', ..., 'z']
context_length = 128
tokenizer = CharacterTokenizer(characters, context_length)
```
- **Input**:
  - `characters`: List of all valid characters (e.g., `['a', 'b', ..., 'z']`).
  - `context_length`: Maximum number of tokens a sequence can have (e.g., `128`).
- **Explanation**:
  - Special tokens like `[BOS]`, `[EOS]`, `[PAD]` are initialized and added to the vocabulary.
  - `_vocab_str_to_int`: A mapping of tokens (e.g., `'a'` → 7, `'b'` → 8).
  - `_vocab_int_to_str`: Reverse mapping (e.g., `7` → `'a'`).
- **Example Vocabulary**:
  ```python
  {
      "[BOS]": 0,
      "[EOS]": 1,
      "[SEP]": 2,
      "a": 7,
      "b": 8,
      ...
  }
  ```

---

### **2. `vocab_size` Property**
Returns the size of the vocabulary:
```python
tokenizer.vocab_size  # Example output: 33 (special tokens + 26 letters of alphabet)
```

---

### **3. `_tokenize()`**
Tokenizes input text into a list of characters:
```python
tokenizer._tokenize("abc")  # Output: ['a', 'b', 'c']
```

---

### **4. `_convert_token_to_id()`**
Converts a token to its integer ID:
```python
tokenizer._convert_token_to_id("a")  # Output: 7
tokenizer._convert_token_to_id("[UNK]")  # Output: 6 (default ID for unknown tokens)
```

---

### **5. `_convert_id_to_token()`**
Converts an ID back to its token:
```python
tokenizer._convert_id_to_token(7)  # Output: "a"
```

---

### **6. `convert_tokens_to_string()`**
Joins tokens into a string:
```python
tokens = ["a", "b", "c"]
tokenizer.convert_tokens_to_string(tokens)  # Output: "abc"
```

---

### **7. `build_inputs_with_special_tokens()`**
Adds special tokens `[CLS]` and `[SEP]` to a sequence of token IDs:
```python
token_ids_0 = [7, 8, 9]  # 'abc'
token_ids_1 = [10, 11]   # 'de'
tokenizer.build_inputs_with_special_tokens(token_ids_0, token_ids_1)
# Output: [3, 7, 8, 9, 2, 10, 11, 2]  # [CLS] a b c [SEP] d e [SEP]
```

---

### **8. `get_special_tokens_mask()`**
Creates a mask to identify special tokens:
```python
token_ids_0 = [7, 8, 9]
tokenizer.get_special_tokens_mask(token_ids_0)
# Output: [1, 0, 0, 0, 1]  # 1 for special tokens, 0 for normal tokens
```

---

### **9. `create_token_type_ids_from_sequences()`**
Generates token type IDs for distinguishing multiple sequences:
```python
token_ids_0 = [7, 8, 9]  # Sequence 1
token_ids_1 = [10, 11]   # Sequence 2
tokenizer.create_token_type_ids_from_sequences(token_ids_0, token_ids_1)
# Output: [0, 0, 0, 0, 0, 1, 1, 1]  # 0 for seq 1, 1 for seq 2
```

---

### **10. `get_config()`**
Returns tokenizer configuration (characters and max length):
```python
tokenizer.get_config()
# Output: {'char_ords': [97, 98, ..., 122], 'model_max_length': 128}
```

---

### **11. `from_config()`**
Reconstructs the tokenizer from a configuration:
```python
config = tokenizer.get_config()
new_tokenizer = CharacterTokenizer.from_config(config)
```

---

### **12. Saving and Loading**
- **Save**: Save tokenizer configuration to a file:
  ```python
  tokenizer.save_pretrained("path/to/save")
  ```
- **Load**: Load tokenizer configuration from a file:
  ```python
  loaded_tokenizer = CharacterTokenizer.from_pretrained("path/to/save")
  ```

---

### **13. `get_vocab()`**
Returns the vocabulary:
```python
tokenizer.get_vocab()
# Output: {"[BOS]": 0, "[EOS]": 1, ..., "a": 7, "b": 8, ...}
```

---

### Example Usage:
```python
input_text = "abc"
tokens = tokenizer._tokenize(input_text)  # ['a', 'b', 'c']
token_ids = [tokenizer._convert_token_to_id(t) for t in tokens]  # [7, 8, 9]
special_tokens = tokenizer.build_inputs_with_special_tokens(token_ids)  # [3, 7, 8, 9, 2]
decoded = tokenizer.convert_tokens_to_string([tokenizer._convert_id_to_token(id) for id in special_tokens])
# Output: '[CLS]abc[SEP]'
```

This implementation customizes how text is tokenized, encoded, and decoded for a model designed to process character-level input.

In [3]:
# Define the character set and context length
characters = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", " ", "="]

# Context length
context_length = 32

# Initialize the tokenizer
tokenizer = CharacterTokenizer(characters=characters, context_length=context_length)

In [4]:
# Get vocab
tokenizer_vocab = tokenizer.get_vocab()
tokenizer_vocab

{'[BOS]': 0,
 '[EOS]': 1,
 '[SEP]': 2,
 '[CLS]': 3,
 '[PAD]': 4,
 '[RESERVED]': 5,
 '[UNK]': 6,
 '0': 7,
 '1': 8,
 '2': 9,
 '3': 10,
 '4': 11,
 '5': 12,
 '6': 13,
 '7': 14,
 '8': 15,
 '9': 16,
 '+': 17,
 ' ': 18,
 '=': 19}

In [5]:
# Example of tokenization
text = "1 + 2 = 3"
tokens = tokenizer.tokenize(text)
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

In [6]:
tokens, tokens_ids

(['1', ' ', '+', ' ', '2', ' ', '=', ' ', '3'],
 [8, 18, 17, 18, 9, 18, 19, 18, 10])

In [7]:
# Tokens with special tokens
special_tokens = tokenizer.build_inputs_with_special_tokens(tokens_ids)
special_tokens

[3, 8, 18, 17, 18, 9, 18, 19, 18, 10, 2]

In [8]:
# Convert tokens to string
tokenizer.convert_tokens_to_string(tokens)

'1 + 2 = 3'

In [9]:
# Convert token IDs to tokens
tokens_with_special_tokens = [tokenizer._convert_id_to_token(id) for id in special_tokens]

# Convert tokens to string with special tokens
tokenizer.convert_tokens_to_string(tokens_with_special_tokens)

'[CLS]1 + 2 = 3[SEP]'

In [10]:
# Get the vocab size and tikenizer configuration
vocab_size = tokenizer.vocab_size
tokenizer_config = tokenizer.get_config()
print(f"Vocab Size: {vocab_size}")
print(f"Tokenizer Config: {tokenizer_config}")

Vocab Size: 20
Tokenizer Config: {'char_ords': [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 43, 32, 61], 'model_max_length': 32}


# Creating Dataset

In [11]:
class AdditionDataset(Dataset):
    def __init__(
            self, 
            tokenizer,
            context_length, 
            numbers = range(100,200),
            include_intermediate_steps = True):
        
        self.tokenizer = tokenizer
        self.context_length = context_length
        self.numbers = numbers
        self.include_intermediate_steps = include_intermediate_steps

    def __len__(self):
        return len(self.numbers) ** 2
    
    # Get the sample pair of numbers  
    def __getitem__(self, index):

        # Find the maximum number of digits from the numbers
        max_digits = len(str(max(self.numbers)))
        
        # Calculate the first number by integer division of index by the length of the numbers list
        # Calculate the second number by modulo operation of index by the length of the numbers list
        a = self.numbers[index // len(self.numbers)]
        b = self.numbers[index % len(self.numbers)]

        # calculate the sum of the two numbers
        c = a + b

        # Pad c with leading zeros to match max_digits
        c_str = str(c).zfill(max_digits) 

        # Convert c to its ones, tens, hundreds, etc. digits
        # Say c = 123, then
        # - Convert to string and reverse: `'321'`
        # - Enumerate: `[(0, '3'), (1, '2'), (2, '1')]`
        # - Convert and multiply: `[3 * 10^0, 2 * 10^1, 1 * 10^2]` which results in `[3, 20, 100]`
        
        c_parts = [int(d) * 10 ** i for i, d in enumerate(c_str[::-1])]

        x = f"{a}+{b}="  # x = "123+456="

        if self.include_intermediate_steps:
            
            # x = "123+456=9+70+500=" if include_intermediate_steps is True
            x += "+".join([f"{p}" for p in c_parts]) + "="

        # x = "123+456=579" if include_intermediate_steps is False
        x += f"{c}"

        # the length of x should be less than or equal to the context length
        if len(x) > self.context_length:
            raise ValueError(f"Input length {len(x)} exceeds context length {self.context_length}")
        
        # Predict the next token in the sequence
        y = x[1:] # y = "23+456=9+70+500=579"

        # Tokenize the input and output sequences
        x = self.tokenizer.encode(x)
        y = self.tokenizer.encode(y)

        # Pad the sequences to the context length
        x = x + [self.tokenizer.pad_token_id] * (self.context_length - len(x))
        y = y + [self.tokenizer.pad_token_id] * (self.context_length - len(y))

        # Convert to PyTorch tensors
        x = torch.tensor(x, dtype=torch.long)
        y = torch.tensor(y, dtype=torch.long)

        # Mask the output tokens with -1 to only calculate loss on the other tokens
        mask_start = len(f"{a}+{b}=")
        y[:mask_start] = -1
        
        return x, y

## Number of Samples 
---
### 1. **Explanation of `__len__`**

The `__len__` method defines the total number of samples in your dataset. In your case:
```python
def __len__(self):
    return len(self.numbers) ** 2
```

- `len(self.numbers)` is the size of the range provided in the `numbers` attribute.
- For `numbers=range(100, 200)`, the size of `numbers` is $ 200 - 100 = 100 $.
- Therefore, `len(self.numbers) ** 2` evaluates to:
  $$
  100^2 = 10,000 \text{ samples.}
  $$

This means your dataset will be treated as having 10,000 samples.

---

### 2. **Impact on the DataLoader**

The PyTorch `DataLoader` divides the dataset into batches based on the `batch_size`. The total number of batches is:
$$
\text{Number of batches} = \frac{\text{Total samples}}{\text{Batch size}}
$$

Given:
- **Total samples (`__len__`)** = 10,000 (as defined by your dataset's `__len__` method).
- **Batch size** = 2 (as defined in the `dataloader`).

The number of batches becomes:
$$
\text{Number of batches} = \frac{10,000}{2} = 5,000
$$

This directly explains why `len(dataloader)` returns **5,000**.

---

### 3. **Why is `__len__` Squared?**
The design choice to square the length of `numbers` (`len(self.numbers) ** 2`) likely stems from how your dataset generates samples. For example:
- If each number in `numbers` is combined with every other number (e.g., for pairwise operations like addition), the total number of samples would be the square of the number of elements in `numbers`.

For $ \text{numbers} = [100, 101, 102, \ldots, 199] $:
- The dataset would generate all combinations of these numbers, resulting in:
  $$
  100 \times 100 = 10,000 \text{ samples.}
  $$
---

## `__getitem__` function
### 1. **How the `index` is Passed**
1. **When Using a `DataLoader`:**
   - When you pass the `AdditionDataset` instance to a PyTorch `DataLoader`, the `DataLoader` automatically generates indices for the dataset.
   - These indices are passed to the `__getitem__` method by the `DataLoader` for each batch.

   **Example:**
   ```python
   from torch.utils.data import DataLoader

   # Create an instance of the dataset
   dataset = AdditionDataset(tokenizer=tokenizer, context_length=32, numbers=range(100, 200))

   # Create a DataLoader
   dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

   # Iterate over the dataloader
   for x, y in dataloader:
       print(x, y)  # __getitem__ is called for each batch
   ```

   - **What Happens Internally:**
     - The `DataLoader` internally creates indices (e.g., `[0, 1, 2, ...]`). Numer of elements in list `[0, 1, 2, ...]` is equal to number of samples i.e `len(self.numbers) ** 2`.
     - It passes these indices to the dataset's `__getitem__` method.
     - For a batch size of 2, `__getitem__` will take indices `[0, 1]` for first batch and for second batch it will take `[2,3]` and so on. 
     - `index` argument is used by `__getitem__` to select the numbers `a` and `b` based on the logic:
        ```python
        a = self.numbers[index // len(self.numbers)]
        b = self.numbers[index % len(self.numbers)]
        ```
---
### 2. How `a`  and `b` are choosen and input and target are created?
Assume the `numbers` list is `[100, 101, 102, 103, 104]` and the `index` is `7`.

1. **Initialization**:
    ```python
    numbers = [100, 101, 102, 103, 104]
    index = 7
    ```

2. **Calculate the first number (`a`)**:
    ```python
    a = numbers[index // len(numbers)]
    ```
    - `index // len(numbers)` calculates the integer division of `index` by the length of the `numbers` list.
    - `len(numbers)` is `5`.
    - `index // len(numbers)` is `7 // 5` which equals `1`.
    - `a = numbers[1]` which is `101`.

3. **Calculate the second number (`b`)**:
    ```python
    b = numbers[index % len(numbers)]
    ```
    - `index % len(numbers)` calculates the modulo of `index` by the length of the `numbers` list.
    - `index % len(numbers)` is `7 % 5` which equals `2`.
    - `b = numbers[2]` which is `102`.

4. **Calculate the sum (c)**:
    ```python
    c = a + b
    ```
    - c = 101 + 102 which equals `203`.

5. **Return the sum (c)**:
    ```python
    return c
    ```
    So, when the `__getitem__` method is called with `index = 7`, the steps are as follows:

    - `a` is calculated as `101`.
    - `b` is calculated as `102`.
    - `c` is calculated as `203`.
    
6. `c_parts = [int(d) * 10 ** i for i, d in enumerate(str(c)[::-1])]` is converting a number `c`
 into its constituent digits, each multiplied by its corresponding place value (ones, tens, hundreds, etc.). Here's a step-by-step explanation:
 if `c` is `123`, the steps would be:
    - Convert to string and reverse: `'321'`
    - Enumerate: `[(0, '3'), (1, '2'), (2, '1')]`
    - Convert and multiply: `[3 * 10^0, 2 * 10^1, 1 * 10^2]` which results in `[3, 20, 100]`
    
So, `c_parts` will be `[3, 20, 100]`.

---

### 3. Input String
if `self.include_intermediate_steps` is `False` input will be `x = "123+456="` but if `self.include_intermediate_steps` is `True` then input will be `x = "101+102=3+00+200="`. The `x` will be tokenized using the tokenizer as `x = self.tokenizer.encode(x)` and pad it to context length as `x = x + [self.tokenizer.pad_token_id] * (self.context_length - len(x))`. 

Example: For `x = 44+354=8+90+300=398`
```python
x = [ 3 11 11 17 10 12 11 19 15 17 16  7 17 10  7  7 19 10 16 15  2  4  4  4  4  4  4  4  4  4  4  4]
```
---
### 4. Target String
Target string `y` is shifted by one character to predict the next token `y = "01+102=3+0+200=203"` (`self.include_intermediate_steps` is `True`). Then it is tokenized using the tokenizer as `y = self.tokenizer.encode(y)` and pad it to context length as `y = y + [self.tokenizer.pad_token_id] * (self.context_length - len(y))`. 

For the target sequence `y`, mask tokens corresponding to the input prompt (a+b=) with -1 so that the model doesn't calculate loss for those tokens. For `y = 4+354=8+90+300=398`:
```python 
y = [-1 -1 -1 -1 -1 -1 -1 15 17 16  7 17 10  7  7 19 10 16 15  2  4  4  4  4  4  4  4  4  4  4  4  4]
```

In [12]:
# Create Dataset
dataset = AdditionDataset(tokenizer, context_length, numbers=range(100, 200), include_intermediate_steps=True)

In [13]:
# Visualize the dataset
def visualize_dataset(dataset, tokenizer, max_num=10, seed=42 , skip_special_tokens=True):
    
    # Generate random indices
    indices = torch.randperm(
        len(dataset), generator=torch.Generator().manual_seed(seed)
    )[:max_num]

    # Retrieve input-target pairs for the selected indices
    for ix in indices:
        x, y = dataset[ix]

        print(f"=== Example {ix.item()} ===")  # Ensure index is displayed as a number

        # Configure NumPy to display arrays without truncation
        np.set_printoptions(linewidth=999)

        # Print the input tensor
        print(f"x = {x.numpy()}")
        print(f"y = {y.numpy()}")

        # Show lengths of x and y
        print(f"x length = {len(x)}")
        print(f"y length = {len(y)}")

        # Decode x and print
        x_decoded = tokenizer.decode(x.tolist(), skip_special_tokens=skip_special_tokens)
        print(f"x decoded = {x_decoded}")

        # Decode y and print, replacing -1 tokens with '_'
        y_list = y.tolist()
        num_unknowns = y_list.count(-1)
        if num_unknowns > 0:
            y_decoded = tokenizer.decode(y_list[num_unknowns:], skip_special_tokens=skip_special_tokens)
            print(f"y decoded = {'_' * num_unknowns}{y_decoded}")
        else:
            y_decoded = tokenizer.decode(y_list, skip_special_tokens=skip_special_tokens)
            print(f"y decoded = {y_decoded}")

        print("\n")  # Add a blank line between examples

## Visualizing the Dataset

#### **1. Generate Random Indices**
```python
indices = torch.randperm(
    len(dataset), generator=torch.Generator().manual_seed(seed)
)[:max_num]
```
- `torch.randperm(len(dataset))`: Creates a random permutation of indices based on the dataset size.
- `manual_seed(seed)`: Ensures reproducibility.
- `[:max_num]`: Selects the first `max_num` random indices.

---

#### **2. Loop Through the Selected Indices**
```python
for ix in indices:
    x, y = dataset[ix]
```
For each selected index:
- Retrieve the input (`x`) and target (`y`) tensors.

---

#### **3. Print Tensor Values**
```python
print(f"x = {x.numpy()}")
print(f"y = {y.numpy()}")
```
- Converts the tensors into NumPy arrays and prints their raw numerical values.

---

#### **4. Show Lengths of Tensors**
```python
print(f"x length = {len(x)}")
print(f"y length = {len(y)}")
```
- Displays the length of each sequence.

---

#### **5. Decode `x`**
```python
x_decoded = tokenizer.decode(x.tolist(), skip_special_tokens=skip_special_tokens)
print(f"x decoded = {x_decoded}")
```
- Converts the numerical `x` sequence back to a human-readable string using the tokenizer.

---

#### **6. Decode `y`**
```python
y_list = y.tolist()
num_unknowns = y_list.count(-1)
if num_unknowns > 0:
    y_decoded = tokenizer.decode(y_list[num_unknowns:], skip_special_tokens=skip_special_tokens)
    print(f"y decoded = {'_' * num_unknowns}{y_decoded}")
else:
    y_decoded = tokenizer.decode(y_list, skip_special_tokens=skip_special_tokens)
    print(f"y decoded = {y_decoded}")
```
- `y_list.count(-1)`: Counts how many tokens in `y` are masked (`-1`).
- Decodes only the unmasked portion of `y`.
- Replaces the masked portion with `_`.

---

In [14]:
visualize_dataset(dataset, tokenizer, max_num=2, seed=42, skip_special_tokens=True)

=== Example 7542 ===
x = [ 3  8 14 12 17  8 11  9 19 14 17  8  7 17 10  7  7 19 10  8 14  2  4  4  4  4  4  4  4  4  4  4]
y = [-1 -1 -1 -1 -1 -1 -1 -1 14 17  8  7 17 10  7  7 19 10  8 14  2  4  4  4  4  4  4  4  4  4  4  4]
x length = 32
y length = 32
x decoded = 175+142=7+10+300=317
y decoded = ________7+10+300=317


=== Example 8214 ===
x = [ 3  8 15  9 17  8  8 11 19 13 17 16  7 17  9  7  7 19  9 16 13  2  4  4  4  4  4  4  4  4  4  4]
y = [-1 -1 -1 -1 -1 -1 -1 -1 13 17 16  7 17  9  7  7 19  9 16 13  2  4  4  4  4  4  4  4  4  4  4  4]
x length = 32
y length = 32
x decoded = 182+114=6+90+200=296
y decoded = ________6+90+200=296




# Creating Dataloader


In [15]:
# Create DataLoader
data_loader = DataLoader(
            dataset=dataset,
            batch_size=1,
            shuffle=True,
            drop_last=True,  
            num_workers=0,)

In [16]:
def visualize_data_loader(data_loader, tokenizer, max_batches=2, skip_special_tokens=True):
    print("=== Data Loader ===")
    print(f"Number of batches = {len(data_loader)}\n")
    print(f"Showing first {max_batches} batches:")
    print(f"Number of samples in each batch = {data_loader.batch_size}\n")
    
    for i, (x_batch, y_batch) in enumerate(data_loader):
        if i >= max_batches:
            break
        
        print(f"=== Batch {i + 1} ===")
        
        for j in range(len(x_batch)):
            x = x_batch[j]
            y = y_batch[j]
            
            print(f"--- Sample {j + 1} ---")
            print(f"x = {x.numpy()}")
            print(f"y = {y.numpy()}")
            
            x_decoded = tokenizer.decode(x.tolist(), skip_special_tokens=skip_special_tokens)
            print(f"x decoded = {x_decoded}")
            
            y_list = y.tolist()
            num_unknowns = y_list.count(-1)
            y_decoded = tokenizer.decode(y_list[num_unknowns:], skip_special_tokens=skip_special_tokens)
            print(f"y decoded = {'_' * num_unknowns}{y_decoded}\n")

In [17]:
visualize_data_loader(data_loader, tokenizer, max_batches=2, skip_special_tokens=True)

=== Data Loader ===
Number of batches = 10000

Showing first 2 batches:
Number of samples in each batch = 1

=== Batch 1 ===
--- Sample 1 ---
x = [ 3  8 10 15 17  8 16 10 19  8 17 10  7 17 10  7  7 19 10 10  8  2  4  4  4  4  4  4  4  4  4  4]
y = [-1 -1 -1 -1 -1 -1 -1 -1  8 17 10  7 17 10  7  7 19 10 10  8  2  4  4  4  4  4  4  4  4  4  4  4]
x decoded = 138+193=1+30+300=331
y decoded = ________1+30+300=331

=== Batch 2 ===
--- Sample 1 ---
x = [ 3  8 15 11 17  8  9  9 19 13 17  7 17 10  7  7 19 10  7 13  2  4  4  4  4  4  4  4  4  4  4  4]
y = [-1 -1 -1 -1 -1 -1 -1 -1 13 17  7 17 10  7  7 19 10  7 13  2  4  4  4  4  4  4  4  4  4  4  4  4]
x decoded = 184+122=6+0+300=306
y decoded = ________6+0+300=306



# Step by Step Creation of GPT like Model

## 1. Multi-Head Attention

In [18]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=True):
        super().__init__()
        assert (d_out % num_heads == 0), \
        "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # Linear layers for queries, keys, and values with bias
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        # Linear layer for output projection
        self.out_proj = nn.Linear(d_out, d_out, bias=True)
        self.dropout = nn.Dropout(dropout)

        self.register_buffer(
            "mask", 
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Split into multiple heads
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        # Compute scaled dot-product attention
        attn_scores = queries @ keys.transpose(-2, -1)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf)
        attn_weights = self.dropout(torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1))

        # Context vector
        context_vec = (attn_weights @ values).transpose(1, 2).contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.dropout(self.out_proj(context_vec))  # Apply dropout after projection
        return context_vec

Here’s a detailed explanation of the `MultiHeadAttention` class line by line, with examples where necessary:

---

### **1. Class Initialization**
```python
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=True):
        super().__init__()
        assert (d_out % num_heads == 0), \
        "d_out must be divisible by num_heads"
```
- **Explanation:**
  - `MultiHeadAttention` is a custom PyTorch module for multi-head self-attention.
  - **Parameters:**
    - `d_in`: Input dimension (size of each input vector).
    - `d_out`: Output dimension (size of each output vector).
    - `context_length`: Maximum sequence length (e.g., for input tokens).
    - `dropout`: Dropout probability to regularize attention.
    - `num_heads`: Number of attention heads.
    - `qkv_bias`: Whether to include biases in the query, key, and value projections.
  - The `assert` ensures that `d_out` is divisible by `num_heads`, as each head must process an equal portion of `d_out`.

- **Example:**
  ```python
  attn = MultiHeadAttention(d_in=64, d_out=128, context_length=32, dropout=0.1, num_heads=4)
  ```

---

### **2. Defining Key Components**
```python
self.d_out = d_out
self.num_heads = num_heads
self.head_dim = d_out // num_heads
```
- **Explanation:**
  - `d_out`: Total output dimension.
  - `num_heads`: Number of attention heads.
  - `head_dim`: Dimension of each head. It is calculated as `d_out // num_heads`.

- **Example:**
  - For `d_out=128` and `num_heads=4`, `head_dim = 128 // 4 = 32`.

---

### **3. Query, Key, Value, and Output Projection Layers**
```python
self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
self.out_proj = nn.Linear(d_out, d_out, bias=True)
self.dropout = nn.Dropout(dropout)
```
- **Explanation:**
  - `W_query`, `W_key`, `W_value`: Linear layers to project the input `x` into query, key, and value spaces.
  - `out_proj`: Linear layer to project the concatenated outputs from all heads back to the original `d_out` dimension.
  - `dropout`: Regularizes attention weights during training.

---

### **4. Causal Mask for Attention**
```python
self.register_buffer(
    "mask", 
    torch.triu(torch.ones(context_length, context_length), diagonal=1)
)
```
- **Explanation:**
  - A **causal mask** ensures that the model cannot attend to future tokens.
  - `torch.triu`: Creates an upper triangular matrix of ones (0 below the diagonal).
  - The result is stored as a non-trainable buffer (`mask`).
- **Example:**
  For `context_length=4`, the mask looks like:
  ```
  [[0, 1, 1, 1],
   [0, 0, 1, 1],
   [0, 0, 0, 1],
   [0, 0, 0, 0]]
  ```

---

### **5. Forward Method**
#### **Input Shape**
- `x`: Tensor of shape `(batch_size, num_tokens, d_in)`.

---

#### **5.1 Compute Keys, Queries, Values**
```python
keys = self.W_key(x)
queries = self.W_query(x)
values = self.W_value(x)
```
- **Explanation:**
  - Projects the input `x` into keys, queries, and values using the learned linear layers.
  - Shape after projection: `(batch_size, num_tokens, d_out)`.

---

#### **5.2 Split into Multiple Heads**
```python
keys = keys.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
queries = queries.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
values = values.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
```
- **Explanation:**
  - Reshapes the `keys`, `queries`, and `values` to include the number of heads.
  - Shape changes:
    - `(batch_size, num_tokens, d_out)` → `(batch_size, num_tokens, num_heads, head_dim)`.
    - Then transposes to `(batch_size, num_heads, num_tokens, head_dim)`.

---

#### **5.3 Scaled Dot-Product Attention**
```python
attn_scores = queries @ keys.transpose(-2, -1)
```
- **Explanation:**
  - Computes attention scores by taking the dot product between `queries` and transposed `keys`.
  - Shape:
    - `queries`: `(batch_size, num_heads, num_tokens, head_dim)`.
    - `keys.transpose(-2, -1)`: `(batch_size, num_heads, head_dim, num_tokens)`.
    - Result: `(batch_size, num_heads, num_tokens, num_tokens)`.

```python
mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
attn_scores.masked_fill_(mask_bool, -torch.inf)
```
- **Explanation:**
  - Applies the causal mask to prevent attending to future tokens by replacing masked positions with `-inf`.

```python
attn_weights = self.dropout(torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1))
```
- **Explanation:**
  - Normalizes attention scores with `softmax` and scales by the square root of `head_dim` to stabilize gradients.
  - Applies dropout to the attention weights.

---

#### **5.4 Compute Context Vector**
```python
context_vec = (attn_weights @ values).transpose(1, 2).contiguous().view(b, num_tokens, self.d_out)
```
- **Explanation:**
  - Multiplies the attention weights with the `values` to compute the context vectors.
  - Shape:
    - `attn_weights`: `(batch_size, num_heads, num_tokens, num_tokens)`.
    - `values`: `(batch_size, num_heads, num_tokens, head_dim)`.
    - Result after multiplication and reshaping: `(batch_size, num_tokens, d_out)`.

```python
context_vec = self.dropout(self.out_proj(context_vec))
```
- **Explanation:**
  - Applies the output projection layer (`out_proj`) and dropout.

---

### **6. Return the Context Vectors**
```python
return context_vec
```
- The final output is a tensor of shape `(batch_size, num_tokens, d_out)`.

---

In [19]:
# Example of MultiHeadAttention
d_in = 32 # Input Embedding Size
d_out = 64 # Output Embedding Size
num_heads = 4 # Number of heads
dropout = 0.1 # Dropout rate
batch_size = 2 # Batch size
context_length = 16 # Number of tokens in the sequence

mha = MultiHeadAttention(d_in, d_out, context_length, dropout, num_heads)

# Create a random input tensor
x = torch.randn(batch_size, context_length, d_in)
attn = mha(x)
print(f"Input shape: {x.shape}")
print(f"Output shape: {attn.shape}")

Input shape: torch.Size([2, 16, 32])
Output shape: torch.Size([2, 16, 64])


### What is meaning of calculating attention weights? 
Attention weights represent how strongly each token is related to other tokens in the sequence. They indicate how much focus or importance each token places on every other token, effectively capturing semantic relationships.
In other words, attention weights answer the question: "__Which tokens are important for understanding this token?__"
### What is the meaning of calculating context vectors?
The context vector captures why a token is related to other tokens and in what way it is related. It provides a contextualized representation of a token, combining its own meaning with relevant information from other tokens.
The context vector answers the question: "__On what basis are these tokens semantically related?__" or "__What features or attributes describe this relationship?__"

## 2. Layer Normalization 
The idea behind the layer normalization is to adjust the activation(output) of a neural network layer to have a __mean__ of 0 and __variance__ of 1. 

The formula for **Layer Normalization** is as follows:

$$
\text{LayerNorm}(x) = \gamma \cdot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta
$$

where:

- $ x $: Input vector to be normalized (of shape $(n, d)$ where $n$ is the batch size and $d$ is the feature dimension).
- $\mu = \frac{1}{d} \sum_{i=1}^{d} x_i $: Mean of the features for each input, calculated across the feature dimension.
- $ \sigma^2 = \frac{1}{d} \sum_{i=1}^{d} (x_i - \mu)^2 $: Variance of the features for each input.
- $\epsilon$: Small constant added to variance to avoid division by zero, typically a very small value like $^{-5}$.
- $\gamma$: Learnable scaling parameter (of shape $(d,)$).
- $\beta$: Learnable shifting parameter (of shape $(d,)$).

**Explanation:**

- **Normalization**: The input $x$ is normalized by subtracting the mean ($\mu$) and dividing by the standard deviation ($\sqrt{\sigma^2 + \epsilon}$). This process transforms the input to have zero mean and unit variance.
- **Scaling and Shifting**: After normalization, the input is scaled by a learnable parameter $\gamma$ and shifted by a learnable parameter $\beta$. These parameters allow the model to learn an appropriate scale and shift for the normalized values, enabling the network to maintain the representational power of the original input.

Layer normalization is different from batch normalization in that it normalizes across the feature dimension for each input individually rather than across the batch dimension, making it especially suitable for recurrent or sequence-based models.

In [20]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()

        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

## 3. Feed Forward Network
### Gaussian Error Linear Unit(GELU) Activation Function
The GELU activation function can be implemented in several ways. The extact version is defined as 
$$
GELU(x) = x.\Phi(x)
$$
where, $\Phi(x)$ is the cumulative distribution fuction of the standard Gaussian distribution. 

The computationally cheaper approximation if above equation is 
$$
GELU(x) \approx 0.5\,.\,x\,.\left(1+tanh\left[\sqrt\frac{2}{\pi}.(x + 0.044715.x^3)\right]\right)
$$

In [21]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
    torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))
    ))

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), # The first linear layer increases the embedding dimension by a factor of 4
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]) # The second linear layer decreases the embedding dimension by a factor of 4
        )
    def forward(self, x):
        return self.layers(x)

### Expansion and Contrcation of Layers
`FeedForward` module plays a crucial role in enhancing the model's ability to learn from and generalize the data. Although the input and output dimensions of the module are the same, it internally expands the embedding dimension into a higher-dimensional space through the first linear layer as illustrated in the figure abvoe. The expansion is followed by a nonlinear GELU activation and then contraction back to the original dimesnion with the second linear transformation.

## 4. Transformer Block with residual connection
Now, we'll connect layer normalization, __GELU activations__, feed forward module and __shortcut connections__ in a transofrmer block, which is the final building block of the GPT architecture. Figure below shows a transformer block that combines several components, including the masked multi-head attention and `FeedForward` module. __Layer normalization__ is applied before each of these two components and __dropout__ is applied after them to regularize the model and prevent overfitting. This is also known as _Pre-LayerNorm_. Older architectures, such as original transformer model, applied layer normalization after the self-attention and feed forward networks instead, known as _Post-LayerNorm_, which often leads to worse traning dynamics. 

The transformer block maintains the input dimensions in its output, indicating that the transfomer architecture processes sequences of data without altering their shape throughout the network. Actually, the actual output is the context vector that encapsulates information from the entire input sequence. 

__The preservation of shape through the transformer block architecture is not incidental but a crucial aspect of its design.__ 


In [22]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout = cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        shortcut = x # Shortcut connection for attention block
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut # Add original input back

        shortcut = x # Shortcut connection for feed forward block
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut # Add the original input back
        return x

## 5. GPT Architecture

In [None]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range (cfg["n_layers"])]
        )
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_leng = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(
            torch.arange(seq_leng, device=in_idx.device)
        )

        # Add token and positional embeddings, apply dropout
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)

        # Transformer blocks
        x = self.trf_blocks(x)

        # Final layer normalization and output projection
        x = self.final_norm(x)
        logits = self.out_head(x)

        return logits
    

Thanks to the `TransformerBlock` class, the `GPTModel` class is realtively small and compact. 

The `__init__` constrcutor of this `GPTModel` class initializes the token and positional embedding layers using the configurations passed in via a Python dictionary `cfg`. These embedding layers are responsible for converting input token indices into dense vectors and adding positinal information. Next, the `__init__` method creates a sequential stack of `TransformerBlock` modules equal to the number of layers specified in `cfg`. 

Following the transformer blocks, a `LayerNorm` layers is applied, standardizing the outputs from the transformer blocks to stabilize the learning process. Finally, a linear output head wthout bias is defined which projects the transfomer's output inot a vocabulary space of the tokenizer to generate logits for each token in the vocabulary. 

The forward method takes a batch of input token indices, computes their embeddings, applies the positional embeddings, passes the sequence through the transformer blocks, normalizes the final output, and then computes the logits, representing the next token's unnormazlied probabilities.

In [None]:
# Define the model configuration
model_config = {
    "vocab_size": tokenizer.vocab_size,        # Vocabulary Size
    "context_length": 32,                      # Context Length
    "emb_dim" : 48,                            # Embedding dimension
    "n_heads": 4,                              # Number of attention heads
    "n_layers": 3,                             # Number of layers
    "drop_rate": 0.1,                          # Dropout rate     
    "qkv_bias":True                            # Query-Key-Value bias
}

In [25]:
torch.manual_seed(123)
# Instantiate the GPTModel
model = GPTModel(model_config)

In [26]:
def print_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total Number of Parameters: {total_params:,}")
    actual_total_params = (
        total_params - sum(
            p.numel() for p in model.out_head.parameters()
        )
    )

    total_size_bytes = total_params * 4
    total_size_mb = total_size_bytes / (1024 * 1024)

    print(f"Actual Number of trainable parameters "
          f"considering weight tying: {actual_total_params:,}\n"
          f"Total size of the model is: {total_size_mb:.2f} MB")
# Call the function with the model
print_model_parameters(model)

Total Number of Parameters: 88,368
Actual Number of trainable parameters considering weight tying: 87,408
Total size of the model is: 0.34 MB


# Training the Model


In [None]:
def train_and_evaluate(
    model,
    train_dataset,
    test_dataset=None,
    config=None,
    device=None,
    save_path="model.pth"
):
    """
    Train and evaluate the given model.

    Args:
        model: The PyTorch model to train.
        train_loader: DataLoader for the training dataset.
        test_loader: DataLoader for the test dataset (optional).
        config: Configuration dictionary containing hyperparameters like
                learning rate, weight decay, batch size, max_epochs, etc.
        device: Device to use ('cuda', 'mps', or 'cpu'). If None, it will be auto-detected.
    """
    # Determine device
    if device is None:
        device = (
            "cuda" if torch.cuda.is_available() else
            "mps" if torch.backends.mps.is_available() else
            "cpu"
        )
    
    # Move model to device
    model = torch.nn.DataParallel(model).to(device)

    # Optimizer setup
    def create_optimizer(model, config):
        no_decay = ["bias", "LayerNorm.weight"]
        params_decay = [
            p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)
        ]
        params_no_decay = [
            p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)
        ]
        optim_groups = [
            {"params": params_decay, "weight_decay": config["weight_decay"]},
            {"params": params_no_decay, "weight_decay": 0.0},
        ]
        return AdamW(optim_groups, lr=config["learning_rate"], betas=config["betas"])

    optimizer = create_optimizer(model, config)

    # Data Loader
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=config["batch_size"],
        shuffle=True,
        drop_last=True,
        num_workers=config["num_workers"]
    )

   
    test_loader = (
        DataLoader(
            dataset=test_dataset,
            batch_size=config["batch_size"],
            shuffle=False,
            drop_last=False,  
            num_workers=config["num_workers"],
        )
        if test_dataset
        else None
    )

    # Single epoch logic
    def run_epoch(loader, is_train):
        model.train(is_train)
        total_loss = 0.0
        loader = tqdm(loader, desc="Training" if is_train else "Evaluating")
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            with torch.set_grad_enabled(is_train):
                logits= model(x)
                # loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), y.flatten(), ignore_index=-1)
                loss = torch.nn.functional.cross_entropy(
                    logits.view(-1, logits.size(-1)), y.view(-1), ignore_index=-1
                )
                loss = loss.mean()  # Handle multiple GPUs
                total_loss += loss.item()

                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config["grad_norm_clip"])
                    optimizer.step()
            loader.set_postfix(loss=loss.item())
        return total_loss / len(loader)
    
    # Track training and validation loss
    train_losses = []
    valid_losses = []

    # Training loop
    for epoch in range(config["max_epochs"]):
        print(f"Epoch {epoch + 1}/{config['max_epochs']}")
        train_loss = run_epoch(train_loader, is_train=True)
        train_losses.append(train_loss)
        print(f"Train Loss: {train_loss:.4f}")

        if test_dataset is not None:
            test_loss = run_epoch(test_loader, is_train=False)
            valid_losses.append(test_loss)
            print(f"Test Loss: {test_loss:.4f}")
    
    # Save the trained model
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

    return train_losses, valid_losses

### **Function Header and Docstring**
```python
def train_and_evaluate(
    model,
    train_dataset,
    test_dataset=None,
    config=None,
    device=None,
    save_path="model.pth"
):
    """
    Train and evaluate the given model.

    Args:
        model: The PyTorch model to train.
        train_loader: DataLoader for the training dataset.
        test_loader: DataLoader for the test dataset (optional).
        config: Configuration dictionary containing hyperparameters like
                learning rate, weight decay, batch size, max_epochs, etc.
        device: Device to use ('cuda', 'mps', or 'cpu'). If None, it will be auto-detected.
    """
```

- **Purpose**: Defines a function to train and evaluate a model using PyTorch.
- **Arguments**:
  - `model`: The neural network model to train and evaluate.
  - `train_dataset`: The training dataset.
  - `test_dataset` (optional): The test/validation dataset.
  - `config`: A dictionary containing training hyperparameters.
  - `device`: The computational device (GPU/CPU) to use.
  - `save_path`: Path to save the trained model's weights.

---

### **Device Selection**
```python
if device is None:
    device = (
        "cuda" if torch.cuda.is_available() else
        "mps" if torch.backends.mps.is_available() else
        "cpu"
    )
```

- **Checks if a specific `device` is provided**. If not:
  - Prioritizes `cuda` (NVIDIA GPU) if available.
  - Falls back to `mps` (Apple silicon) or `cpu`.

---

### **Move Model to Device**
```python
model = torch.nn.DataParallel(model).to(device)
```

- Wraps the model in `DataParallel` to allow multi-GPU training.
- Moves the model to the selected `device`.

---

### **Optimizer Setup**
```python
def create_optimizer(model, config):
    no_decay = ["bias", "LayerNorm.weight"]
    params_decay = [
        p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)
    ]
    params_no_decay = [
        p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)
    ]
    optim_groups = [
        {"params": params_decay, "weight_decay": config["weight_decay"]},
        {"params": params_no_decay, "weight_decay": 0.0},
    ]
    return AdamW(optim_groups, lr=config["learning_rate"], betas=config["betas"])
```

- **Purpose**: Defines an optimizer with different weight decay for parameters.
  - `no_decay`: Specifies parameters like `bias` and `LayerNorm.weight` that should not have weight decay.
  - Groups parameters based on their need for weight decay.
  - Returns an `AdamW` optimizer with the specified learning rate and betas.

```python
optimizer = create_optimizer(model, config)
```
- Instantiates the optimizer using the `create_optimizer` function.

---

### **Data Loaders**
```python
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=config["batch_size"],
    shuffle=True,
    drop_last=True,
    num_workers=config["num_workers"]
)
```

- Creates a PyTorch `DataLoader` for the training dataset with:
  - Shuffling for randomness.
  - Dropping the last batch if it's incomplete.
  - `num_workers`: Number of parallel data loading workers.

```python
test_loader = (
    DataLoader(
        dataset=test_dataset,
        batch_size=config["batch_size"],
        shuffle=False,
        drop_last=False,  
        num_workers=config["num_workers"],
    )
    if test_dataset
    else None
)
```

- Creates a `DataLoader` for the test dataset (if provided), without shuffling and without dropping incomplete batches.

---

### **Single Epoch Logic**
```python
def run_epoch(loader, is_train):
    model.train(is_train)
    total_loss = 0.0
    loader = tqdm(loader, desc="Training" if is_train else "Evaluating")
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        with torch.set_grad_enabled(is_train):
            logits= model(x)
            loss = torch.nn.functional.cross_entropy(
                logits.view(-1, logits.size(-1)), y.view(-1), ignore_index=-1
            )
            loss = loss.mean()  # Handle multiple GPUs
            total_loss += loss.item()

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), config["grad_norm_clip"])
                optimizer.step()
        loader.set_postfix(loss=loss.item())
    return total_loss / len(loader)
```

- **Purpose**: Executes one epoch (training or evaluation).
  - **`is_train`**:
    - `True`: Training mode (enables gradient computation).
    - `False`: Evaluation mode (disables gradient computation).
  - Uses `tqdm` for a progress bar.
  - Iterates through batches:
    - Moves input (`x`) and target (`y`) to the device.
    - Computes model predictions (`logits`).
    - Calculates cross-entropy loss (ignoring index `-1`).
    - For training:
      - Zeroes gradients.
      - Backpropagates loss.
      - Clips gradients (to avoid explosion).
      - Updates model parameters using the optimizer.
  - Tracks total loss.

---

### **Training Loop**
```python
train_losses = []
valid_losses = []

for epoch in range(config["max_epochs"]):
    print(f"Epoch {epoch + 1}/{config['max_epochs']}")
    train_loss = run_epoch(train_loader, is_train=True)
    train_losses.append(train_loss)
    print(f"Train Loss: {train_loss:.4f}")

    if test_dataset is not None:
        test_loss = run_epoch(test_loader, is_train=False)
        valid_losses.append(test_loss)
        print(f"Test Loss: {test_loss:.4f}")
```

- Tracks training and validation losses over epochs.
- For each epoch:
  - Runs training (`run_epoch` with `is_train=True`).
  - Optionally runs evaluation (`run_epoch` with `is_train=False`).
  - Logs losses.

---

### **Save Trained Model**
```python
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")
```

- Saves the model's state dictionary (weights and biases) to the specified path.

---

### **Return Results**
```python
return train_losses, valid_losses
```

- Returns lists of training and validation losses for further analysis.

In [None]:
# Traning Configuration
config = {
    "dataset_range": range(0,1000),  # Testing dataset range
    "learning_rate": 3e-4,
    "weight_decay": 0.01,
    "betas": (0.9, 0.999),
    "batch_size": 100,
    "num_workers": 0,
    "grad_norm_clip": 1.0,
    "max_epochs": 6,
}

In [None]:
# Create train and test dataloaders
addition_dataset = AdditionDataset(
    context_length=model_config["context_length"],
    tokenizer=tokenizer,
    numbers=config["dataset_range"],
    include_intermediate_steps=True
)

# Define the proportions for traning and validation datasets
train_size = int(0.8 * len(addition_dataset)) # 80% for training
val_size = len(addition_dataset) - train_size # 20 % for validation

# Split the dataset into traning and validation datasets
train_dataset, val_dataset = random_split(
    addition_dataset, [train_size, val_size]
)

In [None]:
# Number of Samples in Train and Test Dataset
len(train_dataset), len(val_dataset)

In [None]:
# Train and Evaluate the Model
train_losses, valid_losses =  train_and_evaluate(
    model=model,
    train_dataset=train_dataset,
    test_dataset=val_dataset,
    config=config,
    save_path="trained_model_1000_v2.pth"
)

# Generation Function 

In [27]:
# A function for the GPT model to generate tokens
def generate(model, indx, max_new_tokens, temperature=1.0, top_k=None, stop_tokens=None):
    
    for _ in range(max_new_tokens):
        context_length = model.pos_emb.weight.shape[0]
        # If given sequence is longer than the context length we must trim it to context length
        if indx.shape[-1] > context_length:
            indx = indx[:, -context_length:]
        with torch.no_grad():
            logits = model(indx)
        logits = logits[:, -1, :] / temperature
        
        # Sample tokens to the top-k most likely tokens and exclude all other tokens
        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            # Minimum value of top k logits
            min_value = top_logits[:, -1]
            # Set all logits less than min_value to -inf
            logits = torch.where(logits < min_value, torch.tensor(-np.inf).to(logits.device), logits)

        if temperature > 0.0:
            # Apply temperature scaling
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)

        else:
            # If temperature is 0, use argmax to select the next token
            index_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Append the generated token to the input sequence
        indx = torch.cat((indx, index_next), dim=1)

        # Check if the generated token is in the stop tokens
        if stop_tokens is not None and index_next.item() in stop_tokens:
            #print("Stop token generated. Ending sequence.", index_next.item())
            break

    return indx

### Explanation of the Code

This function `generate` is designed for generating a sequence of tokens using a GPT model. Here's a line-by-line breakdown of the code:

---

### **Function Header**
```python
def generate(model, indx, max_new_tokens, temperature=1.0, top_k=None, stop_tokens=None):
```

- **Purpose**: Generates a sequence of tokens using a pretrained GPT model.
- **Parameters**:
  - `model`: The GPT model to use for token generation.
  - `indx`: A tensor containing the input token sequence (starting context).
  - `max_new_tokens`: The maximum number of tokens to generate.
  - `temperature`: Controls randomness in sampling; lower values make the model more deterministic.
  - `top_k`: Restricts token selection to the top-k most probable tokens (for diversity control).
  - `stop_tokens`: A list of tokens that, if generated, will stop the generation process early.

---

### **Token Generation Loop**
```python
for _ in range(max_new_tokens):
```

- Loops for up to `max_new_tokens` iterations to generate tokens one at a time.

---

### **Context Trimming**
```python
context_length = model.pos_emb.weight.shape[0]
if indx.shape[-1] > context_length:
    indx = indx[:, -context_length:]
```

- **Why?** GPT models have a fixed context length (the number of tokens they can attend to at once).
- If the input sequence (`indx`) exceeds the model's context length, it trims the input to the most recent tokens (last `context_length` tokens).

---

### **Get Model Predictions**
```python
with torch.no_grad():
    logits = model(indx)
```

- Passes the input sequence (`indx`) through the model to get the logits (raw predictions for each token in the vocabulary).
- Uses `torch.no_grad()` to prevent gradient computation during inference (saves memory and computation).

---

### **Extract Predictions for Next Token**
```python
logits = logits[:, -1, :] / temperature
```

- Extracts logits for the last token in the sequence (next token prediction).
- Divides by `temperature`:
  - **High Temperature (>1.0)**: Makes predictions more random.
  - **Low Temperature (<1.0)**: Focuses on high-probability predictions.
  - **Temperature = 0**: Fully deterministic (argmax is used later).

---

### **Apply Top-k Sampling**
```python
if top_k is not None:
    top_logits, _ = torch.topk(logits, top_k)
    min_value = top_logits[:, -1]
    logits = torch.where(logits < min_value, torch.tensor(-np.inf).to(logits.device), logits)
```

- **Top-k Sampling**: Limits the token selection to the top-k most probable tokens.
  - Finds the smallest value in the top-k logits (`min_value`).
  - Sets all other logits to `-inf`, ensuring they are ignored during sampling.

---

### **Sample Next Token**
```python
if temperature > 0.0:
    logits = logits / temperature
    probs = torch.softmax(logits, dim=-1)
    index_next = torch.multinomial(probs, num_samples=1)
```

- If `temperature > 0`, applies softmax to the logits to convert them into probabilities.
- Uses `torch.multinomial` to randomly sample a token based on these probabilities.

```python
else:
    index_next = torch.argmax(logits, dim=-1, keepdim=True)
```

- If `temperature == 0`, selects the token with the highest probability (`argmax`).

---

### **Update Input Sequence**
```python
indx = torch.cat((indx, index_next), dim=1)
```

- Appends the newly generated token (`index_next`) to the input sequence (`indx`) for the next iteration.

---

### **Check for Stop Tokens**
```python
if stop_tokens is not None and index_next.item() in stop_tokens:
    break
```

- If a stop token is generated, the function stops further token generation.

---

### **Return Generated Sequence**
```python
return indx
```

- Returns the complete sequence of tokens, including the original input and all newly generated tokens.

---

### **Key Features**
1. **Context Trimming**: Handles sequences longer than the model's context length.
2. **Temperature Scaling**: Balances randomness vs. determinism in token selection.
3. **Top-k Sampling**: Ensures diversity by limiting token selection to top-k candidates.
4. **Stop Tokens**: Enables early termination based on specific conditions.

This function can be used to generate text from a GPT model with fine control over randomness, diversity, and termination criteria.

# Model Inference

## Load the Saved Model

In [None]:
model = GPTModel(model_config)

# Load the state dictionary
state_dict = torch.load("trained_model_1000_v1.pth", map_location=torch.device('cpu'))

# Remove 'module.' prefix from the keys
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    # k[7:] is "module." characters
    name = k[7:] if k.startswith("module.") else k  # remove 'module.' prefix in the keys
    new_state_dict[name] = v

model.load_state_dict(new_state_dict)
model.eval()  # Ensure the model is in evaluation mode
model.to("cpu")  # Move the model to the CPU

GPTModel(
  (tok_emb): Embedding(20, 48)
  (pos_emb): Embedding(32, 48)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=48, out_features=48, bias=True)
        (W_key): Linear(in_features=48, out_features=48, bias=True)
        (W_value): Linear(in_features=48, out_features=48, bias=True)
        (out_proj): Linear(in_features=48, out_features=48, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=48, out_features=192, bias=True)
          (1): GELU()
          (2): Linear(in_features=192, out_features=48, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=48, out_features=48,

---

### **Model Initialization**
```python
model = GPTModel(model_config)
```

- **Creates a new GPT model instance** using `GPTModel` with the specified `model_config`.
- This new instance will later be updated with the trained parameters from the saved model file.

---

### **Load the State Dictionary**
```python
state_dict = torch.load("trained_model_1000_v1.pth", map_location=torch.device('cpu'))
```

- **Loads the saved model weights** from the file `trained_model_1000_v1.pth` into `state_dict`.
- The `map_location=torch.device('cpu')` ensures that the weights are loaded onto the CPU, regardless of where they were originally saved (e.g., GPU).

Using both `map_location="cpu"` and `model.to("cpu")` ensures that:

1. **`map_location="cpu"`**: Loads the saved model weights onto the CPU, regardless of where they were saved (GPU or CPU).
2. **`model.to("cpu")`**: Configures the model itself to run computations on the CPU.

Together, they prevent device mismatches and ensure the model and weights align for consistent CPU-based inference.

---

### **Adjust the Keys in the State Dictionary**
```python
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k[7:] if k.startswith("module.") else k  # remove 'module.' prefix in the keys
    new_state_dict[name] = v
```

- **Problem**: The saved `state_dict` contains keys prefixed with `"module."`, which occurs when models are trained using `torch.nn.DataParallel`. This prefix must be removed for compatibility with a non-`DataParallel` model.
- **Solution**:
  - Loops through all key-value pairs in the `state_dict`.
  - Checks if a key starts with `"module."`.
  - If true, removes the prefix by slicing (`k[7:]`).
  - Creates a new `state_dict` (`new_state_dict`) without the `"module."` prefix.

---

### **Load the Processed State Dictionary**
```python
model.load_state_dict(new_state_dict)
```

- Updates the initialized model (`model`) with the weights from the processed `new_state_dict`.
- Ensures that the model now has the trained parameters.

---

### **Set Evaluation Mode**
```python
model.eval()
```

- **Switches the model to evaluation mode**:
  - Disables dropout and batch normalization layers, ensuring deterministic behavior during inference.

---

### **Move Model to CPU**
```python
model.to("cpu")
```

- Moves the model's parameters and buffers to the CPU.
- This step ensures compatibility with systems that don't have a GPU or when inference will occur on the CPU.

---

### **Summary of Steps**
1. **Initialize the Model**: A new model instance is created using the desired configuration.
2. **Load Weights**: The trained weights are loaded from a saved file.
3. **Key Adjustment**: Any prefixes from multi-GPU training (`module.`) are removed to ensure compatibility.
4. **Load State Dictionary**: The processed weights are loaded into the model.
5. **Prepare for Evaluation**: The model is switched to evaluation mode and moved to the CPU.

---

### **Result**
The `model` is now fully loaded with trained weights, in evaluation mode, and ready for inference on the CPU.

## Model Output

In [50]:
# Ask the user for two inputs
a = 0
b = 0
# Create the input string
input_str = f"{a}+{b}="
input_ids = torch.tensor(tokenizer.encode(input_str)[:-1], dtype=torch.long).unsqueeze(0)

generated_tokens = generate(
    model=model,
    indx=input_ids,  # Starting token
    max_new_tokens=32,  # Number of tokens to generate
    temperature=1.0,
    top_k=10,
    stop_tokens=[tokenizer._convert_token_to_id(x) for x in ["[SEP]", "[CLS]"]])

In [51]:
import sys
import time

tokens = tokenizer.convert_ids_to_tokens(generated_tokens[0].tolist(), skip_special_tokens=True)

for token in tokens:
    sys.stdout.write(token)  # Print the token horizontally
    sys.stdout.flush()  # Ensure it's displayed immediately
    time.sleep(0.2)  # Add a delay in seconds to slow down the output

0+0=0+0+0=0