In [6]:
import os
import pandas as pd
import sys
import json
import time
from pathlib import Path
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

In [8]:
# Path to the dataset folder one level above the current working directory
DATA_PATH = os.path.join(".", "tig_dataset")
print(os.path.abspath(DATA_PATH))  # Shows the full path


d:\projects\aman\transformer\tig_dataset


In [9]:
import torch
import torch.nn as nn

In [14]:
def dataset_List(data_path):
    """
    Reads our trainig data (list of files that have the training text)
    
    Returns list of list sentences for each file
    """
    list_files=os.listdir(data_path)
    dataset_list=[]
    for file_path in list_files:
        if file_path.endswith(".txt"):
            with open(os.path.join(data_path,file_path), 'r') as file:
                lines=file.readlines()
            sentences=[]
            for line in lines:
                if line!="\n":
                    sentences.append(line)
                else:
                    dataset_list.append(sentences)
                    sentences=[]
    return dataset_list

In [15]:
list_files=os.listdir(DATA_PATH)
content = ""
for filename in list_files:
    if filename.endswith(".txt"):
        print(f"Reading file: {filename}")
        with open(os.path.join(DATA_PATH, filename), 'r', encoding='utf-8') as f:
            content += f.read()


Reading file: bible_web_data.txt
Reading file: commun_aff.txt
Reading file: menfesawi.txt
Reading file: misc_data.txt
Reading file: poemes.txt
Reading file: social_media.txt
Reading file: tghat_web_data.txt
Reading file: tig_books.txt
Reading file: tvml_t.txt
Reading file: tvml_v.txt
Reading file: xlum_web_data.txt


In [17]:
#Setting the parameters of the GPT2
GPT_CONFIG = {
    "vocab_size": 16000,  # Vocabulary size
    "context": 1024,      # Context length in number of tokens
    "emb_dim": 768,       # Embedding dimension
    "n_heads": 12,        # Number of attention heads
    "n_layers": 12,       # Number of layers
    "dropout_rate": 0.1,  # Dropout rate
    "qkv_bias": False     # Query-Key-Value bias
}

### Building the GPT model 

### Building the attention layer (self and multi-head attention layers)

#### The Transformer block which is composed of the above layers, linear Layer, GELU, Feedforward and attnetion heads 

#### GPT (Generative pretraining models) which is composzed of multiple transformer blocks and softmax layetr at the top. 
#### It takes embedings of the tokens and their positions

### Playing around the transformer block

In [25]:
torch.manual_seed(123)

x = torch.rand(2, 4, 768)  # Shape: [batch_size, num_tokens, emb_dim]
block = TransformerBlock(GPT_CONFIG)
output = block(x)

print("Input shape:", x.shape)
print("Output shape:", output.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


#### Getting the tokenizer: Here we use the GPT2 tokenizer. However, we have also trained our own bpe based tokenizer for the model

In [26]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("./tokenizers/Tig_BPE_16000")
# tokenizer = Tokenizer.from_file("tokenizer_TIG_Trial.json")

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
tokenizer.pad_token = "[መልእ]"
tokenizer.cls_token = "[ጀመረ]"
tokenizer.sep_token = "[ከፋሊ]"
tokenizer.mask_token = "[ሽፉን]"
tokenizer.unk_token = "[ለየለ]"

### Playing aroung with the tokenizer

In [28]:
tokenizer.padding_size = 'right'
tokenizer.add_special_tokens({'pad_token': '[መልእ]'})

0

In [29]:
C = 128
batch = []

txt1 = "ስነ ስርዓት ብሄራዊ ሓዘን፣መርድእን ዝኽርን ስውኣት ሃርበርኛታት ትግራይ ተግባራዊ ንምግባር ካብ ክልል ክሳብ ጣብያ ዝተፈላለዩ ኮሚቴታት ተጣይሾም ምይይጥ ይካየድ ከም ዘሎ እናተሓበረ እዩ።"
txt2 = "ደቅና ዝወደቕሉ ዕላማ ህዝቢ ትግራይ ካብ ብርሰት ንምድሓን ስለዝኾነ መስዋእቶም ህያውን ብኽብሪ እናተዘከረ ዝነብርን እዩ።"

batch.append(torch.tensor(tokenizer.encode(txt1,padding=True,truncation=True,max_length=C)))
batch.append(torch.tensor(tokenizer.encode(txt2,padding=True,truncation=True,max_length=14)))

batch = tokenizer(
    [txt1, txt2],
    padding="max_length",  # pad to max_length
    truncation=True,
    max_length=35,         # or whatever length you want
    return_tensors="pt"    # returns PyTorch tensors
)["input_ids"]

print(batch.shape)
print(batch)

# batch = torch.stack(batch, dim=0)
print(batch)

torch.Size([2, 35])
tensor([[12532,  1756,  5733,  2531,   329,   615,   221,   646,  1783,   518,
          7969, 13986,   355,   896,   745,  8944,  3521,   288,  1972,   606,
          6694,  1490,  5058,   265,  5213,   198, 13030, 12298,  5072,   262,
           402,  8177,   353,   300,   204],
        [  239,  9499,  4970, 14472,  1678,   611,   745,   288, 15817,  6887,
          2388,  1824,  6068,  2219,   296,  7624,  1018,   279,  6035,   377,
          3846,   300,   204,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2]])
tensor([[12532,  1756,  5733,  2531,   329,   615,   221,   646,  1783,   518,
          7969, 13986,   355,   896,   745,  8944,  3521,   288,  1972,   606,
          6694,  1490,  5058,   265,  5213,   198, 13030, 12298,  5072,   262,
           402,  8177,   353,   300,   204],
        [  239,  9499,  4970, 14472,  1678,   611,   745,   288, 15817,  6887,
          2388,  1824,  6068,  2219,   296,  7624,  10

In [30]:
GPT_CONFIG["vocab_size"]=tokenizer.vocab_size

In [48]:
GPT_CONFIG

{'vocab_size': 16000,
 'context': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'dropout_rate': 0.1,
 'qkv_bias': False}

#### Trying the GPT Models

In [31]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG)

out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[12532,  1756,  5733,  2531,   329,   615,   221,   646,  1783,   518,
          7969, 13986,   355,   896,   745,  8944,  3521,   288,  1972,   606,
          6694,  1490,  5058,   265,  5213,   198, 13030, 12298,  5072,   262,
           402,  8177,   353,   300,   204],
        [  239,  9499,  4970, 14472,  1678,   611,   745,   288, 15817,  6887,
          2388,  1824,  6068,  2219,   296,  7624,  1018,   279,  6035,   377,
          3846,   300,   204,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2]])

Output shape: torch.Size([2, 35, 16000])
tensor([[[ 0.1725, -0.7187, -0.6800,  ..., -0.7375, -0.5550, -0.3602],
         [-0.3978, -0.4335, -1.2455,  ...,  0.5939,  0.0061, -0.7532],
         [ 1.0028, -0.3691, -0.7015,  ..., -0.7258, -1.1962, -0.2808],
         ...,
         [ 0.3745, -0.2554,  0.1445,  ..., -0.0289, -0.0688,  0.5678],
         [ 0.1980,  0.3967,  0.7678,  ..., -0.1246, -0.4080, -0.4212],
         [

In [32]:
out.shape

torch.Size([2, 35, 16000])

In [33]:
out.flatten(0,1).shape

torch.Size([70, 16000])

In [34]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 110,390,784


In [35]:
print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

Token embedding layer shape: torch.Size([16000, 768])
Output layer shape: torch.Size([16000, 768])


In [36]:
# Calculate the total size in bytes (assuming float32, 4 bytes per parameter)
size_in_gb = (total_params * 4)/(1024 * 1024 * 1024)

print(f"Total memory size of the model: {size_in_gb:.2f} GB")


Total memory size of the model: 0.41 GB


In [37]:
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 35, 16000])
tensor([[[ 0.3796, -0.3549, -0.2432,  ..., -0.3788, -0.2950, -0.0339],
         [-0.8383, -1.0397, -0.6864,  ...,  0.9860, -0.4093, -0.6160],
         [ 0.8875, -0.3274, -0.0910,  ..., -0.9472, -1.2487, -0.3357],
         ...,
         [ 0.4272, -0.2393,  0.6474,  ..., -0.3958, -0.0268, -0.0401],
         [ 0.2197,  0.4848,  0.9294,  ..., -0.0233, -0.1967,  0.2135],
         [-0.0419,  0.8995, -0.3028,  ...,  0.0717,  0.5144,  0.4318]],

        [[-0.4489, -0.0767, -0.0254,  ..., -0.2010,  0.2316,  0.4263],
         [-0.1254, -0.1688, -0.3220,  ...,  0.5243,  0.7971, -0.3397],
         [ 0.4661, -0.0124, -0.1558,  ...,  0.6615, -0.4667, -0.8819],
         ...,
         [ 1.1056, -0.7179, -0.2374,  ...,  0.4384, -0.8988, -0.1041],
         [ 0.3817,  0.1849,  0.5021,  ...,  0.5512, -0.5971, -0.4292],
         [ 0.3433,  0.2350, -0.3063,  ...,  0.7231, -0.2712,  0.4317]]],
       grad_fn=<UnsafeViewBackward0>)


In [38]:
logits.shape

torch.Size([2, 35, 16000])

##### Preparing for Training 
######   Data need to be loaded 
######   A generator to check what the model can generate during training to see how it is progressing


In [39]:
def generate_text(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):
        
        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]
        
        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)
        
        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]  

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

import torch

def generate_predictions(model, input_data, top_k=3, temperature=1.0):
  """Generates predictions from a generative model.

  Args:
      model: The generative model to use for prediction.
      input_data: The input data for the model (tensor).
      top_k: The number of top probable predictions to return (optional, defaults to 3).
      temperature: The temperature for sampling (optional, defaults to 1.0).

  Returns:
      A list containing the top predicted tokens/classes and their probabilities.
  """
  # Ensure input data is a tensor
  if not torch.is_tensor(input_data):
    input_data = torch.tensor(input_data)

  # Get logits (unnormalized outputs) from the model
  with torch.no_grad():  # Deactivate gradient calculation for efficiency
    logits = model(input_data)

  # Apply temperature (optional)
  if temperature != 1.0:
    logits /= temperature

  # Generate different predictions based on the provided arguments:
  if top_k > 0:
    # Top-k sampling
    probs = torch.softmax(logits, dim=-1)  # Calculate probabilities
    top_k_values, top_k_indices = torch.topk(probs, top_k, dim=-1)  # Get top k probabilities and indices
    #print(top_k_indices)
    #print(top_k_values)
    predictions = []
    for i in range(len(top_k_indices)):
      prediction = []
      for j in range(top_k):
        index = top_k_indices[i][i][j].item()  # Convert tensor index to int
        value = top_k_values[i][i][j].item()  # Convert tensor value to float
        prediction.append((index, value))
      predictions.append(prediction)
    return predictions
  else:
    # Sample from the probability distribution
    probs = torch.softmax(logits, dim=-1)
    predictions = torch.multinomial(probs, 1).squeeze().tolist()  # Sample one index and convert to list
    return predictions


In [40]:
out.flatten(0,1).shape

torch.Size([70, 16000])

In [41]:
b = logits[0, -1, :]
b[0] = -1.4929
b[1] = 4.4812
b[2] = -1.6093

print(b[:3])
res=torch.softmax(b, dim=0)

tensor([-1.4929,  4.4812, -1.6093], grad_fn=<SliceBackward0>)


In [42]:
res

tensor([1.1865e-05, 4.6644e-03, 1.0562e-05,  ..., 5.6727e-05, 8.8318e-05,
        8.1311e-05], grad_fn=<SoftmaxBackward0>)

In [43]:
res.shape

torch.Size([16000])

In [44]:
start_context = "ጀጋኑ ስዉኣት ትግራይ "

encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [557, 5091, 7505, 745, 149]
encoded_tensor.shape: torch.Size([1, 5])


In [45]:
model.eval() # disable dropout

out = generate_text(
    model=model,
    idx=encoded_tensor, 
    max_new_tokens=6, 
    context_size=GPT_CONFIG["context"]
)

print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[  557,  5091,  7505,   745,   149, 10899, 15225, 10251,  6717, 11341,
         15808]])
Output length: 11


In [46]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

ጀጋኑ ስዉኣት ትግራይ  ተሪፋ ኣዐ ዝጀመር ተሽከ ወርቅን ትኣምን


### Preparing dataset for training a GPT model using a dataset

In [48]:
def combine_files_by_size(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    current_content = []
    current_size = 0
    file_counter = 1

    for file_path in file_paths:
        try:
            with open(os.path.join(DATA_PATH,file_path), "r", encoding="utf-8") as file:
                content = file.read()
        except UnicodeDecodeError:
            # Attempt to read the file with a fallback encoding
            print(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
            with open(file_path, "r", encoding=fallback_encoding) as file:
                content = file.read()
        conts=content.split("<TITRE>")
        estimated_size = len(content.encode("utf-8"))

        for cont in conts:
            if current_size + estimated_size > max_size_mb * 1024 * 1024:
                target_file_path = os.path.join(target_dir, f"combined_{file_counter}.txt")
                with open(target_file_path, "w", encoding="utf-8") as target_file:
                    target_file.write(separator.join(current_content))
                file_counter += 1
                current_content = [cont]
                current_size = estimated_size
            else:
                current_content.append(cont)
                current_size += estimated_size

    if current_content:
        target_file_path = os.path.join(target_dir, f"combined_{file_counter}.txt")
        with open(target_file_path, "w", encoding="utf-8") as target_file:
            target_file.write(separator.join(current_content))

### Calculating lost at batch level and data loader 
##### It uses cross entropy loss
##### The loss can be computed at batch levele and datasets level (train and validation loaders)

In [49]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    #print(logits.flatten(0, 1).shape, target_batch.flatten().shape)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

### Visualization of what is happening by ploting the loss and generating 10 words ginen an input text

In [51]:
def generate_and_print(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size)
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()


def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text)#, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # Add batch dimension
    return encoded_tensor


#Decoding the token ids (taking greedy algorithm to choose the probabilities) using the tokenizer
def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # Remove batch dimension
    return tokenizer.decode(flat.tolist())

# Ploting loss and token that have been seen
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, output_dir):
    fig, ax1 = plt.subplots()

    # Plot training and validation loss against epochs
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")

    # Create a second x-axis for tokens seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()  # Adjust layout to make room
    plt.savefig(output_dir / "losses.pdf")

### Creating dataloader  class for training 

In [68]:
from torch.utils.data import Dataset, DataLoader
class GPTDataset(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)#, allowed_special={'<|endoftext|>'})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True):
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    dataset = GPTDataset(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

    return dataloader

In [52]:
def read_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()
    return text_data


def create_dataloaders(text_data, train_ratio, batch_size, max_length, stride):
    split_idx = int(train_ratio * len(text_data))
    train_loader = create_dataloader(
        text_data[:split_idx],
        batch_size=batch_size,
        max_length=max_length,
        stride=stride,
        drop_last=True,
        shuffle=True
    )
    val_loader = create_dataloader(
        text_data[split_idx:],
        batch_size=batch_size,
        max_length=max_length,
        stride=stride,
        drop_last=False,
        shuffle=False
    )
    return train_loader, val_loader

In [53]:
len(content)

101193740

In [54]:
GPT_CONFIG["context"]

1024

In [55]:
lines = content.split("\n")  # or content.split(".") for sentences
lines = [line for line in lines if line.strip()]  # remove empty lines
print(f"Total lines: {len(lines)}")

Total lines: 1252695


In [57]:
def tokenized_batches(lines, tokenizer, batch_size, max_length, stride):
    batch = []
    for line in lines:
        tokens = tokenizer(
            line,
            max_length=max_length,
            truncation=True,
            stride=stride,
            return_overflowing_tokens=True,
            padding="max_length",   # ensures uniform length
            return_tensors="pt"
        )
        for t in tokens["input_ids"]:
            t = t.squeeze(0)  # remove leading 1 dimension
            batch.append(t)  # remove (1, max_length) -> (max_length)
            if len(batch) == batch_size:
                yield torch.stack(batch)  # safe now, all same size
                batch = []
    if batch:
        yield torch.stack(batch)


In [62]:
def tokenized_batches(lines, tokenizer, batch_size, max_length, stride):
    batch = []
    for line in lines:
        tokens = tokenizer(
            line,
            max_length=max_length,
            truncation=True,
            padding="max_length",   # <-- pad all sequences to max_length
            return_tensors="pt"
        )
        batch.append(tokens["input_ids"].squeeze(0))  # shape: [seq_len]
        if len(batch) == batch_size:
            yield torch.stack(batch)  # shape: [batch_size, seq_len]
            batch = []
    if batch:
        yield torch.stack(batch)

In [63]:
from torch.utils.data import DataLoader, IterableDataset

class TokenDataset(IterableDataset):
    def __init__(self, lines, tokenizer, batch_size, max_length, stride):
        self.lines = lines
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length
        self.stride = stride

    def __iter__(self):
        return iter(tokenized_batches(
            self.lines,
            self.tokenizer,
            self.batch_size,
            self.max_length,
            self.stride
        ))

train_size = int(0.9 * len(lines))
train_dataset = TokenDataset(lines[:train_size], tokenizer, batch_size=16, max_length=512, stride=256)
val_dataset = TokenDataset(lines[train_size:], tokenizer, batch_size=16, max_length=512, stride=256)

train_loader = DataLoader(train_dataset)
val_loader = DataLoader(val_dataset)


### Training the Model

In [64]:
def train_model(model, optimizer, device, n_epochs,
                eval_freq, eval_iter, print_sample_iter, start_context,
                output_dir, save_ckpt_freq,
                train_loader, val_loader):

    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen = 0
    global_step = -1
    start_time = time.time()

    try:
        for epoch in range(n_epochs):
            print(f"\nEpoch {epoch+1}/{n_epochs} — Training ...")
            model.train()

            for input_batch in train_loader:
                input_batch = input_batch.to(device)  # remove any extra dimension
                target_batch = input_batch.clone()

                optimizer.zero_grad()
                loss = calc_loss_batch(input_batch, target_batch, model, device)

                loss.backward()
                optimizer.step()

                tokens_seen += input_batch.numel()
                global_step += 1

                # Evaluation step
                if global_step % eval_freq == 0:
                    train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                    train_losses.append(train_loss)
                    val_losses.append(val_loss)
                    track_tokens_seen.append(tokens_seen)
                    print(f"Ep {epoch+1} (Step {global_step}): "
                          f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

                # Generate text passage occasionally
                if global_step % print_sample_iter == 0:
                    generate_and_print(model, train_loader.dataset.tokenizer, device, start_context)

                # Save model checkpoint periodically
                if global_step % save_ckpt_freq == 0:
                    file_name = output_dir / f"model_pg_{global_step}.pth"
                    torch.save(model.state_dict(), file_name)
                    print(f"Saved checkpoint: {file_name}")

    except KeyboardInterrupt:
        print("Forcefully stopped. Saving interrupted checkpoint...")
        file_name = output_dir / f"model_pg_{global_step}_interrupted.pth"
        torch.save(model.state_dict(), file_name)
        print(f"Saved: {file_name}")

    return train_losses, val_losses, track_tokens_seen


In [65]:
n_epochs=1
print_sample_iter=500
eval_freq=50
save_ckpt_freq=100000
lr=5e-4
batch_size=8

In [66]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.1)

output_dir = Path("results")
output_dir.mkdir(parents=True, exist_ok=True)

train_losses, val_losses, tokens_seen = train_model(
    model=model,
    optimizer=optimizer,
    device=device,
    n_epochs=n_epochs,
    eval_freq=eval_freq,
    eval_iter=1,
    print_sample_iter=print_sample_iter,
    output_dir=output_dir,
    save_ckpt_freq=save_ckpt_freq,
    start_context=start_context,
    train_loader=train_loader,
    val_loader=val_loader
)


epochs_tensor = torch.linspace(0, n_epochs, len(train_losses))

print("debug", epochs_tensor)
#print(tokens_seen)
#print(train_losses, val_losses)
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses, output_dir)

torch.save(model.state_dict(), output_dir / "model_Tig_gpt.pth")
print(f"Maximum GPU memory allocated: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")


Epoch 1/1 — Training ...


ValueError: too many values to unpack (expected 2)

### Loading Already trained model

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(123)

<torch._C.Generator at 0x7fc0c509f510>

In [270]:
model = GPTModel(GPT_CONFIG)
model.to(device)
model.load_state_dict(torch.load("results/model_pg_final.pth"))

<All keys matched successfully>

In [292]:
model1=GPTModel1(GPT2Config)
model1.to(device)
model1.from_pretrained(torch.load("results/model_pg_final.pth"))
model1.eval()

TypeError: PreTrainedModel.__init__() missing 1 required positional argument: 'config'

In [271]:
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAtten

In [273]:
torch.save(model,"results/pytorch_model_gpt2.pt")

In [71]:
prompt="J'ai cancer"

In [72]:
generate_and_print(model,tokenizer=tokenizer,device=device,start_context=prompt)

J'ai cancer du sein. <TITRE> Le rôle du pharmacien d'officine dans la prise en charge des patients atteints de cancer du sein. Sein. Conseil à l'


In [73]:
encoded = text_to_token_ids("J'ai cancer", tokenizer=tokenizer).to(device)
with torch.no_grad():
    token_ids = generate_text(model=model,
                            idx=encoded,
                            max_new_tokens=5, context_size=1024)
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))

J'ai cancer du sein. 


In [74]:
predi=generate_predictions(model, input_data=encoded, top_k=3, temperature=1.0)

In [75]:
predi[0][0][0]

13

In [76]:
len(predi[0])

3

### Puting some temperature and top k predictions of the model

In [80]:
input_text="J'ai cancer"
encoded = text_to_token_ids(input_text, tokenizer=tokenizer).to(device)
predi=generate_predictions(model, input_data=encoded, top_k=3, temperature=1.0)
predi=torch.Tensor(predi)
print(input_text)
for i in range(len(predi[0])):
    token_ids=[int(predi[0][i][0].item())]
    score=predi[0][i][1].item()
    #decoded_text = token_ids_to_text(token_ids, tokenizer)
    decoded_text = tokenizer.decode(token_ids=token_ids)
    print(decoded_text.replace("\n", " "),"-->",score)
input_text=input_text+decoded_text
print(input_text)
encoded = text_to_token_ids(input_text, tokenizer=tokenizer).to(device)
predi=generate_predictions(model, input_data=encoded, top_k=3, temperature=1.0)
predi=torch.Tensor(predi)

J'ai cancer
. --> 0.051486603915691376
, --> 0.047721248120069504
 à --> 0.017686987295746803
J'ai cancer à


In [211]:
def predict(model, prompt, top_k=3):
    
    # Encode the prompt text and convert to input tensors
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    #input_ids.to(device)
    # Get logits of the last token
    with torch.no_grad():  # No need to compute gradients
        outputs = model(input_ids).to(device)
        predictions = outputs[:, -1, :]

    #print(predictions)
    # Get the top k tokens and their log probabilities
    predictions=torch.softmax(outputs[:, -1, :],dim=-1)
    top_k_values, top_k_indices = torch.topk(predictions, top_k)
    top_k_log_probs = top_k_values[0].tolist()  # Convert to list
    #top_k_probs = torch.exp(top_k_values[0]).tolist()
    top_k_tokens = [tokenizer.decode(index).strip() for index in top_k_indices[0].tolist()]

    #print(top_k_values[0])
    # Combine tokens with their log probabilities
    top_k_predictions = list(zip(top_k_tokens,top_k_log_probs))

    return top_k_predictions


top_k_predictions = predict(model=model,prompt=prompt, top_k=3)

print("Top k predictions:")
for token, log_prob in top_k_predictions:
    print(f"Token: {token}, Probability score: {log_prob}")

Top k predictions:
Token: du, Probability score: 0.17419902980327606
Token: ., Probability score: 0.14047178626060486
Token: 2014, Probability score: 0.05693987011909485


In [209]:
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
#input_ids.to(device)
# Get logits of the last token
with torch.no_grad():  # No need to compute gradients
    outputs = model(input_ids).to(device)
    predictions = outputs[:, -1, :]

In [127]:
import torch.nn.functional as F

In [119]:
predictions

tensor([[-0.1263,  1.0577, -3.0049,  ..., -4.9220, -9.1450, -8.5778]],
       device='cuda:0')

In [141]:
o=output[-1]

In [139]:
ap=F.log_softmax(output,dim=-1)

In [197]:
aa=torch.softmax(output[:-1:],dim=-1)

In [200]:
torch.topk(aa,3)

torch.return_types.topk(
values=tensor([[[0.0070, 0.0048, 0.0047],
         [0.0045, 0.0045, 0.0044],
         [0.0046, 0.0044, 0.0040],
         [0.0051, 0.0036, 0.0035]]], grad_fn=<TopkBackward0>),
indices=tensor([[[536, 637, 110],
         [548, 624, 692],
         [584, 160, 428],
         [ 33, 189, 544]]]))

In [143]:
torch.topk(F.log_softmax(o,dim=-1),3)

torch.return_types.topk(
values=tensor([[-4.7007, -4.8994, -5.1059],
        [-5.3264, -5.3997, -5.5259],
        [-5.3318, -5.3478, -5.4549],
        [-5.6147, -5.6250, -5.7012]], grad_fn=<TopkBackward0>),
indices=tensor([[338, 195, 344],
        [ 42, 191,  80],
        [338, 199, 691],
        [ 55, 131, 375]]))

In [219]:
def beam_search(model,input_text, beam_width=3, max_length=10):

    beam=[(input_text,0.0)]

    for i in range(max_length):
        candidates=[]

        for seq, score in beam:
            print(seq,":",score)
            predictions=predict(model=model,prompt=seq,top_k=beam_width)

            for word, prob in predictions:

                new_seq=seq+" "+word
                new_score=score+prob

                candidates.append((new_seq,new_score))
    
        beam=sorted(candidates,key=lambda x:x[1], reverse=True)[:beam_width]

    return beam

In [220]:
beams=beam_search(model=model,input_text=prompt)

J'ai cancer : 0.0
J'ai cancer du : 0.08337871730327606
J'ai cancer 2014 : 0.06458677351474762
J'ai cancer . : 0.05127895623445511
J'ai cancer du se : 0.6106284111738205
J'ai cancer .  : 0.4901657775044441
J'ai cancer 2014 . : 0.4407401531934738
J'ai cancer du se in : 1.6047788709402084
J'ai cancer 2014 .  : 0.7916611582040787
J'ai cancer .   : 0.7541298344731331
J'ai cancer du se in clus : 2.177734300494194
J'ai cancer du se in situ : 1.6499392613768578
J'ai cancer du se in ut : 1.623113626614213
J'ai cancer du se in clus ant : 2.422809660434723
J'ai cancer du se in clus . : 2.2979949191212654
J'ai cancer du se in situ ant : 2.2666399106383324
J'ai cancer du se in clus ant é : 3.156162738800049
J'ai cancer du se in clus .  : 2.6820903792977333
J'ai cancer du se in situ ant é : 2.605286620557308
J'ai cancer du se in clus ant é vol : 3.356650948524475
J'ai cancer du se in clus ant é lev : 3.3375744968652725
J'ai cancer du se in clus ant é ta : 3.238456517457962
J'ai cancer du se in clus 

In [218]:
for seq, score in beams:
    print(f"Prediction: {seq}, Score: {score/10}")

Prediction: J'ai cancer du se in clus .  L ' object if, Score: 0.5472464963793755
Prediction: J'ai cancer du se in clus .  N ous , , Score: 0.47964938059449197
Prediction: J'ai cancer du se in clus .  L ' é val, Score: 0.46785239726305006


In [222]:
from transformers import GPT2LMHeadModel, GPT2Config

In [235]:
GPT_CONFIG

{'vocab_size': 50257,
 'context': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'dropout_rate': 0.1,
 'qkv_bias': False}

In [241]:
config_args={'gpt2':dict(n_layer=12, n_head=12, n_embd=768)}
config_args['vocab_size'] = GPT_CONFIG["vocab_size"] # always 50257 for GPT model checkpoints
config_args['block_size'] = GPT_CONFIG["context"] # always 1024 for GPT model checkpoints
config_args['bias'] = True
config_args["emb_dim"]=GPT_CONFIG["emb_dim"]
config_args['dropout']=GPT_CONFIG["dropout_rate"]

In [293]:
config=GPT2Config.from_pretrained('gpt2',output_hidden_states=False)

In [294]:
print(config)

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 50257
}



In [228]:
#model.load_state_dict(torch.load("results/model_pg_final.pth",map_location='cpu'))

In [301]:
import torch
from transformers import GPT2LMHeadModel, GPT2Config

def convert_local_gpt2_to_hf(model_path, hf_model_name):
  """Converts a locally saved PyTorch GPT2 model to a Hugging Face Transformers model.

  Args:
      model_path: Path to the directory containing the saved PyTorch model weights (.pt file).
      hf_model_name: Name for the Hugging Face Transformers model (e.g., "converted_gpt2").
  """

  # Load model data
  loaded_data = torch.load(f"{model_path}/pytorch_model.pt")

  # Access the state dictionary (if it exists)
  model_state_dict = loaded_data.get('model_state_dict', None)

  if model_state_dict is not None:
      # Get model config (assuming it's a GPT2 model)
      config=GPT2Config(
        context=GPT_CONFIG["context"],
        emb_dim=GPT_CONFIG["emb_dim"],
        dropout_rate=0.1,
        qkv_bias=False)
      # Pretrained config as a base

      # Create new Hugging Face Transformers model
      hf_model = GPT2LMHeadModel(config)

      # Load state dict into Hugging Face model (potentially with key remapping)
      hf_model.load_state_dict(model_state_dict, strict=False)

      # Save Hugging Face Transformers model
      hf_model.save_pretrained(hf_model_name)
      print(f"Converted PyTorch model to Hugging Face Transformers model at: {hf_model_name}")
  else:
      print("Error: Could not find 'model_state_dict' in loaded data.")


In [265]:
loaded_data = torch.load(pytorch_model_path)

# Access the state dictionary (if it exists)
model_state_dict = loaded_data.get('model_state_dict', None)

In [303]:
# Example usage:
pytorch_model_path = "/home/aberhe/Projects/SANTAL/Course/notebook/results/model_pg_final.pth"
hf_model_name = "converted_gpt2"
#convert_gpt2_pth_to_hf(pytorch_model_path, hf_model_name)

In [315]:
# Load the custom model's state dictionary
state_dict = torch.load(pytorch_model_path, map_location='cpu')

# Load the configuration of GPT-2. Adjust this if you've used a different configuration
config = GPT2Config.from_pretrained('gpt2')

# Initialize the Hugging Face model with this configuration
model_hf = GPT2LMHeadModel(config)

# Update the Hugging Face model with your custom weights
# Ensure the keys in state_dict match those expected by GPT2LMHeadModel; adjustments might be necessary
model_hf.load_state_dict(state_dict, strict=False)

# Save the Hugging Face model in its expected format
model_hf.save_pretrained('./my_huggingface_gpt2_model')


In [316]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.save_pretrained('./my_huggingface_gpt2_model')

('./my_huggingface_gpt2_model/tokenizer_config.json',
 './my_huggingface_gpt2_model/special_tokens_map.json',
 './my_huggingface_gpt2_model/vocab.json',
 './my_huggingface_gpt2_model/merges.txt',
 './my_huggingface_gpt2_model/added_tokens.json')

In [339]:
model_hf = GPT2LMHeadModel.from_pretrained('my_huggingface_gpt2_model')
tokenizer = GPT2Tokenizer.from_pretrained('my_huggingface_gpt2_model')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize input while ensuring padding and attention mask are correctly used
input_text = "J'ai cancer"
input_ids = tokenizer.encode(input_text, return_tensors="pt", padding=True, truncation=True, max_length=1024)
attention_mask = (input_ids != tokenizer.pad_token_id).long()

# Generate text with specified pad_token_id and attention_mask
outputs = model_hf.generate(input_ids, attention_mask=attention_mask, pad_token_id=tokenizer.pad_token_id, max_length=10)

# Decode and print the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

J'ai cancerhillaryhillaryhillary Fear Fear Fear


In [267]:
model_state_dict is not None

False

In [277]:
  model_state_dict = torch.load(pytorch_model_path)

In [281]:
model_state_dict.state_dict.keys()

AttributeError: 'function' object has no attribute 'keys'

In [296]:
GPT2Config()

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [299]:
config=GPT2Config(
        context=GPT_CONFIG["context"],
        emb_dim=GPT_CONFIG["emb_dim"],
        dropout_rate=0.1,
        qkv_bias=False

    )

In [300]:
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "context": 1024,
  "dropout_rate": 0.1,
  "emb_dim": 768,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "qkv_bias": false,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [298]:
GPT_CONFIG

{'vocab_size': 50257,
 'context': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'dropout_rate': 0.1,
 'qkv_bias': False}

In [324]:
from collections import OrderedDict

In [5]:
def pytorch_model_to_huggingface_checkpoint(model_name, new_name = None):
    """Creates the Huggingface save, easily used for other purposes
    The files are saved in the path chosen in the config file.

    Args:
        checkpoint (str): name of the checkpoint
        model_name (str): name of the future huggingface save
    """

    config=GPT2Config.from_pretrained('gpt2')
    model_hf = GPT2LMHeadModel(config)
    
    state_dict = torch.load(model_name, map_location='cpu')
    """
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k#[7:] # remove module.
        print(name)
        new_state_dict[name] = v
    # load params
    model_hf.load_state_dict(state_dict=state_dict)
    """
    model_hf.load_state_dict(state_dict, strict=False)
    if new_name==None:
        model_hf.save_pretrained("my_huggingface_gpt2_model")
    else:
        model_hf.save_pretrained(new_name)
    
    return model_hf

In [6]:
model_hf=pytorch_model_to_huggingface_checkpoint(model_name=pytorch_model_path,new_name="gpt2_hf")

NameError: name 'pytorch_model_path' is not defined

In [347]:
input_text = "J'ai cancer"
input_ids = tokenizer.encode(input_text, return_tensors="pt", padding=True, truncation=True, max_length=1024)
attention_mask = (input_ids != tokenizer.pad_token_id).long()

# Generate text with specified pad_token_id and attention_mask
outputs = model_hf.generate(input_ids, attention_mask=attention_mask, pad_token_id=tokenizer.pad_token_id, max_length=10)

# Decode and print the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

J'ai cancer vivo vivo vivo vivo vivo vivo


In [7]:
from transformers import RobertaForMaskedLM, RobertaConfig

In [11]:
args={
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": True,
  "vocab_size": 1600
}

In [14]:
config = RobertaConfig(
            vocab_size = args["vocab_size"],
            max_position_embeddings = args["max_position_embeddings"],
            hidden_size = args["hidden_size"],
            num_attention_heads = args["num_attention_heads"],
            num_hidden_layers = args["num_hidden_layers"]
            )

In [15]:
m=RobertaForMaskedLM(config)

In [16]:
m

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(1600, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L