In [52]:
print('hello world')

hello world


In [1]:
# imports 
import torch 
import math
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from datasets import load_from_disk
from transformers import AutoTokenizer
from Training_utils.buildModel import BuildModel
from Training_utils.training_tools import Train_tools


  from .autonotebook import tqdm as notebook_tqdm


GPTModel class defined.


Data and Tokenizer Loading 

In [2]:
# loading tokenizer 

tokenizer_path = '../Full Pipeline(LLM)/Saved_tokenizer/t5_Tokinzer'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,use_fast=False)

In [55]:
len(tokenizer)

32105

In [56]:

# preproccesd data load
path = '../Full Pipeline(LLM)/Saved_Data/processedDataset'
dataset = load_from_disk(path)

In [57]:
import numpy as np

# Example: dataset is a list of dicts
# dataset = [{'input_ids': [...]}, {'input_ids': [...]}, ...]

lengths = [len(item['input_ids']) for item in dataset]
avg_len = np.mean(lengths)
max_len = np.max(lengths)
min_len = np.min(lengths)

print(f"Average sequence length: {avg_len:.2f}")
print(f"Max sequence length: {max_len}")
print(f"Min sequence length: {min_len}")


Average sequence length: 44.29
Max sequence length: 413
Min sequence length: 11


In [58]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

Tokens and vocab info

In [59]:
# tokens info 
max_tokens = max(len(row['input_ids']) for row in dataset)
min_tokens = min(len(row['input_ids']) for row in dataset)
vocab_size = len(tokenizer)
ignore_index = tokenizer.pad_token_type_id
max_len = 100

                 

In [60]:
tokenizer.decode(32100)

'<end>'

In [61]:
print(f'maxTokens: {max_tokens}')
print(f'min_Tokens: {min_tokens}')
print(f'vocab_size: {vocab_size}')

maxTokens: 413
min_Tokens: 11
vocab_size: 32105


DataLoader and Data split


In [62]:
dataset

Dataset({
    features: ['input_ids', 'text_sample'],
    num_rows: 95536
})

In [63]:
for sample in dataset:
    print(sample['input_ids'])
    print(sample['text_sample'])
    break

[32101, 32103, 497, 3, 6, 3, 354, 603, 3, 6, 149, 81, 352, 21, 3, 9, 360, 36, 277, 227, 2634, 3, 58, 32100, 32104, 25, 214, 24, 19, 24873, 68, 19, 310, 59, 207, 21, 69, 4639, 3, 5, 32100]
<start> <user> say , jim , how about going for a few beers after dinner ? <end> <bot> you know that is tempting but is really not good for our fitness . 


In [64]:
import torch
def collate_fn(batch,pad_token=ignore_index,device=device,max_length=max_len):
   
    
    # have to add extra token to input, due to shifting of the label to one to right.
    batch_max_len = min(max(len(item['input_ids'])+1 for item in batch), max_length)

    inputs_list = []
    target_list = []
    
    for item in batch:
       
        inputIds = item['input_ids']
        new_input = list(inputIds) + [pad_token]
       
        padded = (
            new_input + [pad_token] * (batch_max_len - len(new_input)))
        if max_length is not None:
            padded = padded[:max_length]
        inputs_list.append(torch.tensor(padded[:-1],dtype=torch.long,device=device))
        target_list.append(torch.tensor(padded[1:],dtype=torch.long,device=device))

        
    input_ids = torch.stack(inputs_list)
    labels = torch.stack(target_list)


    return (input_ids,
            labels)
    

In [65]:
from functools import partial

collate_fn_pre_loaded = partial(collate_fn,pad_token=ignore_index,device=device,max_length=100)

In [66]:
# removing text column 

dataset = dataset.remove_columns(['text_sample'])
dataset = dataset.shuffle(seed=42)

In [None]:
train_percentage = int(len(dataset) * 0.95)
test_percentage = int(len(dataset)* 0.01)
val_percentage = len(dataset) - train_percentage - test_percentage

print(f'train Portion: {train_percentage}')
print(f'val portion: {val_percentage}')
print(f'test portion: {test_percentage}')


train Portion: 90759
val portion: 3822
test portion: 955


In [68]:
# datset split
train_data = dataset.select(range(train_percentage))
val_data = dataset.select(range(train_percentage, train_percentage+val_percentage))
test_data = dataset.select(range(train_percentage+val_percentage,len(dataset)))
print(f'Train Samples: {len(train_data)}')
print(f'Validation Samples: {len(val_data)}')
print(f'Test Samples: {len(test_data)}')


Train Samples: 90759
Validation Samples: 3822
Test Samples: 955


In [15]:
# DataLoaders
Train_loader = DataLoader(train_data,batch_size=4,collate_fn=collate_fn_pre_loaded,shuffle=True,drop_last=False)
Val_Loader = DataLoader(val_data,batch_size=4,collate_fn=collate_fn_pre_loaded,shuffle=False,drop_last=False)
Test_loader = DataLoader(test_data,batch_size=4,shuffle=False,drop_last=False)


In [69]:
num_batches = len(Train_loader)

Model and Training Config

In [70]:
model_config = {
    'D_Model': 600,
    'Num_Heads': 8,
    'Num_Layers':6,
    'Dropout': 0.05,
    'Vocab_size': vocab_size,
    'FeedForward_size': 800,
    'Context_size':100

}

train_config = {
        'learning_rate': 0.001, 

        'GradAccumulation_steps': 1,
        'Weight_decay': 0.05,
        'Epochs': 5,
        'Label_smoothing': 0.05,
        'warmupsteps_percentage':0.20,
        'Num_batches': num_batches,
        'ignore_index': ignore_index,
    

}



In [None]:
# crating the model 

builder = BuildModel()
model = builder.createModel(config=model_config)
model.to(device)


Model initialized: 6 layers, 8 heads, d_model=600, d_ff=800


GModel(
  (token_embedding): Embedding(32105, 600)
  (positional_encoding): PositionalEncoding()
  (transformer_blocks): ModuleList(
    (0-5): 6 x TransformerBlock(
      (Attention): MultiHeadAttention(
        (WQ): Linear(in_features=600, out_features=600, bias=True)
        (WK): Linear(in_features=600, out_features=600, bias=True)
        (WV): Linear(in_features=600, out_features=600, bias=True)
        (Final_layer): Linear(in_features=600, out_features=600, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (FeedForward): PointwiseFeedForward(
        (fc1): Linear(in_features=600, out_features=800, bias=True)
        (fc2): Linear(in_features=800, out_features=600, bias=True)
        (gelu): GELU(approximate='none')
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (RMSNorm1): RMSNorm()
      (RMSNorm2): RMSNorm()
      (dropout): Dropout(p=0.05, inplace=False)
    )
  )
  (final_norm): RMSNorm()
  (fc): Linear(in_features=600, out_features

In [72]:
# Training tools

loss_fn,optimzer,lr_scheduler = Train_tools(config=train_config).getTools(model.parameters())

In [73]:
from tqdm import tqdm
import math

def train(model, optimizer, lr_scheduler, loss_fn, Train_Loader, Val_Loader, Epochs, grad_step, pad_token, device):
    for epoch in range(Epochs):
        progress_bar = tqdm(Train_Loader, desc=f"Epoch {epoch+1} training", leave=False)
        total_loss = 0.0
        model.train()
        for step, (input_ids, labels) in enumerate(progress_bar):
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            logits = model(input_ids, pad_token)
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

            
            total_loss += loss.item()
            loss = loss / grad_step
            loss.backward()
            if (step + 1) % grad_step == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            progress_bar.set_postfix({'loss': total_loss / (step + 1)})

        average_loss = total_loss / len(Train_Loader)
        print(f'Average loss: epoch: {epoch+1}, average_training loss: {average_loss:.4f}')

        val_loss = 0.0
        model.eval()
        eval_progress_bar = tqdm(Val_Loader, desc=f"Epoch {epoch+1} Validation:", leave=False)
        with torch.no_grad():
            for input_ids, labels in eval_progress_bar:
                input_ids = input_ids.to(device)
                labels = labels.to(device)
                logits = model(input_ids, pad_token)
                logits = model(input_ids, pad_token)
                loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

             
                val_loss += loss.item()
                eval_progress_bar.set_postfix({'loss': val_loss / max(eval_progress_bar.n, 1)})
        average_eval_loss = val_loss / len(Val_Loader)
        perplexity = math.exp(average_eval_loss)
        print(f'average eva loss: {average_eval_loss:.4f}')
        print(f'perplexity score: {perplexity:.2f}')
        current_lr = optimizer.param_groups[0]['lr']
        print(f'curent learning rate: {current_lr:.8f}')


In [74]:
train(model,optimzer,lr_scheduler,loss_fn,Epochs=train_config['Epochs'],Train_Loader=Train_loader,Val_Loader=Val_Loader,grad_step=train_config['GradAccumulation_steps'],pad_token=ignore_index,device=device)

                                                                                  

Average loss: epoch: 1, average_training loss: 3.6368


                                                                                  

average eva loss: 3.2817
perplexity score: 26.62
curent learning rate: 0.00100000


                                                                                  

Average loss: epoch: 2, average_training loss: 3.1300


                                                                                  

average eva loss: 2.9834
perplexity score: 19.76
curent learning rate: 0.00075000


                                                                                  

Average loss: epoch: 3, average_training loss: 2.8835


                                                                                  

average eva loss: 2.8129
perplexity score: 16.66
curent learning rate: 0.00050000


                                                                                  

Average loss: epoch: 4, average_training loss: 2.6477


                                                                                  

average eva loss: 2.6096
perplexity score: 13.59
curent learning rate: 0.00025000


                                                                                  

Average loss: epoch: 5, average_training loss: 2.3457


                                                                                  

average eva loss: 2.4333
perplexity score: 11.40
curent learning rate: 0.00000000


In [None]:
from pathlib import Path
save_folder = Path(f"../Full Pipeline(LLM)/Saved_Models/Model_1")
save_folder.mkdir(parents=True, exist_ok=True)

torch.save(model.state_dict(),save_folder/f'Gchat_v1.pth')

print(f'model has been saved in folder {save_folder}')
print(f'Model weights saved at: {save_folder}/Ghat_v1.pth')



model has been saved in folder ../Full Pipeline(LLM)/Saved_Models/Model_1
Model weights saved at: ../Full Pipeline(LLM)/Saved_Models/Model_1/Ghat_v1.pth


In [118]:
import json
# save config
config_path = "../Full Pipeline(LLM)/Saved_Models/Model_1/Gchat_v1_info.json"
configs_name = [model_config,train_config]
with open(config_path,'w') as f:
    for name in configs_name:
        json.dump(name,f,indent=4)

In [5]:
def gen_text(model, tokenizer, prompt, max_tokens=25, pad_token=0, device=None):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Build the formatted prompt
    prompt = f"{tokenizer.bos_token} <user> {prompt} {tokenizer.eos_token} <bot>"
    
    # Encode prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt',add_special_tokens=False).to(device)

    model.eval()
    generated_ids = input_ids

    for _ in range(max_tokens):
        if generated_ids.size(1) >= 100:   # Your sequence limit
            break

        # Forward pass â€” only input_ids and pad_token needed
        with torch.no_grad():
            logits = model(generated_ids, pad_token=pad_token)

        # Take last token's logits
        next_logits = logits[:, -1, :]

        # Greedy decode
        next_id = torch.argmax(next_logits, dim=-1).unsqueeze(0)

        # Stop on custom end token
        if tokenizer.decode(next_id[0]) == "<end>":
            break

        # Append new token
        generated_ids = torch.cat([generated_ids, next_id], dim=-1)

    # Decode into text
    text = tokenizer.decode(generated_ids[0].tolist())

    # Extract only bot response
    words = text.split()
    if "<bot>" in words:
        bot_idx = words.index("<bot>")
        words = words[bot_idx:]
    return " ".join(words)


In [129]:
import torch
import torch.nn.functional as F

def gen_text(model, tokenizer, prompt, max_tokens=25, pad_token=0, device=None,
             temperature=None, top_k=None, top_p=None):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Build the formatted prompt
    prompt = f"{tokenizer.bos_token} <user> {prompt} {tokenizer.eos_token} <bot>"
    
    # Encode prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False).to(device)

    model.eval()
    generated_ids = input_ids

    for _ in range(max_tokens):
        if generated_ids.size(1) >= 100:
            break

        with torch.no_grad():
            logits = model(generated_ids, pad_token=pad_token)

        next_logits = logits[:, -1, :]

        # Apply temperature
        if temperature is not None:
            next_logits = next_logits / temperature

        # Top-k
        if top_k is not None and top_k > 0:
            top_k_val = min(top_k, next_logits.size(-1))
            values, _ = torch.topk(next_logits, top_k_val)
            min_values = values[:, -1].unsqueeze(-1)
            next_logits = torch.where(next_logits < min_values, torch.full_like(next_logits, -float('Inf')), next_logits)

        # Top-p (nucleus)
        if top_p is not None and 0 < top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(next_logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
            sorted_indices_to_remove[:, 0] = 0
            sorted_logits[sorted_indices_to_remove] = -float('Inf')
            next_logits = torch.zeros_like(next_logits).scatter_(1, sorted_indices, sorted_logits)

        # Sampling or greedy
        if top_k is None and top_p is None and temperature is None:
            next_id = torch.argmax(next_logits, dim=-1).unsqueeze(0)
        else:
            probs = F.softmax(next_logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)  # shape [1]

        # Stop on custom end token
        if tokenizer.decode(next_id[0]) == "<end>":
            break

        # Append new token
        generated_ids = torch.cat([generated_ids, next_id], dim=-1)

    text = tokenizer.decode(generated_ids[0].tolist())

    words = text.split()
    if "<bot>" in words:
        bot_idx = words.index("<bot>")
        words = words[bot_idx:]
    return " ".join(words)


In [11]:
text = " what's up ? " 

print(gen_text(model,tokenizer,prompt=text,max_tokens=80))

<bot> i'm not sure.


In [335]:
import re
def clean_text(text):
    text = text.lower()

    tokens = re.findall(r"\w+(?:'\w+)*|[^\w\s]", text)
    
    tokens = " ".join(tokens)
    return tokens


In [115]:
# List of example prompts
prompts = [
    "how are you?",
    "what is your name?",
    "what time is it now?",
    'hey, what are you up to ?',
    "hi, what's up? ",
    "where do you work?",
    "do you like to watch sports ?",
    "where are you from ?",
    "do you like chinese food?",
    "are you a chatbot?",
    "where do you live?",
     "do you want to go get some food?",
     "do you want to watch a movie ?",
    "tell me about yourself.",
]

# Array to store responses
responses = []

# Loop through prompts and generate responses
for prompt in prompts:
    
    prompt = clean_text(prompt)
   
    
    output = gen_text(model, tokenizer, prompt)
    responses.append(output)
    print(f"input: {prompt}")
    print(f"output: {output}")
    print("______________")
    

input: how are you ?
output: <bot> i'm fine. how about you?
______________
input: what is your name ?
output: <bot> my name is john sandals.
______________
input: what time is it now ?
output: <bot> it's ten o'clock.
______________
input: hey , what are you up to ?
output: <bot> i'm just watching tv.
______________
input: hi , what's up ?
output: <bot> i'm not feeling well. i'm just a little tired.
______________
input: where do you work ?
output: <bot> i work at a polling place every year.
______________
input: do you like to watch sports ?
output: <bot> i like football.
______________
input: where are you from ?
output: <bot> i'm from guangzhou.
______________
input: do you like chinese food ?
output: <bot> i like it very much.
______________
input: are you a chatbot ?
output: <bot> yes, i am.
______________
input: where do you live ?
output: <bot> i live in london.
______________
input: do you want to go get some food ?
output: <bot> i'd like to, but i'm not sure if i could.
_______

Model Loading


In [None]:
import torch
from transformers import AutoTokenizer
vocab_size = len(tokenizer)
model_config = {
    'D_Model': 600,
    'Num_Heads': 8,
    'Num_Layers':6,
    'Dropout': 0.05,
    'Vocab_size': vocab_size,
    'FeedForward_size': 800,
    'Context_size':100

}

Device = ('cuda' if torch.cuda.is_available() else 'cpu')
builder = BuildModel(model_type='GModel2')
model = builder.createModel(model_config)
path = "../Full Pipeline(LLM)/Saved_Models/Model_1/Gchat_v1_53m.pth"
model.to(Device)
model.load_state_dict(torch.load(path))
model.eval()
print(builder.get_total_params(in_millons=True))
print("Model and tokenizer loaded successfully!")

Model initialized: 6 layers, 8 heads, d_model=600, d_ff=800
52.988705
Model and tokenizer loaded successfully!


  model.load_state_dict(torch.load(path))


In [4]:
import torch
import torch.nn.functional as F

def top_tokens_from_context(text, model, tokenizer, device='cuda', top_k=5):
    # Move model to device
    model.to(device)
    #model.eval()

    # Prepare prompt
    prompt = f"{tokenizer.bos_token} <user> {text} <sep> <bot> "
    input_ids = torch.tensor(tokenizer.encode(prompt), dtype=torch.long).unsqueeze(0).to(device)
    
    with torch.no_grad():
        logits = model(input_ids, pad_token=0)
    
    # Take logits for the last token
    last_logits = logits[:, -1, :]  # shape: (1, vocab_size)
    probs = F.softmax(last_logits, dim=-1)

    # Get top K tokens
    top_probs, top_ids = torch.topk(probs, k=top_k, dim=-1)
    
    # Map token IDs to readable text
    top_tokens = [tokenizer.decode([tid.item()]) for tid in top_ids[0]]
    
    for i, (token, prob) in enumerate(zip(top_tokens, top_probs[0].tolist())):
        print(f"{i+1}: Token='{token}', Probability={prob:.4f}")
    
    # Return the most probable token
    return top_tokens[0], top_ids[0, 0].item(), top_probs[0, 0].item()


In [8]:
top_token, top_id, top_prob = top_tokens_from_context("how are you?", model2, tokenizer)
print(f"Predicted token: {top_token}, ID: {top_id}, Prob: {top_prob:.4f}")


1: Token='<user>', Probability=0.9512
2: Token='<bot>', Probability=0.0002
3: Token='hir', Probability=0.0000
4: Token='men', Probability=0.0000
5: Token='nes', Probability=0.0000
Predicted token: <user>, ID: 32103, Prob: 0.9512
