<a href="https://colab.research.google.com/github/githubramkiran/LanggraphAgent/blob/main/gptmodel_tokenizer_dataset_loader_train_test_langg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

GPT model with a BPE tokenizer, dataset loader, training loop, testing loop, and LangGraph integration code.


In [None]:
!pip install langchain langgraph

Collecting langgraph
  Downloading langgraph-1.0.6-py3-none-any.whl.metadata (7.4 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading sqlalchemy-2.0.45-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Collecting langgraph-checkpoint<5.0.0,>=2.1.0 (from langgraph)
  Downloading langgraph_checkpoint-4.0.0-py3-none-any.whl.metadata (4.9 kB)
Collecting langgraph-prebuilt<1.1.0,>=1.0.2 (from langgraph)
  Downloading langgraph_prebuilt-1.0.6-py3-none-any.whl.metadata (5.2 kB)
Collecting langgraph-sdk<0.4.0,>=0.3.0 (from langgraph)
  Downloading langgraph_sdk-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting ormsgpack>=1.12.0 (from langgraph-checkpoint<5.0.0,>=2.1.0->langgraph)
  Downloading ormsgpack-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
INFO: pip is looking at multiple versions of langgraph-prebuilt to determine which version is compatible with other requirements. This could take a

In [None]:
# Byte-level Byte Pair Encoding (BPE) tokenizer (GPT-2 style)
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers
from tokenizers.pre_tokenizers import Whitespace  # no longer used in this cell; existing import kept

# Byte-level pre-tokenization keeps the whitespace inside the tokens, and the
# matching ByteLevel decoder turns them back into text. The earlier setup
# (Whitespace pre-tokenizer, no decoder) made decode() join sub-word pieces
# with spaces, so "What" came back as "W h at".
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()

# Train on a text file. Seeding the alphabet with every byte-level symbol
# lets any input be encoded without falling back to [UNK].
trainer = trainers.BpeTrainer(
    vocab_size=50000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
tokenizer.train(["/kaggle/input/langchain/langchain.txt"], trainer)
tokenizer.save("gpt_tokenizer.json")







In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    """Sliding-window next-token dataset built from one text file.

    The whole file is tokenized once. Item ``i`` is the pair ``(x, y)`` where
    ``x`` is ``block_size`` consecutive token ids starting at position ``i``
    and ``y`` is the same window shifted one position to the right.

    Args:
        text_path: Path of the UTF-8 text file to read.
        tokenizer: Object whose ``encode(text)`` result exposes an ``ids``
            sequence of integer token ids.
        block_size: Number of tokens in each input window.
    """

    def __init__(self, text_path, tokenizer, block_size):
        # Explicit encoding so the result does not depend on the platform default.
        with open(text_path, 'r', encoding="utf-8") as f:
            text = f.read()
        # Convert once; windows are then cheap slices instead of a new
        # list-to-tensor conversion on every __getitem__ call.
        self.tokens = torch.tensor(tokenizer.encode(text).ids, dtype=torch.long)
        self.block_size = block_size

    def __len__(self):
        # A corpus shorter than block_size + 1 tokens has no complete window;
        # clamp so len() never goes negative.
        return max(0, len(self.tokens) - self.block_size)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for window ``idx``; raise IndexError when out of range."""
        n_items = len(self)
        if idx < 0:
            idx += n_items
        if not 0 <= idx < n_items:
            # Without this check an out-of-range index silently produced
            # truncated windows and plain iteration never terminated.
            raise IndexError(f"index {idx} out of range for {n_items} windows")
        # x is the sequence, y is x shifted by 1 (next-token prediction)
        window = self.tokens[idx : idx + self.block_size + 1]
        # clone() so callers get tensors that do not alias the token buffer.
        return window[:-1].clone(), window[1:].clone()

# Initialize Loader
# The window length must not exceed the model's context length: CustomGPT is
# instantiated with block_size=64, which sizes its positional-embedding table
# and causal mask, so 128-token windows could not be fed through it.
dataset = GPTDataset("/kaggle/input/langchain/langchain.txt", tokenizer, block_size=64)
loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class TransformerBlock(nn.Module):
    """ A single pre-norm GPT decoder block: causal self-attention followed by
    a position-wise feed-forward network, each wrapped in a residual connection.

    Args:
        n_embd: Embedding width of the input and output.
        n_head: Number of attention heads.
        block_size: Longest sequence length the causal mask is built for.
        dropout: Dropout probability used in attention and after the
            feed-forward projection.
    """
    def __init__(self, n_embd, n_head, block_size, dropout=0.1):
        super().__init__()
        # Causal multi-head self-attention
        self.sa = nn.MultiheadAttention(
            embed_dim=n_embd,
            num_heads=n_head,
            dropout=dropout,
            batch_first=True
        )
        # Feed-forward network (computation)
        self.ffwd = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )
        # Layer normalization
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

        # Boolean causal mask: True strictly above the diagonal marks the
        # (query, key) pairs that must NOT attend, i.e. future positions.
        self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size)) == 0)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq_len, n_embd).

        Returns a tensor of the same shape.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``block_size`` the mask was
                built for.
        """
        seq_len = x.size(1)
        max_len = self.mask.size(0)
        if seq_len > max_len:
            # Slicing the mask would silently clamp to max_len and only fail
            # later with a mask-shape mismatch inside the attention layer.
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size {max_len}"
            )

        # 1. Multi-head attention with causal masking (pre-norm):
        #    LayerNorm is applied to the input of the sub-layer and the
        #    un-normalized x is carried by the residual connection.
        attn_mask = self.mask[:seq_len, :seq_len]
        normed = self.ln1(x)  # computed once and reused as query, key and value
        # need_weights=False: the attention map is never used here.
        attn_output, _ = self.sa(
            normed, normed, normed, attn_mask=attn_mask, need_weights=False
        )
        x = x + attn_output

        # 2. Feed-forward network, again pre-norm with a residual connection.
        x = x + self.ffwd(self.ln2(x))
        return x


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class CustomGPT(nn.Module):
    """Decoder-only GPT-style language model.

    Token and learned positional embeddings are summed, passed through a stack
    of ``TransformerBlock`` layers, normalized, and projected to vocabulary
    logits.

    Args:
        vocab_size: Number of token ids.
        n_embd: Embedding width.
        n_head: Attention heads per block.
        n_layer: Number of stacked blocks.
        block_size: Maximum sequence length (context window).
        dropout: Dropout probability handed to every block.
    """

    def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size, dropout):
        super().__init__()
        self.block_size = block_size

        # Token and Learned Positional Embeddings
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # Stack of Transformer Decoder Blocks
        self.blocks = nn.Sequential(*[
            TransformerBlock(n_embd, n_head, block_size, dropout) for _ in range(n_layer)
        ])

        # Final LayerNorm and Linear Head to vocabulary
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: Integer token ids of shape (B, T) with ``T <= block_size``.
            targets: Optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            (B, T, vocab_size) and loss is ``None``. With targets, logits are
            flattened to (B*T, vocab_size) and loss is a scalar tensor.

        Raises:
            ValueError: if ``T`` exceeds ``block_size``.
        """
        B, T = idx.shape
        if T > self.block_size:
            # The positional table only has block_size rows; fail with a clear
            # message instead of an out-of-range embedding lookup.
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.block_size}"
            )

        # Combine token (what) and position (where) embeddings
        tok_emb = self.token_embedding_table(idx) # (B, T, n_embd)
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T, n_embd)
        x = tok_emb + pos_emb # (B, T, n_embd), pos_emb broadcasts over the batch

        # Pass through the stack of blocks
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        loss = None
        if targets is not None:
            # cross_entropy expects (N, C) logits and (N,) targets, so the
            # batch and time dimensions are flattened together. No shift is
            # applied here: targets are compared position by position.
            C = logits.size(-1)
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = nn.functional.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Args:
            idx: Integer prompt token ids of shape (B, T) with ``T >= 1``.
            max_new_tokens: Number of tokens to append.

        Returns:
            Tensor of shape (B, T + max_new_tokens) on the model's device,
            starting with the prompt.

        Raises:
            ValueError: if the prompt contains no tokens.
        """
        if idx.size(1) == 0:
            raise ValueError("generate() needs at least one prompt token")

        # Keep the prompt on the same device as the weights.
        idx = idx.to(self.lm_head.weight.device)

        # Sample with dropout disabled, then restore the caller's mode.
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Crop the context to the last block_size tokens
                idx_cond = idx[:, -self.block_size:]
                logits, _ = self(idx_cond)
                # Focus on the last time step
                next_logits = logits[:, -1, :]
                probs = nn.functional.softmax(next_logits, dim=-1)
                # Sample next token
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx
def _pick_device():
    """Return the preferred torch.device (CUDA, then Apple MPS, then CPU) and print the choice."""
    if torch.cuda.is_available():
        print("Using NVIDIA GPU")
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        print("Using Apple MPS (Metal Performance Shaders)")
        return torch.device("mps")
    print("Using CPU")
    return torch.device("cpu")


# Small GPT whose vocabulary size is taken from the trained tokenizer.
model = CustomGPT(
    vocab_size=tokenizer.get_vocab_size(),
    n_embd=128,
    n_head=4,
    n_layer=2,
    block_size=64,
    dropout=0.1,
)
# Standalone loss object; CustomGPT.forward computes its own cross-entropy.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): only the device handle is chosen here; `model` itself is not
# moved onto it in this cell.
device = _pick_device()

Using NVIDIA GPU


In [None]:

def train_gpt(model, loader, optimizer, device):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module whose ``forward(x, y)`` returns ``(logits, loss)``.
        loader: Iterable of ``(x, y)`` tensor batches.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the batches and the model are placed on.

    Returns:
        Mean of the per-batch losses as a float, or ``nan`` when ``loader``
        yields no batches.
    """
    # Batches are moved to `device` below, so the weights must live there too.
    # Module.to() moves parameters in place, so the optimizer's references
    # stay valid; it is a no-op when the model is already on `device`.
    model.to(device)
    model.train()

    loss_sum = 0.0
    batch_count = 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        _, loss = model(x, y)  # Model returns loss via internal CrossEntropy
        loss.backward()
        # Update model weights
        optimizer.step()
        loss_sum += loss.item()
        batch_count += 1

    if batch_count == 0:
        return float("nan")
    return loss_sum / batch_count

# Print the module tree, then count parameters in a single pass.
print('model.summary()', model)
total_params = 0
trainable_params = 0
for p in model.parameters():
    n_values = p.numel()
    total_params += n_values
    if p.requires_grad:
        trainable_params += n_values
print('total_params', total_params)
print('trainable_params', trainable_params)

# One line per parameter tensor: its qualified name and shape,
# e.g. token_embedding_table.weight torch.Size([461, 128])
for name, param in model.named_parameters():
    print(name, param.shape)

# Snapshot the current weights. NOTE(review): no train_gpt() call is visible
# before this point, so these look like the freshly initialized weights.
torch.save(model.state_dict(), 'model_weights_v1.pth')

def test_gpt(model, loader, device):
    """Evaluate ``model`` on ``loader`` and return the mean batch loss.

    Args:
        model: Module whose ``forward(x, y)`` returns ``(logits, loss)``.
        loader: Iterable of ``(x, y)`` tensor batches.
        device: Device the batches and the model are placed on.

    Returns:
        Mean of the per-batch losses as a float, or ``nan`` when ``loader``
        yields no batches. The model is left in eval mode.
    """
    # Batches are moved to `device` below, so the weights must live there too
    # (no-op when the model is already on `device`).
    model.to(device)
    model.eval()

    total_loss = 0.0
    batch_count = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            _, loss = model(x, y)
            total_loss += loss.item()
            batch_count += 1

    # Batches are counted here rather than via len(loader), so unsized
    # iterables work and an empty loader cannot divide by zero.
    if batch_count == 0:
        return float("nan")
    return total_loss / batch_count


# The "test_" prefix matches pytest's discovery pattern; mark this helper so
# pytest does not try to collect it as a test case.
test_gpt.__test__ = False


model.summary() CustomGPT(
  (token_embedding_table): Embedding(461, 128)
  (position_embedding_table): Embedding(64, 128)
  (blocks): Sequential(
    (0): TransformerBlock(
      (sa): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (ffwd): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=512, out_features=128, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerBlock(
      (sa): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (ffwd): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
      

In [None]:
from typing import Annotated, TypedDict
from langgraph.graph import StateGraph, START, END

# Define persistent state for the agent
class GPTState(TypedDict):
    """State dictionary passed through the LangGraph graph built in this cell.

    ``gpt_inference_node`` reads ``input_text`` and returns ``generated_text``;
    it does not read or update ``history``.
    """
    history: list[str]  # Stores conversation history
    input_text: str     # Current user query
    generated_text:str  # Decoded text returned by gpt_inference_node

def gpt_inference_node(state: GPTState):
    """LangGraph node: continue ``state["input_text"]`` with the PyTorch model.

    Returns:
        A partial state update ``{"generated_text": ...}`` holding the decoded
        prompt followed by up to 20 sampled tokens, or an empty string when
        the prompt encodes to no tokens.
    """
    # Prepare input for our PyTorch model
    prompt = state["input_text"]
    prompt_ids = tokenizer.encode(prompt).ids
    if not prompt_ids:
        # Nothing to condition on; an empty id list would also become a float
        # tensor that the embedding layer rejects.
        return {"generated_text": ""}

    # Build the batch on whatever device the weights currently live on.
    model_device = next(model.parameters()).device
    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=model_device)

    # Generate response with dropout disabled, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        generated_ids = model.generate(input_ids, max_new_tokens=20)
    finally:
        model.train(was_training)

    response = tokenizer.decode(generated_ids[0].tolist())
    return {"generated_text": response}

# Assemble the graph as a single linear path: START -> "generate" -> END
builder = StateGraph(GPTState)
builder.add_node("generate", gpt_inference_node)
for edge_from, edge_to in ((START, "generate"), ("generate", END)):
    builder.add_edge(edge_from, edge_to)

# Compile into a runnable application, run one query and show the final state
app = builder.compile()
result = app.invoke({"input_text": "What is langchain?"})
print(result)

{'input_text': 'What is langchain?', 'generated_text': 'W h at is langchain ": ast mes cor ": system_prompt heav lines are way \u200b runtime ropic nec get wor e ent om ""'}


In [None]:
from typing import TypedDict
from langgraph.graph import StateGraph, START, END

class AgentState(TypedDict):
    """State dictionary passed through the LangGraph workflow built in this cell."""
    input_text: str      # Prompt text that gpt_node encodes and feeds to the model
    generated_text: str  # Decoded text returned by gpt_node

# Define node that calls the custom PyTorch model
def gpt_node(state: AgentState):
    """LangGraph node: continue ``state["input_text"]`` with the PyTorch model.

    Returns:
        A partial state update ``{"generated_text": ...}`` holding the decoded
        prompt followed by up to 50 sampled tokens, or an empty string when
        the prompt encodes to no tokens.
    """
    inputs = tokenizer.encode(state["input_text"]).ids
    if not inputs:
        # Nothing to condition on; an empty id list would also become a float
        # tensor that the embedding layer rejects.
        return {"generated_text": ""}

    # Build the batch on whatever device the weights currently live on.
    model_device = next(model.parameters()).device
    input_ids = torch.tensor([inputs], dtype=torch.long, device=model_device)

    # Generate tokens with dropout disabled, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        output_tokens = model.generate(input_ids, max_new_tokens=50)
    finally:
        model.train(was_training)

    response = tokenizer.decode(output_tokens[0].tolist())
    return {"generated_text": response}

# Construct the graph as a single linear path: START -> "llm" -> END
workflow = StateGraph(AgentState)
workflow.add_node("llm", gpt_node)
for edge_from, edge_to in ((START, "llm"), ("llm", END)):
    workflow.add_edge(edge_from, edge_to)

# Compile, run one query and print only the generated text
app = workflow.compile()
result = app.invoke({"input_text": "What is Transformers?"})
print(result["generated_text"])


W h at is r an s for m er s eed sist ve we ad aph omous fro } van 5 n _pro city by mp - can inst tion of the 20250929 have customization AI !" way ap as carefully x qu omous re ecution re ec { hu ework runtim amless _ stom C workflow want .""" al
