In [None]:
import tiktoken
import torch
from torch.nn import functional as F

def model_inference(model, enc=None, prompt="Hello", max_new_tokens=50):
    """Sample a continuation of ``prompt`` from ``model`` and decode it to text.

    Starting from the encoded prompt, one token at a time is drawn from the
    softmax of the model's last-position logits (plain multinomial sampling,
    no temperature or top-k) and appended to the running sequence.

    Args:
        model: A ``torch.nn.Module`` whose call ``model(x)`` returns a
            ``(logits, loss)`` pair, with ``logits`` indexable as
            ``[:, -1, :]``. Its train/eval mode is restored on exit.
        enc: Tokenizer exposing ``encode``, ``decode`` and ``n_vocab``.
            ``None`` (the default) selects tiktoken's ``"gpt2"`` encoding; it
            is resolved lazily so defining/importing this function does not
            build a tokenizer.
        prompt: Text to continue.
        max_new_tokens: Number of tokens to sample after the prompt.

    Returns:
        A ``(x, text)`` tuple. ``x`` is the ``(1, prompt_len + max_new_tokens)``
        tensor of token ids (prompt included). ``text`` is the decoded
        sequence with every whitespace run collapsed to a single space; ids
        that fall outside the tokenizer's vocabulary are dropped before
        decoding.

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens.
    """
    if enc is None:
        # Local import keeps the function usable with a custom tokenizer even
        # where tiktoken is not installed.
        import tiktoken

        enc = tiktoken.get_encoding("gpt2")

    prompt_ids = enc.encode(prompt)
    if not prompt_ids:
        raise ValueError("prompt must encode to at least one token")

    # Run on whatever device the model lives on instead of assuming CUDA.
    # Falls back to "cuda" (the previous hard-coded choice) for parameterless models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    # Add the batch dimension: (T,) -> (1, T).
    x = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

    # NOTE(review): the sequence is never cropped, so prompt_len + max_new_tokens
    # must fit in the model's context window — confirm against the model config.
    was_training = model.training
    model.eval()  # disable dropout etc. while sampling
    try:
        with torch.no_grad():  # no autograd graph is needed for generation
            for _ in range(max_new_tokens):
                logits, _loss = model(x)  # (B, T, vocab_size)
                logits = logits[:, -1, :]  # only the last position predicts the next token
                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)  # (B, 1)
                x = torch.cat((x, next_token), dim=1)
    finally:
        model.train(was_training)

    # The model's output layer may be wider than the tokenizer's vocabulary
    # (e.g. padded for efficiency). Valid ids are 0 .. n_vocab - 1, so anything
    # at or above n_vocab cannot be decoded and is skipped.
    vocab_size = enc.n_vocab
    decoded_output = enc.decode([tok for tok in x[0].tolist() if tok < vocab_size])

    return x, " ".join(decoded_output.split())

In [1]:
import time
import os
from dataclasses import dataclass
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import json
from datetime import datetime

from model import GPT
from model import GPTConfig
from model import B, T
from data import DataLoader
from inference import model_inference
from hellaswag import iterate_examples
from hellaswag import render_example
from hellaswag import get_most_likely_row

In [2]:
# Build the network from a default GPTConfig and place it on the GPU.
# `.to(...)` is left as the cell's final bare expression so the notebook
# echoes the module hierarchy shown below.
model = GPT(GPTConfig())
model.to(torch.device("cuda"))

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 1024)
    (wpe): Embedding(1024, 1024)
    (h): ModuleList(
      (0-15): 16 x Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (c_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50304, bias=False)
)

In [5]:
def _strip_wrapper_prefixes(key, prefixes=("_orig_mod.", "module.")):
    """Return ``key`` with every leading wrapper prefix removed.

    Only text at the very start of ``key`` is touched, and stripping repeats
    until no prefix matches. Stacked wrappers such as ``"module._orig_mod.x"``
    therefore reduce to ``"x"``, while a later occurrence of the same text
    (``"module.foo_module.w"`` -> ``"foo_module.w"``) is left alone.
    """
    stripped = True
    while stripped:
        stripped = False
        for prefix in prefixes:
            # `prefix and ...` guards against an empty prefix looping forever.
            if prefix and key.startswith(prefix):
                key = key[len(prefix):]
                stripped = True
    return key


# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints you trust; where the installed PyTorch supports
# it, consider passing weights_only=True.
state_dict = torch.load("model_weights_20250905_1150.pth", map_location="cuda")

# Strip unwanted prefixes. "_orig_mod." and "module." are the names that
# torch.compile and DataParallel/DistributedDataParallel wrappers typically
# put in front of parameter names when a wrapped model's state dict is saved.
new_state_dict = {}
for k, v in state_dict.items():
    new_state_dict[_strip_wrapper_prefixes(k)] = v

In [6]:
# Copy the prefix-stripped checkpoint weights into `model`. The call is left
# as a bare expression so its result is echoed as the cell output
# (shown below as "<All keys matched successfully>").
model.load_state_dict(new_state_dict)

<All keys matched successfully>

In [7]:
# Generate a continuation of a story-style prompt with the loaded weights.
# The cell echoes the return value: a (token-id tensor, decoded text) pair.
# NOTE(review): no RNG seed is set in any visible cell, so if generation
# samples stochastically this output will differ between runs — consider
# calling torch.manual_seed(...) first for reproducibility.
model_inference(model=model, prompt="Once upon a time")

(tensor([[ 7454,  2402,   257,   640,   878,   465,  1204,    13,   198, 13488,
            355,  6283,   355,  2910,   393,  1692,  9791,   287,   262, 39385,
             13,  1649,   777, 14705,   547,  1234,   287,   262, 16161,   416,
           1793,    11,   340,  3636,   447,   247,    83,  4691,   514,  7471,
            780,    11,   996,    11,   484,   561,  1239, 18188,   262,  6881,
             13,  1119,   550,   356]], device='cuda:0'),
 'Once upon a time before his life. waited as strange as blood or human beings in the cosmos. When these planets were put in the galaxy by God, it wouldn’t serve us anymore because, though, they would never attain the universe. They had we')

In [13]:
# Same generation call with a factual-completion prompt; the cell echoes the
# (token-id tensor, decoded text) pair returned by the function.
# NOTE(review): no RNG seed is set in any visible cell, so this output is
# presumably not reproducible run to run — confirm and seed if needed.
model_inference(model=model, prompt="Paris is the capital of")

(tensor([[40313,   318,   262,  3139,   286,  7431, 46157,    11,   262,  3139,
            286,  5267,   577, 14144,   373,  2067,   351,  1295, 31412,   860,
            290,   416, 45086, 10956,  1618,   264, 13207,  1312,    13,    68,
             13,   663,   826, 12979,   262,  6232,    11,   262,  1748,   351,
            663,  1388,  1910,   290,  3599,   262,  1748,   351,   663,  2802,
           1499,   357,   896,   264,   528]], device='cuda:0'),
 'Paris is the capital of Ram Igor, the capital of Novose Eye was started with placefactor 9 and by hypersigious organ shole i.e. its right afterwards the neighborhood, the city with its main market and starting the city with its mother country (its siz')