In [None]:
import tiktoken
import torch
from torch.nn import functional as F

def model_inference(model, enc=None, prompt="Hello", max_length=50):
    """Sample a continuation of ``prompt`` from ``model``, one token at a time.

    Args:
        model: Callable that takes a ``(B, T)`` tensor of token ids and returns
            ``(logits, loss)``, where ``logits`` is indexed as
            ``(B, T, vocab_size)``. Its train/eval mode is left untouched.
        enc: Tokenizer exposing ``encode`` and ``decode`` (and, ideally,
            ``n_vocab``). Defaults to the tiktoken GPT-2 encoding, which is now
            resolved on first use rather than when the function is defined.
        prompt: Text to condition generation on; must encode to at least one
            token.
        max_length: Number of new tokens to sample.

    Returns:
        A tuple ``(x, text)``. ``x`` is the ``(1, prompt_len + max_length)``
        tensor holding the prompt ids followed by the sampled ids. ``text`` is
        the decoded sequence with every run of whitespace collapsed to a
        single space.

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens.
    """
    if enc is None:
        enc = tiktoken.get_encoding("gpt2")

    # Run on whatever device the model lives on; fall back to the original
    # hard-coded "cuda" when the model exposes no parameters to inspect.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration, TypeError):
        device = torch.device("cuda")

    prompt_ids = enc.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError("prompt must encode to at least one token")

    # Add the batch dimension: (T,) -> (1, T).
    x = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

    # No gradients are needed for sampling; without no_grad every step would
    # build and keep an autograd graph.
    with torch.no_grad():
        for _ in range(max_length):
            logits, _loss = model(x)  # (B, T, vocab_size)
            logits = logits[:, -1, :]  # only the last position predicts the next token
            probs = F.softmax(logits, dim=-1)
            # Plain sampling from the full distribution (no temperature / top-k).
            next_token = torch.multinomial(probs, num_samples=1)  # (B, 1)
            x = torch.cat((x, next_token), dim=1)

    # The model's output layer may be wider than the tokenizer's vocabulary
    # (padded vocab), so drop ids the tokenizer cannot decode. Valid ids are
    # 0 .. n_vocab - 1, hence the strict comparison.
    vocab_limit = getattr(enc, "n_vocab", 50257)
    decodable_ids = [token for token in x[0].tolist() if token < vocab_limit]
    decoded_output = enc.decode(decodable_ids)

    return x, " ".join(decoded_output.split())

In [2]:
import time
import os
from dataclasses import dataclass
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import json
from datetime import datetime

from model import GPT
from model import GPTConfig
from model import B, T
from data import DataLoader
from inference import model_inference
from hellaswag import iterate_examples
from hellaswag import render_example
from hellaswag import get_most_likely_row

In [3]:
# Build a GPT from the default GPTConfig, then move it onto the GPU.
# The trailing expression evaluates to the module, so the cell echoes its architecture.
model = GPT(GPTConfig())
model.cuda()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 1024)
    (wpe): Embedding(1024, 1024)
    (h): ModuleList(
      (0-15): 16 x Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (c_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50304, bias=False)
)

In [8]:
# Load the checkpoint straight onto the GPU. weights_only=True restricts
# unpickling to tensors and plain containers, so a tampered .pth file cannot
# run arbitrary code at load time.
state_dict = torch.load(
    "model_weights_20250904_0642.pth", map_location="cuda", weights_only=True
)

In [11]:
# Strip wrapper prefixes so the checkpoint keys match the bare model's
# parameter names. Prefixes are removed only from the START of a key and
# repeatedly, so stacked wrappers (e.g. "module._orig_mod.x") are fully
# unwrapped while an identical substring in the middle of a key is untouched.
wrapper_prefixes = ("_orig_mod.", "module.")
new_state_dict = {}
for k, v in state_dict.items():
    name = k
    stripped = True
    while stripped:
        stripped = False
        for prefix in wrapper_prefixes:
            if name.startswith(prefix):
                name = name[len(prefix):]
                stripped = True
    new_state_dict[name] = v

In [13]:
# Copy the cleaned checkpoint into the model; strict matching (the default)
# raises if any key is missing or unexpected.
model.load_state_dict(new_state_dict, strict=True)

<All keys matched successfully>

In [27]:
# Sample a continuation of the prompt; the cell displays the returned
# (token tensor, decoded text) pair. Sampling is stochastic, so reruns differ.
model_inference(model, prompt="Once upon a time")

(tensor([[ 7454,  2402,   257,   640,    11,  2935, 26674,   274,   561,   423,
          30671,   262,  8222,   329,  3598,   812,    11,   290,   772,   788,
            561,  1249,   683,   284,  3100,   329,  2241,    13,   198,  3198,
            286,  2935, 26674,   274,   338,   749,  5863, 10288,   284,   262,
          20715, 15895,  5442,  3223,  2084,   373,   326,   981, 31521,  7244,
           1276,   307, 32478, 48451]], device='cuda:0'),
 "Once upon a time, Descartes would have inhabited the forest for seven years, and even then would allow him to dig for himself. One of Descartes's most famous references to the Nobel Prize winning dark ago was that while detecting Adam must beEitherSPEC")