In [2]:
import os

# Point every Hugging Face cache location at a local ./cache folder.
# These variables are read when `transformers` / `huggingface_hub` are first
# imported, so they must be set BEFORE those imports; setting them afterwards
# has no effect on the already-imported library.
os.environ['XDG_CACHE_HOME'] = './cache'
os.environ['HF_HOME'] = './cache'
os.environ['HF_DATASETS_CACHE'] = './cache'
os.environ['TRANSFORMERS_CACHE'] = './cache'
os.environ['HUGGINGFACE_HUB_CACHE'] = './cache'

import torch
from torch.nn import functional as F
from transformers import GPT2LMHeadModel
from transformers import pipeline, set_seed, PretrainedConfig


In [5]:
# Seed the RNGs so the sampled continuations are repeatable.
set_seed(42)

# GPT-2 text-generation pipeline; the model files are cached under ./cache.
generator = pipeline(
    'text-generation',
    model='gpt2',
    model_kwargs={"cache_dir": './cache'},
)

# Sample five continuations of the prompt with top-k (k=50) sampling.
# Kept as the cell's last expression so the result is displayed.
generator(
    "Hello, I'm language model,",
    num_return_sequences=5,
    do_sample=True,
    top_k=50,
    max_length=30,
)

Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm language model, and my project will get better with time, but I think there are some things I'd probably do differently. I"},
 {'generated_text': "Hello, I'm language model, but the second we're talking about your own language, will you please define an argument to those, that actually can"},
 {'generated_text': "Hello, I'm language model, and I'm just a programmer. I'm not a machine-in-machine type-converter (although"},
 {'generated_text': 'Hello, I\'m language model, I\'m a student. You\'re a language model, and I don\'t care about your education, your family."'},
 {'generated_text': "Hello, I'm language model, I'm program.\n\nYou can imagine how it's like at this point in your life, your family takes"}]

# My model

In [5]:
def get_device():
    """Pick the best available torch device.

    Returns:
        str: 'cuda' if CUDA is available, else 'mps' if the MPS backend is
        available, else 'cpu'.
    """
    if torch.cuda.is_available():
        return 'cuda'
    # torch.backends.mps is the documented availability check; the getattr
    # guard lets builds without that backend fall through to CPU.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        return 'mps'
    # Previously this branch was a bare 'cpu' expression with no `return`,
    # so the function yielded None on CPU-only machines.
    return 'cpu'

device = get_device()

In [6]:
# Display the device string chosen by get_device().
device

'mps'

In [9]:
from train_gpt2 import GPT

In [14]:
import tiktoken

num_sequences = 5
max_length = 30

# Load the pretrained GPT-2 weights into the local GPT class, switch to eval
# mode and keep everything on the CPU.
model = GPT.from_pretrained('gpt2')
model.eval()
model.to("cpu")

# Encode the prompt once and replicate it into one row per sequence.
enc = tiktoken.get_encoding('gpt2')
tokens = torch.tensor(enc.encode("Hello, I'm language model,"), dtype=torch.long)
tokens = tokens[None, :].repeat(num_sequences, 1)
x = tokens.to("cpu")

# Seed after the model is built so sampling starts from a known RNG state.
torch.manual_seed(42)
torch.mps.manual_seed(42)

# Top-50 sampling: each step appends one sampled token to every row until the
# rows reach max_length tokens.
with torch.no_grad():
    while x.size(1) < max_length:
        last_logits = model(x)[:, -1, :]          # logits at the final position
        probs = F.softmax(last_logits, dim=-1)
        topk_probs, topk_ids = probs.topk(50, dim=-1)
        choice = torch.multinomial(topk_probs, 1)  # index into the top-50 list
        next_token = topk_ids.gather(-1, choice)   # map back to vocabulary ids
        x = torch.cat((x, next_token), dim=1)

# Decode and print every generated row.
for row in x[:, :max_length].tolist():
    print(f"> {enc.decode(tokens=row)}")

loading weights from pretrained gpt: %s gpt2
> Hello, I'm language model, and and the I ( to and and, and such to a " ( and a.. that to which of
> Hello, I'm language model, a a ( which it to has not one I who and to. to I, and one was that he to
> Hello, I'm language model, as, for to that it that. and for are we
. in, and a you it, we the
> Hello, I'm language model, and ( and that for for and and to would the, was the at,.. in in ( the

> Hello, I'm language model, of he.,, and and the that the the it, be and to a we by,. that not


In [46]:
# Key suffixes whose Hugging Face tensor is transposed before being compared
# with the local model (presumably HF's Conv1D layout vs. nn.Linear in the
# local GPT -- confirm in train_gpt2).
transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

model_hf = GPT2LMHeadModel.from_pretrained("gpt2", cache_dir='./cache')
model = GPT.from_pretrained('gpt2')

# Every tensor in the Hugging Face state dict must be exactly equal to the
# tensor stored under the same key in the local model.
model_dict = model.state_dict()
for hf_key, hf_value in model_hf.state_dict().items():
    if any(hf_key.endswith(w) for w in transposed):
        hf_value = hf_value.t()

    result = hf_value.equal(model_dict[hf_key])
    # Name the offending tensor on failure; a bare assert would only report
    # "AssertionError" with no hint of which parameter differs.
    assert result, (
        f"weight mismatch for {hf_key}: "
        f"hf shape {tuple(hf_value.shape)} vs local shape {tuple(model_dict[hf_key].shape)}"
    )
    

loading weights from pretrained gpt: %s gpt2


In [45]:
# Sanity check: does this key end with one of the suffixes listed in `transposed`?
any('transformer.h.0.attn.c_attn.weight'.endswith(w) for w in transposed)

True

In [36]:
# Hugging Face tensor for h.0's c_attn weight, shown after transposing.
model_hf.state_dict()['transformer.h.0.attn.c_attn.weight'].t()

tensor([[-0.4738,  0.0874,  0.0039,  ..., -0.2592,  0.1517, -0.4100],
        [-0.2614,  0.1473,  0.0695,  ..., -0.0164,  0.2170, -0.1924],
        [-0.0978,  0.2387,  0.3668,  ...,  0.1991,  0.1043, -0.2400],
        ...,
        [ 0.0513, -0.0525,  0.1143,  ...,  0.0095,  0.0293, -0.0046],
        [-0.0584, -0.0113,  0.0363,  ..., -0.0516, -0.0429,  0.0070],
        [ 0.0250, -0.0156, -0.0318,  ...,  0.0319, -0.0475,  0.0198]])

In [34]:
# h.0's c_attn weight from the local GPT model's state dict, shown as stored.
model.state_dict()['transformer.h.0.attn.c_attn.weight']

tensor([[-0.4738,  0.0874,  0.0039,  ..., -0.2592,  0.1517, -0.4100],
        [-0.2614,  0.1473,  0.0695,  ..., -0.0164,  0.2170, -0.1924],
        [-0.0978,  0.2387,  0.3668,  ...,  0.1991,  0.1043, -0.2400],
        ...,
        [ 0.0513, -0.0525,  0.1143,  ...,  0.0095,  0.0293, -0.0046],
        [-0.0584, -0.0113,  0.0363,  ..., -0.0516, -0.0429,  0.0070],
        [ 0.0250, -0.0156, -0.0318,  ...,  0.0319, -0.0475,  0.0198]])

In [None]:
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.bias torch.Size([1, 1, 1024, 1024])
transformer.h.0.attn.c_attn.weight torch.Size([2304, 768])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_proj.bias torch.Size([768])