In [16]:
import torch
import torch.nn as nn

from functools import partial
from model import Transformer, ModelArgs
from lora import add_lora, remove_lora

%autoreload 2

UsageError: Line magic function `%autoreload` not found.


## model

In [17]:
# Architecture hyperparameters for the "stories 42M" model size.
dim = 512
n_layers = 8
n_heads = 8
n_kv_heads = 8
multiple_of = 32
dropout = 0.0
vocab_size = 32000
max_seq_len = 1024

# Keyword arguments forwarded to ModelArgs, gathered in a single mapping.
model_args = {
    "dim": dim,
    "n_layers": n_layers,
    "n_heads": n_heads,
    "n_kv_heads": n_kv_heads,
    "vocab_size": vocab_size,
    "multiple_of": multiple_of,
    "max_seq_len": max_seq_len,
    "dropout": dropout,
}

# Build the config object, then the model from it.
gptconf = ModelArgs(**model_args)
model = Transformer(gptconf)

# Show the module tree so the layer shapes can be checked by eye.
print(model)

Transformer(
  (tok_embeddings): Embedding(32000, 512)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-7): 8 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=512, out_features=512, bias=False)
        (wk): Linear(in_features=512, out_features=512, bias=False)
        (wv): Linear(in_features=512, out_features=512, bias=False)
        (wo): Linear(in_features=512, out_features=512, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=512, out_features=1376, bias=False)
        (w2): Linear(in_features=1376, out_features=512, bias=False)
        (w3): Linear(in_features=512, out_features=1376, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=512,

In [20]:
# LoRA hyperparameters; adapters are requested only for the modules named
# "wq" and "wk" (the attention query/key projections in the model printout).
lora_rank = 8
lora_alpha = 16
lora_dropout_p = 0.05
target_modules = ["wq", "wk"]

# remove_lora is called before add_lora — presumably so that re-running this
# cell does not stack adapters on top of earlier ones (confirm in lora.py).
remove_lora(model)
add_lora(
    model,
    rank=lora_rank,
    alpha=lora_alpha,
    dropout_p=lora_dropout_p,
    target_modules=target_modules,
)
print(model)

add lora to layers.0.attention.wq
add lora to layers.0.attention.wk
add lora to layers.1.attention.wq
add lora to layers.1.attention.wk
add lora to layers.2.attention.wq
add lora to layers.2.attention.wk
add lora to layers.3.attention.wq
add lora to layers.3.attention.wk
add lora to layers.4.attention.wq
add lora to layers.4.attention.wk
add lora to layers.5.attention.wq
add lora to layers.5.attention.wk
add lora to layers.6.attention.wq
add lora to layers.6.attention.wk
add lora to layers.7.attention.wq
add lora to layers.7.attention.wk
Transformer(
  (tok_embeddings): Embedding(32000, 512)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-7): 8 x TransformerBlock(
      (attention): Attention(
        (wq): ParametrizedLinear(
          in_features=512, out_features=512, bias=False
          (parametrizations): ModuleDict(
            (weight): ParametrizationList(
              (0): LoRA(in_features=512, out_features=512, weight_type=linear, lora_rank=8, lora

In [9]:
def get_lora_params(model, print_shapes=True):
    """Lazily yield the LoRA parameters found in ``model``.

    A parameter counts as a LoRA parameter when its dotted name has at least
    four components, the fourth component from the end is
    ``"parametrizations"`` and the final component is ``"lora_A"`` or
    ``"lora_B"`` (e.g. ``...wq.parametrizations.weight.0.lora_A``).

    Args:
        model: Any object exposing ``named_parameters()`` that produces
            ``(name, parameter)`` pairs.
        print_shapes: When true, print each matching parameter's name and
            ``.shape`` just before it is yielded.

    Yields:
        The matching parameter objects, in ``named_parameters()`` order.
    """
    lora_leaf_names = ("lora_A", "lora_B")

    for param_name, param in model.named_parameters():
        parts = param_name.split(".")
        # Too short to contain "parametrizations.<attr>.<idx>.<leaf>".
        if len(parts) < 4:
            continue
        if parts[-4] != "parametrizations" or parts[-1] not in lora_leaf_names:
            continue
        if print_shapes:
            print(param_name, param.shape)
        yield param


In [10]:
# Exhaust the generator purely for its side effect: with the default
# print_shapes=True it prints every LoRA parameter's name and shape.
for n in get_lora_params(model):
    pass

layers.0.attention.wq.parametrizations.weight.0.lora_A torch.Size([512, 8])
layers.0.attention.wq.parametrizations.weight.0.lora_B torch.Size([8, 512])
layers.0.attention.wk.parametrizations.weight.0.lora_A torch.Size([512, 8])
layers.0.attention.wk.parametrizations.weight.0.lora_B torch.Size([8, 512])
layers.1.attention.wq.parametrizations.weight.0.lora_A torch.Size([512, 8])
layers.1.attention.wq.parametrizations.weight.0.lora_B torch.Size([8, 512])
layers.1.attention.wk.parametrizations.weight.0.lora_A torch.Size([512, 8])
layers.1.attention.wk.parametrizations.weight.0.lora_B torch.Size([8, 512])
layers.2.attention.wq.parametrizations.weight.0.lora_A torch.Size([512, 8])
layers.2.attention.wq.parametrizations.weight.0.lora_B torch.Size([8, 512])
layers.2.attention.wk.parametrizations.weight.0.lora_A torch.Size([512, 8])
layers.2.attention.wk.parametrizations.weight.0.lora_B torch.Size([8, 512])
layers.3.attention.wq.parametrizations.weight.0.lora_A torch.Size([512, 8])
layers.3.att

In [21]:
# Load the checkpoint onto the CPU explicitly. Its stored config records
# device 'cuda', and without map_location torch.load tries to restore tensors
# to the device they were saved from, which fails on a machine without a GPU.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
ckpt = torch.load(
    "./out/tinyshakespeare_lora_stories260k_default_nodropout_0822_1536/ckpt.pt",
    map_location="cpu",
)

In [24]:
# Display the training configuration stored in the checkpoint under "config".
ckpt["config"]

{'out_dir': 'out/tinyshakespeare_lora_stories260k_default_nodropout_0822_1536',
 'eval_interval': 2000,
 'log_interval': 1,
 'eval_iters': 100,
 'eval_only': False,
 'always_save_checkpoint': False,
 'init_from': 'pretrained:out/stories260k_default/ckpt.pt',
 'wandb_log': True,
 'wandb_project': 'llamac',
 'wandb_run_name': 'tinyshakespeare_lora_stories260k_default_nodropout_0822_1536',
 'batch_size': 128,
 'max_seq_len': 512,
 'vocab_source': 'custom',
 'vocab_size': 512,
 'dataset': 'tinyshakespeare',
 'dim': 288,
 'n_layers': 6,
 'n_heads': 6,
 'n_kv_heads': 6,
 'multiple_of': 32,
 'dropout': 0.0,
 'use_lora': True,
 'lora_rank': 16,
 'lora_alpha': 1,
 'lora_dropout_p': 0.0,
 'gradient_accumulation_steps': 1,
 'learning_rate': 0.001,
 'max_iters': 20000,
 'weight_decay': 0.01,
 'beta1': 0.9,
 'beta2': 0.99,
 'grad_clip': 1.0,
 'decay_lr': True,
 'warmup_iters': 1000,
 'device': 'cuda',
 'dtype': 'bfloat16',
 'compile': True}

## dataloader

#### Inspect the TinyStories data

In [1]:
import numpy as np
import requests
import torch
import json

In [2]:
# Memory-map the shard read-only (mode="r") and interpret it as uint16 values.
# Presumably these are token ids from the pretokenization step — confirm.
m = np.memmap("./data/tok512/data00.bin", dtype=np.uint16, mode="r")

In [3]:
# Bare last expression: display the memmap's repr (first/last values, dtype).
m

memmap([  1, 317, 269, ..., 287, 411, 426], dtype=uint16)

In [4]:
# Read the raw JSON shard. The encoding is given explicitly because JSON text
# is UTF-8, whereas open() otherwise falls back to the platform's locale
# encoding and can mis-decode or fail on non-ASCII characters.
with open("./data/TinyStories_all_data/data00.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [14]:
# Print the "story" field of the first record; print() (rather than a bare
# expression) renders the text's newlines instead of showing its repr.
print(data[0]['story'])



Lily and Ben are friends. They like to play in the park. One day, they see a big tree with a swing. Lily wants to try the swing. She runs to the tree and climbs on the swing.
"Push me, Ben!" she says. Ben pushes her gently. Lily feels happy. She swings higher and higher. She laughs and shouts.
Ben watches Lily. He thinks she is cute. He wants to swing too. He waits for Lily to stop. But Lily does not stop. She swings faster and faster. She is having too much fun.
"Can I swing too, Lily?" Ben asks. Lily does not hear him. She is too busy swinging. Ben feels sad. He walks away.
Lily swings so high that she loses her grip. She falls off the swing. She lands on the ground. She hurts her foot. She cries.
"Ow, ow, ow!" she says. She looks for Ben. She wants him to help her. But Ben is not there. He is gone.
Lily feels sorry. She wishes she had shared the swing with Ben. She wishes he was there to hug her. She limps to the tree. She sees something hanging from a branch. It is Ben's hat. He 

In [6]:
from tokenizer import Tokenizer

In [7]:
# Build the tokenizer from the model file at ./data/tok512.model.
# Presumably the same vocabulary that produced the tok512 shard — TODO confirm.
tok = Tokenizer("./data/tok512.model")

In [10]:
# Decode the first 1000 values of the shard back to text. Slice the memmap
# *before* calling tolist(): converting the whole array first would build a
# Python list of every value in the file just to keep the first 1000.
print(tok.decode(m[:1000].tolist()))

Lily and Ben are friends. They like to play in the park. One day, they see a big tree with a swing. Lily wants to try the swing. She runs to the tree and climbs on the swing.
"Push me, Ben!" she says. Ben pushes her gently. Lily feels happy. She swings higher and higher. She laughs and shouts.
Ben watches Lily. He thinks she is cute. He wants to swing too. He waits for Lily to stop. But Lily does not stop. She swings faster and faster. She is having too much fun.
"Can I swing too, Lily?" Ben asks. Lily does not hear him. She is too busy swinging. Ben feels sad. He walks away.
Lily swings so high that she loses her grip. She falls off the swing. She lands on the ground. She hurts her foot. She cries.
"Ow, ow, ow!" she says. She looks for Ben. She wants him to help her. But Ben is not there. He is gone.
Lily feels sorry. She wishes she had shared the swing with Ben. She wishes he was there to hug her. She limps to the tree. She sees something hanging from a branch. It is Ben's hat. He le

#### prepare tinyshakespeare

1. Encode the whole text with the tokenizer.
2. Save the encoded tokens into a `.bin` file.
3. Load that `.bin` file with the dataloader.