# imports

In [1]:
import math
import os
import glob
import random
from functools import partial

import numpy as np
import torch
import torch.nn.functional as F


In [2]:

from contextlib import nullcontext

In [3]:
import matplotlib.pyplot as plt

In [4]:
from sentencepiece import SentencePieceProcessor

In [5]:
from data_loader import *
from utils import *
from model import *

In [6]:
# Device that the model and batches are placed on; set to 'cpu' to run without a GPU.
device = 'cuda:0'
# Coarse device family ('cuda' or 'cpu') derived from the device string.
if 'cuda' in device:
    device_type = 'cuda'
else:
    device_type = 'cpu'



# paths

In [7]:
# Directory handed to the batch iterator as its `data_cache_dir`.
DATA_CACHE_DIR = "./instruct_data_same_length/"

In [8]:
# Directory passed to save_checkpoint during training; created up front and
# left untouched if it already exists.
out_dir = "./fine_tuning_instruct_pad"
os.makedirs(name=out_dir, exist_ok=True)

In [9]:
# Pretrained checkpoint to fine-tune. Set the PRETRAINED_MODEL_PATH environment
# variable to point at a different file (the original cell also referenced
# '.../llama2.c/stories15M.pt' as an alternative); without it the 110M stories
# checkpoint below is used, exactly as before.
pretrained_model_path = os.environ.get(
    'PRETRAINED_MODEL_PATH',
    '/home/cindy/learning/karpathy/llms/llama2.c/stories110M.pt',
)

# tokenizer

In [10]:
# SentencePiece tokenizer loaded from ./tokenizer.model (relative to the working directory).
tokenizer = SentencePieceProcessor(model_file='./tokenizer.model')

In [11]:
# Vocabulary size as reported by the loaded tokenizer.
vocab_size = tokenizer.vocab_size()

# training

#### mixed precision settings

In [12]:
# Precision used inside autocast. Picking "float16" also switches on the
# GradScaler that is created from this same `dtype` value.
dtype = 'bfloat16'
torch.manual_seed(1337)

# Permit TF32 for CUDA matmuls and for cuDNN.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Device family handed to torch.amp.autocast below.
device_type = "cuda" if "cuda" in device else "cpu"

# Map the dtype name onto the corresponding torch dtype object.
ptdtype = {
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
    "float16": torch.float16,
}[dtype]

# On CPU the forward pass runs without autocast; otherwise it runs under
# autocast with the dtype selected above.
if device_type == "cpu":
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)

In [13]:

# Gradient scaler for the training loop. It is only enabled when training in
# float16; for any other dtype it is constructed disabled and acts as a no-op.
scaler = torch.cuda.amp.GradScaler(
    enabled=dtype == "float16",
)


#### model

In [14]:
# Load the pretrained checkpoint, mapping its tensors onto `device`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
checkpoint = torch.load(pretrained_model_path, map_location=device)
# Show the top-level checkpoint entries as the cell output.
checkpoint.keys()

dict_keys(['model', 'model_args', 'iter_num', 'best_val_loss', 'config'])

In [15]:
# Rebuild the model configuration from the hyper-parameters stored in the
# checkpoint, then override only the maximum sequence length before the
# Transformer is constructed.
pretrained_hparams = checkpoint['model_args']
model_args = ModelArgs(**pretrained_hparams)
model_args.max_seq_len = 350  # sequence length used for this fine-tuning run
model = Transformer(model_args)

In [16]:
# Copy the pretrained weights into the freshly built model.
state_dict = checkpoint['model']
# Checkpoints saved from a torch.compile()-wrapped model prefix every key with
# '_orig_mod.'. Strip that prefix so the keys line up with this model; keys
# without the prefix are left untouched, so plain checkpoints load as before.
# (An empty prefix here would match every key and never strip anything.)
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
# strict=False does not raise on key mismatches; the returned report of
# missing/unexpected keys is the cell output, so check it after running.
model.load_state_dict(state_dict, strict=False)

<All keys matched successfully>

##### freeze

In [17]:
# Freeze every parameter currently in the model so that none of them receives
# gradients. The trailing ';' keeps the notebook from echoing the module repr.
model.requires_grad_(False);

In [18]:
# Total element count over all parameter tensors, regardless of requires_grad.
n_params = sum(param.numel() for param in model.parameters())
print(f'Number of parameters: {n_params}')

Number of parameters: 109529856


##### add lora

In [19]:
# Configure and apply LoRA to `model` through the project's apply_lora helper.
# NOTE(review): `lora_targets` presumably names the attention projection
# modules to adapt — confirm against the helper's implementation.
lora_targets = ['wk', 'wq', 'wo', 'wv']
lora_rank, lora_dropout, lora_alpha = 2, 0.1, 1.0

# Disabled experiments kept for reference:
#   lora_layer_types = [nn.Linear, nn.Embedding]
#   tie_lora_weights(model.output, model.tok_embeddings)
apply_lora(
    model,
    targets=lora_targets, rank=lora_rank,
    dropout=lora_dropout, alpha=lora_alpha,
    verbose=False,
)

In [20]:
# Total element count over all parameter tensors now present in the model
# (this cell runs after apply_lora).
n_params_with_lora = sum(param.numel() for param in model.parameters())
print(f'Number of parameters: {n_params_with_lora}')

Number of parameters: 109824768


In [21]:
# Move the model to the configured device; the trailing ';' suppresses the
# notebook's echo of the returned module.
model.to(device);

#### data

In [22]:
# Reuse the model's configured maximum sequence length for the data batches
# (it is passed to the batch iterator as `max_seq_len`).
max_seq_len = model_args.max_seq_len
print(max_seq_len)

350


In [23]:
# Micro-batch size requested from the batch iterator for each forward pass.
batch_size = 64

# Effective batch size per optimizer step, reached through gradient accumulation.
wanted_batch_size = 4 * 128

# Floor division would silently shrink the effective batch, or produce zero
# accumulation steps, if the sizes do not divide evenly — fail loudly instead.
assert 0 < batch_size <= wanted_batch_size and wanted_batch_size % batch_size == 0, (
    f'wanted_batch_size ({wanted_batch_size}) must be a positive multiple of '
    f'batch_size ({batch_size})'
)
gradient_accumulation_steps = wanted_batch_size // batch_size

print(f'Wanted batch_size: {wanted_batch_size}, gradient accumulation steps: {gradient_accumulation_steps}, batch_size: {batch_size}')

Wanted batch_size: 512, gradient accumulation steps: 8, batch_size: 64


In [24]:
# Pre-bind the loader settings so that callers only have to supply the
# remaining arguments (the training loop passes `split`).
batch_loader_kwargs = dict(
    device=device,
    batch_size=batch_size,
    max_seq_len=max_seq_len,
    data_cache_dir=DATA_CACHE_DIR,
)
iter_batches = partial(iter_batch_func, **batch_loader_kwargs)

#### optimizer

In [25]:
# Initial learning rate handed to the optimizer. The training loop overwrites
# each parameter group's "lr" from get_lr() on every step, so this value only
# matters until the first iteration runs.
learning_rate = 5e-4
optimizer = get_optimizer(
    model=model,
    # Use the notebook-wide device family instead of a hard-coded 'cuda' so
    # that switching `device` to CPU is honoured here as well.
    device_type=device_type,
    learning_rate=learning_rate,
    weight_decay=1e-1,
    beta1=0.9,
    beta2=0.95,
)

num decayed parameter tensors: 96, with 294,912 parameters
num non-decayed parameter tensors: 0, with 0 parameters


## training loop

In [26]:
# Gradient-norm clipping threshold; the loop skips clipping when this is 0.
grad_clip = 1
# Lowest validation loss seen so far (starts at a large sentinel value).
best_val_loss = 1e9
# Used by the loop both as the evaluation interval (in steps) and as the
# `eval_iters` argument of estimate_loss.
eval_iters = 100
# The loop stops once iter_num exceeds this value.
max_iters = 5_000

In [27]:
# Step counter; the training loop increments it once per optimizer step.
iter_num = 0 

In [28]:
# Echo the checkpoint directory as the cell output before training starts.
out_dir

'./fine_tuning_instruct_pad'

In [29]:


# Prompt used to sample a story each time a new best checkpoint is saved.
sample_prompt = 'Write a story. In the story, try to use the verb "eat", the noun "cat" and the adjective "sad". The story has the following features: the story should contain at least one dialogue. Possible story:'

train_batch_iter = iter_batches(split='train')
X, Y = next(train_batch_iter)  # fetch the first batch before entering the loop

while True:
    # Set this step's learning rate on every parameter group.
    # NOTE(review): get_lr is not passed `learning_rate`, so the schedule's
    # peak presumably comes from get_lr's own default — confirm they agree.
    lr = get_lr(iter_num, max_iters=max_iters)
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

    # Periodic evaluation. `eval_iters` doubles as the evaluation interval and
    # as the `eval_iters` argument of estimate_loss.
    if iter_num % eval_iters == 0:
        losses = estimate_loss(
            model=model,
            iter_batches=iter_batches,
            eval_iters=eval_iters,
            ctx=ctx
        )
        print(f"step {iter_num}: lr {lr}, train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if losses["val"] < best_val_loss:
            best_val_loss = losses["val"]
            # Step 0 only records the baseline loss; checkpoints and samples
            # are produced from the first improvement after that.
            if iter_num > 0:
                save_checkpoint(
                    model=model,
                    optimizer=optimizer,
                    model_args=model_args,
                    iter_num=iter_num,
                    out_dir=out_dir
                )
                _, paragraph = generate_paragraph(
                    model,
                    prompt=sample_prompt,
                    tokenizer=tokenizer,
                    # Follow the notebook-wide device instead of a hard-coded 'cuda:0'.
                    device=device,
                    max_new_tokens=300
                )
                print(paragraph)
        # The evaluation/generation helpers may leave the model in eval mode
        # (not visible from this notebook); switch back to training mode so
        # that dropout is active for the optimisation steps. This is a no-op
        # if the model is already in training mode.
        model.train()

    # Accumulate gradients over several micro-batches; each loss is divided by
    # the number of micro-steps so the summed gradient corresponds to the mean.
    for micro_step in range(gradient_accumulation_steps):
        with ctx:
            logits = model(X)
            loss = compute_loss(logits, Y)
            loss = loss / gradient_accumulation_steps
        # Fetch the next batch before running the backward pass.
        X, Y = next(train_batch_iter)
        scaler.scale(loss).backward()

    # Unscale before clipping so the threshold applies to the true gradient norm.
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)

    iter_num += 1
    if iter_num > max_iters:
        break


step 0: lr 0.0, train loss 0.8439, val loss 0.8843
step 100: lr 5e-05, train loss 0.8359, val loss 0.8762
saving checkpoint to ./fine_tuning_instruct_pad
Write a story. In the story, try touse the verb "eat", the noun "cat" and the adjective "sad". The story has the following features: the story should contain at least one dialogue. Possible story: For example, a three-year-old, who was playing in her garden, found an old shirt. The shirt was old and worn out, but she loved it anyway. She decided to keep it and wear it to school the next day.
The next day, when she was about to wear the shirt, her mom asked her to take off the old shirt before going to school.
Aly replied, "But Mom, I want to keep my old shirt. It's my favorite."
But her mom said, "No, Amy. You can't keep it. It's too old and worn. Let's put it away and get you a new one."
Aly was upset. She didn't want a new shirt. She wanted her old shirt. She started to cry. Her mom hugged her, and said, "It's OK, Amy. You'll find a

In [30]:
# Echo the checkpoint directory as the cell output after training.
out_dir

'./fine_tuning_instruct_pad'

## 