In [8]:
# Install the fine-tuning dependencies into the kernel's environment.
# NOTE(review): none of the active installs below pin a version, and
# transformers is force-upgraded with -U (the recorded output shows it moving
# from 4.35.2 to 4.37.2). Pin versions before sharing so a fresh run resolves
# to the same packages; restart the kernel after installing.
%pip install einops
%pip install peft
%pip install trl
%pip install tensorboard
%pip install -U transformers
# Previously used, now disabled, installs (kept for reference). They include a
# Windows bitsandbytes wheel and CUDA 12.1 builds of torch/torchaudio/torchvision.
# %pip install -U accelerate datasets 
# %pip install -q https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl
# %pip install tokenizers==0.15.0
# %pip install torch==2.1.2+cu121 --index-url https://download.pytorch.org/whl/cu121
# %pip install torchaudio==2.1.2+cu121 --index-url https://download.pytorch.org/whl/cu121
# %pip install torchvision==0.16.2+cu121 --index-url https://download.pytorch.org/whl/cu121
# %pip install transformers==4.35.2
# %pip install ipywidgets


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Using cached transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
Using cached transformers-4.37.2-py3-none-any.whl (8.4 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.37.2
Note: you may need to restart the kernel to use updated packages.


In [168]:
import os
from dataclasses import dataclass, field
from typing import Optional
import json

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer

In [179]:
# Load every record stored under the top-level "data" field of the JSON file
# as a single Dataset, then hold out 10% of the rows for evaluation.
# A fixed seed makes the train/test partition identical on every run, so
# evaluation losses from different runs (or a resumed run) are comparable.
dataset = load_dataset(
    "json",
    data_files="./2011_2023_event_des.json",
    field="data",
    split="all",
)
dataset = dataset.train_test_split(test_size=0.1, seed=42)


Found cached dataset json (C:/Users/danm/.cache/huggingface/datasets/json/default-75cc6199393bf5a9/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [170]:
# Display the DatasetDict to check the split sizes and column names; the
# recorded output shows a single text column named "train" in both splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['train'],
        num_rows: 717166
    })
    test: Dataset({
        features: ['train'],
        num_rows: 79686
    })
})

In [183]:
# Load the tokenizer from the local "tokenizer_merged" directory rather than
# from the base model folder. trust_remote_code permits custom tokenizer code
# shipped alongside it to run.
tokenizer = AutoTokenizer.from_pretrained(
    "../models/phi-2-mlb/tokenizer_merged",
    trust_remote_code=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [184]:
# Use "<PAD>" as the padding token for batching.
# NOTE(review): the add_tokens call below is disabled, so this presumably
# relies on "<PAD>" already being part of the loaded tokenizer's vocabulary -
# confirm before training.
# tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
# Disabled: would have replaced the EOS token with "<|im_end|>".
# tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

In [185]:
# 4-bit NF4 quantization settings for loading the model with bitsandbytes.
# The compute dtype is bfloat16 so that the de-quantized matmuls run in the
# same precision the trainer uses (TrainingArguments(bf16=True) further down);
# mixing a float16 compute dtype with bf16 autocast forces extra casts and
# brings back float16's narrow dynamic range.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_use_double_quant=False,  # no second quantization pass over the quantization constants
)

In [189]:
# Load the base model from the local "../models/phi-2" directory, quantized
# according to bnb_config and placed on the available devices automatically.
# The deprecated `use_auth_token` argument is intentionally not passed: the
# checkpoint is read from a local path, so no Hub credential is involved, and
# newer transformers releases warn about (and eventually drop) that keyword.
model = AutoModelForCausalLM.from_pretrained(
    "../models/phi-2",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
# Keep the model's end-of-sequence id in sync with the tokenizer loaded above.
model.config.eos_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [190]:
# Resize the embedding matrix to the tokenizer's current length so every
# token id has a row; the recorded output shows the result, Embedding(60359, 2560).
model.resize_token_embeddings(len(tokenizer))

Embedding(60359, 2560)

In [191]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized model for k-bit training, with gradient checkpointing
# turned on.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# LoRA adapters (rank 32, alpha 32) on the attention projections and the
# "dense" layers. lm_head and embed_tokens are listed in modules_to_save, so
# they are saved together with the adapter.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    modules_to_save=["lm_head", "embed_tokens"],
)
model = get_peft_model(model, lora_config)

# Turn the generation KV cache off for training.
model.config.use_cache = False

In [192]:
# Alternative LoRA configuration: rank 32, alpha 64, and the MLP layers
# (fc1/fc2) targeted in addition to the attention projections.
# NOTE(review): no visible cell references `peft_config`; the trainer cell
# passes `lora_config` instead. Confirm whether this is still needed.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "dense",
        "fc1",
        "fc2",
    ],
)

In [193]:
# Pick the device string: CUDA when a GPU is visible to torch, otherwise CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"

In [17]:
# When more than one CUDA device is visible, mark the model as already
# model-parallel.
# NOTE(review): presumably this stops the Trainer from wrapping the model in
# DataParallel when it is sharded by device_map="auto" - confirm.
if torch.cuda.device_count() > 1: # more than one GPU visible
    model.is_parallelizable = True
    model.model_parallel = True

In [194]:
from transformers import TrainingArguments, Trainer

# dataset-specific parameters
bs = 8          # per-device batch size for training
bs_eval = 16    # per-device batch size for evaluation
ga_steps = 16   # gradient accumulation steps
lr = 0.00002    # learning rate
epochs = 1

# Optimizer steps in one pass over the training split (single-device estimate:
# the number of devices is not taken into account).
steps_per_epoch = len(dataset["train"]) // (bs * ga_steps)

# Evaluation / checkpoint intervals in optimizer steps. Clamped to at least 1
# so that a small dataset (steps_per_epoch < 2 or < 100) cannot produce a
# zero interval, which the Trainer cannot use.
eval_interval = max(1, steps_per_epoch // 2)      # about 2 evaluations per epoch
save_interval = max(1, steps_per_epoch // 100)    # about 100 checkpoints per epoch

args = TrainingArguments(
    output_dir="../models/phi-2-mlb",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs_eval,
    evaluation_strategy="steps",
    logging_steps=1,                # log the training loss at every optimizer step
    eval_steps=eval_interval,
    save_steps=save_interval,
    save_total_limit=3,             # keep only the 3 most recent checkpoints on disk
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",      # val_loss will go nan with paged_adamw_8bit
    learning_rate=lr,
    group_by_length=False,
    bf16=True,
    ddp_find_unused_parameters=False,
)



In [195]:
# Supervised fine-tuning trainer over the text column "train", without
# packing and with sequences capped at 2048 tokens.
# NOTE(review): `model` was already wrapped with get_peft_model(model, lora_config)
# in an earlier cell, so passing peft_config here looks redundant - confirm
# how the installed trl version treats an already-wrapped model.
trainer = SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    peft_config=lora_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="train",
    max_seq_length=2048,
    packing=False,
)

Map:   0%|          | 0/717166 [00:00<?, ? examples/s]

Map:   0%|          | 0/79686 [00:00<?, ? examples/s]

In [196]:
# Run fine-tuning. The recorded progress bar shows 5602 optimizer steps in
# total, and the training loss is logged at every step (logging_steps=1).
trainer.train()

  0%|          | 0/5602 [00:00<?, ?it/s]

{'loss': 5.8374, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.8606, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.7135, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.6986, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.6031, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.6055, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.5103, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.4549, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.4295, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.3531, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.3036, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.2769, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.1592, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.142, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.1154, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 4.9963, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 5.0159, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 4.9626, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 4.



{'loss': 2.9528, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.8366, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.7526, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.8025, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.7565, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.684, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.6601, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.5837, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.5416, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.4728, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.4205, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.4418, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.3281, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.3345, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.2849, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.2055, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.2384, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 2.1677, 'learning_rate': 2e-05, 'epoch':



{'loss': 1.4622, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.4378, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.4544, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.4402, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.4213, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.3582, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.4258, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.3537, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.3851, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.365, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.3562, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.3772, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.3262, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.341, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.3162, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.3055, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.2941, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 1.2623, 'learning_rate': 2e-05, 'epoch': 



{'loss': 1.1267, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.145, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.1173, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.0815, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.1049, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.1107, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.0863, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.1001, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.1014, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.1084, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.0647, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.0581, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.038, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.0561, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.1022, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.0859, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.0389, 'learning_rate': 2e-05, 'epoch': 0.03}
{'loss': 1.0648, 'learning_rate': 2e-05, 'epoch': 



{'loss': 0.9589, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9551, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9303, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9766, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.975, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9192, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.93, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9517, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9025, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9212, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9775, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.962, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9339, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.927, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9266, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.936, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9065, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.9104, 'learning_rate': 2e-05, 'epoch': 0.04



{'loss': 0.856, 'learning_rate': 2e-05, 'epoch': 0.05}
{'loss': 0.8727, 'learning_rate': 2e-05, 'epoch': 0.05}
{'loss': 0.8636, 'learning_rate': 2e-05, 'epoch': 0.05}
{'loss': 0.8451, 'learning_rate': 2e-05, 'epoch': 0.05}
{'loss': 0.8975, 'learning_rate': 2e-05, 'epoch': 0.05}
{'loss': 0.8592, 'learning_rate': 2e-05, 'epoch': 0.05}
{'loss': 0.8466, 'learning_rate': 2e-05, 'epoch': 0.05}


In [77]:
# Save the trainer's model into the run directory (the same folder that is
# used as TrainingArguments.output_dir).
# NOTE(review): a later cell expects adapter_model.safetensors in this folder,
# so presumably this writes the LoRA adapter rather than full weights - confirm.
trainer.save_model("../models/phi-2-mlb/")

In [None]:
# Disabled: resume an interrupted run from a saved checkpoint instead of
# starting over. Update the checkpoint folder name before re-enabling.
# trainer.train(resume_from_checkpoint="../models/phi-2-mlb/checkpoint-1344")

In [148]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel
import torch

# base model
base_path="../models/phi-2"


# adapters: path to folder with adapter_model.safetensors
adapter_path="../models/phi-2-mlb/"

# where to save merged model
# NOTE(review): this is the same folder as adapter_path, so the merged weights
# end up next to the adapter files and training checkpoints - consider a
# dedicated output folder.
save_to="../models/phi-2-mlb/"

# Default sampling settings stored alongside the merged model.
generation_config = GenerationConfig(
    max_new_tokens=100,
    temperature=0.7,
    top_p=0.1,
    top_k=40,
    repetition_penalty=1.18,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Reload the base model in half precision for merging. The in-memory `model`
# is 4-bit quantized and already wrapped by PEFT, so loading the adapter on
# top of it would nest a second adapter wrapper and fold the LoRA deltas into
# quantized weights instead of producing a clean full-precision checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    base_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
# The adapter was trained against embeddings resized to the tokenizer length;
# the base model must have the same shape before the adapter weights load.
base_model.resize_token_embeddings(len(tokenizer))

# Load LoRA and merge
merged_model = PeftModel.from_pretrained(base_model, adapter_path)
merged_model = merged_model.merge_and_unload()

# Persist the merged weights, tokenizer and generation defaults together.
merged_model.save_pretrained(save_to, safe_serialization=True, max_shard_size='4GB')
tokenizer.save_pretrained(save_to)
generation_config.save_pretrained(save_to)



In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel
import torch

model_path="../models/phi-2-mlb/"

# Load the saved model from disk, quantized with the shared bnb_config.
# `use_auth_token` is not passed: it is deprecated and the path is local.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model.config.eos_token_id = tokenizer.eos_token_id

prompt="Instruct: {\"input\": {\"pitcher\": {\"id\": 460024, \"name\": \"luke hochevar\"}, \"batter\": {\"id\": 110029, \"name\": \"bobby abreu\"}, \"p_throws\": \"R\", \"stand\": \"L\", \"inning_topbot\": \"Top\", \"inning\": 1, \"outs_when_up\": 1, \"on_1b\": \"\", \"on_2b\": {\"id\": 435062, \"name\": \"howie kendrick\"}, \"on_3b\": \"\", \"home_score\": 0, \"away_score\": 0}}? \n"

# Tokenize on the same device as the model instead of assuming "cuda".
input_tokens = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_length = input_tokens["input_ids"].shape[1]

# Generate with the model loaded in this cell, so the cell tests what was
# actually saved to disk rather than a `merged_model` object left over from
# an earlier cell.
output_tokens = model.generate(**input_tokens, max_new_tokens=512)

# Decode only the newly generated tokens (everything after the prompt).
output = tokenizer.decode(
    output_tokens[0][prompt_length:],
    skip_special_tokens=True,
)

print(output)