In [None]:
%pip install einops
%pip install peft
%pip install trl
%pip install tensorboard
%pip install -U transformers
%pip install -U accelerate datasets 
%pip install -q https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl
%pip install tokenizers==0.15.0
%pip install torch==2.1.2+cu121 --index-url https://download.pytorch.org/whl/cu121
%pip install torchaudio==2.1.2+cu121 --index-url https://download.pytorch.org/whl/cu121
%pip install torchvision==0.16.2+cu121 --index-url https://download.pytorch.org/whl/cu121
%pip install transformers==4.35.2
%pip install ipywidgets


In [1]:
import os
from dataclasses import dataclass, field
from typing import Optional
import json

import torch
from datasets import load_dataset,DatasetDict
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from tqdm.notebook import tqdm
from trl import SFTTrainer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [2]:
train_dataset = load_dataset("json", data_files='../data/2011_2023_train_test_valuation.json',field="train", split='all')
eval_dataset = load_dataset("json", data_files='../data/2011_2023_train_test_valuation.json', field="valuation", split='all') 
test_dataset = load_dataset("json", data_files='../data/2011_2023_train_test_valuation.json', field="test", split='all')

dataset = DatasetDict(
    {
        "train":train_dataset,
        "validation":eval_dataset,
        "test":test_dataset
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input', 'output'],
        num_rows: 188478
    })
    validation: Dataset({
        features: ['text', 'input', 'output'],
        num_rows: 12
    })
    test: Dataset({
        features: ['text', 'input', 'output'],
        num_rows: 1
    })
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained("../models/phi-2", trust_remote_code=True)
## Add Special Tokens
tokenizer.add_tokens(["[INST]", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="</s>"))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=False,
)

In [6]:
model = AutoModelForCausalLM.from_pretrained(
        "../models/phi-2", 
        quantization_config=bnb_config, 
        device_map = 'auto',
        trust_remote_code=True,
        use_auth_token=True,
    )
model.config.eos_token_id = tokenizer.eos_token_id



bin e:\src\transformer-sketchbook\.venv\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [7]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) 

lora_config = LoraConfig(
    r=32, 
    lora_alpha=32, 
    target_modules = [ "q_proj", "k_proj", "v_proj", "dense" ],
    modules_to_save = ["lm_head", "embed_tokens"],
    lora_dropout=0.1, 
    bias="none", 
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

model.config.use_cache = False

In [8]:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

In [17]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [9]:
# dataset-specific parameters
bs=8    # batch size for training
bs_eval=16    # batch size for evaluation
ga_steps=16  # gradient accumulation steps
lr=0.00002  # learning rate
epochs=1

steps_per_epoch=len(dataset["train"])//(bs*ga_steps)

args = TrainingArguments(
    output_dir="../models/phi-2-mlb",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs_eval,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=steps_per_epoch//2,    # 2 evals per epoch
    save_steps=steps_per_epoch//100,
    save_total_limit=3,     # save once per epoch
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",      # val_loss will go nan with paged_adamw_8bit
    learning_rate=lr,
    group_by_length=False,
    bf16=True,        
    ddp_find_unused_parameters=False,
)

In [10]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=args,
)



Map:   0%|          | 0/188478 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
trainer.train(resume_from_checkpoint="../models/phi-2-mlb/checkpoint-1344")

In [11]:
trainer.train()

  0%|          | 0/1472 [00:00<?, ?it/s]



{'loss': 2.1484, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 2.1453, 'learning_rate': 2e-05, 'epoch': 0.0}


KeyboardInterrupt: 

In [198]:
trainer.save_model("../models/phi-2-mlb/")

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel
import torch

# base model
base_path="../models/phi-2"  

# adapters: path to folder with adapter_model.safetensors
adapter_path="../models/phi-2-mlb/" 

# # where to save merged model
save_to="../models/phi-2-mlb-combined"       


tokenizer = AutoTokenizer.from_pretrained(base_path, trust_remote_code=True)
## Add Special Tokens
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

model = AutoModelForCausalLM.from_pretrained(
        base_path, 
        quantization_config=bnb_config, 
        device_map = 'auto',
        trust_remote_code=True,
        use_auth_token=True,
    )

model.config.eos_token_id = tokenizer.eos_token_id


generation_config = GenerationConfig(
    max_new_tokens=100, 
    temperature=0.7,
    top_p=0.1,
    top_k=40,
    repetition_penalty=1.18,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Load LoRA and merge
merged_model = PeftModel.from_pretrained(model, adapter_path)
merged_model = merged_model.merge_and_unload()

merged_model.save_pretrained(save_to, safe_serialization=True, max_shard_size='4GB')
tokenizer.save_pretrained(save_to)
generation_config.save_pretrained(save_to)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


bin e:\src\transformer-sketchbook\.venv\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


ValueError: The model is quantized with bitsandbytes and is not serializable - check out the warnings from the logger on the traceback to understand the reason why the quantized model is not serializable.

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel
import torch

# base model
base_path="../models/phi-2"  

# adapters: path to folder with adapter_model.safetensors
adapter_path="../models/phi-2-mlb/checkpoint-13100" 
      
# # Load model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained(
    base_path,
    torch_dtype=torch.bfloat16,
).to(device)


if torch.cuda.device_count() > 1: # If more than 1 GPU
    base_model.is_parallelizable = False
    base_model.model_parallel = False

tokenizer = AutoTokenizer.from_pretrained(base_path)

# # Add/set tokens same tokens to base model before merging, like we did before training  
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

base_model.config.eos_token_id = tokenizer.eos_token_id

# Load LoRA and merge
merged_model = PeftModel.from_pretrained(base_model, adapter_path).to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


bin e:\src\transformer-sketchbook\.venv\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [4]:
from datasets import load_dataset,DatasetDict
dataset = load_dataset("json", data_files='../data/2011_2023_phi-2_struct_encoded.json',field="test", split='all')
# dataset = eval_dataset[-1000:]


In [5]:
temp_set = []
valid_set = []
for i in dataset['text']:
   split_input = i.split("Output: ")
   temp = split_input[0]
   temp_set.append(f"{temp}Output: ")
   try:
      valid = json.loads(split_input[-1])
      valid_set.append(valid['event'])
   except:
      valid = {}
      valid_set.append("")
      pass
   

test_set = temp_set

In [7]:
test_set[0]

'Instruct: {"pitcher": {"id": 444836, "name": "aaron laffey"}, "batter": {"id": 475582, "name": "ryan zimmerman"}, "at_bat_number": 53, "p_throws": "L", "stand": "R", "inning_topbot": "Top", "inning": 7, "outs_when_up": 2, "on_1b": {"id": "", "name": ""}, "on_2b": {"id": "", "name": ""}, "on_3b": {"id": "", "name": ""}, "home_score": 2, "away_score": 5, "pitch_number": 1, "if_fielding_alignment": "", "of_fielding_alignment": ""}. \n Output: '

In [6]:
    
input_tokens = tokenizer(test_set[0], return_tensors="pt").to(device)
output_tokens = merged_model.generate(**input_tokens, max_new_tokens=512)

output = tokenizer.decode(
    output_tokens[0][len(input_tokens[0]):],
    skip_special_tokens=True
    )               

print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 {"event": "field_out", "event_description": "hit_into_play", "pitch_name": "Sinker", "description": "Ryan Zimmerman grounds out, second baseman Aaron Miles to first baseman James Loney.", "runs": 0, "at_bat": ["hit_into_play"], "release_speeds": [86.8], "pitch_names": ["Sinker"]}

