In [None]:
%pip install einops
%pip install peft
%pip install trl
%pip install tensorboard
%pip install -U transformers
%pip install -U accelerate datasets 
%pip install -q https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl
%pip install tokenizers==0.15.0
%pip install torch==2.1.2+cu121 --index-url https://download.pytorch.org/whl/cu121
%pip install torchaudio==2.1.2+cu121 --index-url https://download.pytorch.org/whl/cu121
%pip install torchvision==0.16.2+cu121 --index-url https://download.pytorch.org/whl/cu121
%pip install transformers==4.35.2
%pip install ipywidgets


In [4]:
import os
from dataclasses import dataclass, field
from typing import Optional
import json

import torch
from datasets import load_dataset,DatasetDict
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from tqdm.notebook import tqdm
from trl import SFTTrainer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [None]:
train_dataset = load_dataset("json", data_files='./2011_2023_train_test_valuation.json',field="train", split='all')
eval_dataset = load_dataset("json", data_files='./2011_2023_train_test_valuation.json', field="valuation", split='all') 
test_dataset = load_dataset("json", data_files='./2011_2023_train_test_valuation.json', field="test", split='all')

dataset = DatasetDict(
    {
        "train":train_dataset,
        "validation":eval_dataset,
        "test":test_dataset
    }
)

In [170]:
dataset

DatasetDict({
    train: Dataset({
        features: ['train'],
        num_rows: 717166
    })
    test: Dataset({
        features: ['train'],
        num_rows: 79686
    })
})

In [None]:
tokenizer = AutoTokenizer.from_pretrained("../models/phi-2", trust_remote_code=True)
## Add Special Tokens
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

In [185]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=False,
)

In [None]:
model = AutoModelForCausalLM.from_pretrained(
        "../models/phi-2", 
        quantization_config=bnb_config, 
        device_map = 'auto',
        trust_remote_code=True,
        use_auth_token=True,
    )
model.config.eos_token_id = tokenizer.eos_token_id

In [191]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) 

lora_config = LoraConfig(
    r=32, 
    lora_alpha=32, 
    target_modules = [ "q_proj", "k_proj", "v_proj", "dense" ],
    modules_to_save = ["lm_head", "embed_tokens"],
    lora_dropout=0.1, 
    bias="none", 
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

model.config.use_cache = False

In [193]:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

In [17]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [194]:
# dataset-specific parameters
bs=8    # batch size for training
bs_eval=16    # batch size for evaluation
ga_steps=16  # gradient accumulation steps
lr=0.00002  # learning rate
epochs=1

steps_per_epoch=len(dataset["train"])//(bs*ga_steps)

args = TrainingArguments(
    output_dir="../models/phi-2-mlb",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs_eval,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=steps_per_epoch//2,    # 2 evals per epoch
    save_steps=steps_per_epoch//100,
    save_total_limit=3,     # save once per epoch
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",      # val_loss will go nan with paged_adamw_8bit
    learning_rate=lr,
    group_by_length=False,
    bf16=True,        
    ddp_find_unused_parameters=False,
)

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=args,
)

In [None]:
trainer.train(resume_from_checkpoint="../models/phi-2-mlb/checkpoint-1344")

In [None]:
trainer.train()

In [198]:
trainer.save_model("../models/phi-2-mlb/")

In [199]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel
import torch

# base model
base_path="../models/phi-2"  

# adapters: path to folder with adapter_model.safetensors
adapter_path="../models/phi-2-mlb/" 

# # where to save merged model
save_to="../models/phi-2-mlb/"       


tokenizer = AutoTokenizer.from_pretrained(base_path, trust_remote_code=True)
## Add Special Tokens
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

model = AutoModelForCausalLM.from_pretrained(
        base_path, 
        quantization_config=bnb_config, 
        device_map = 'auto',
        trust_remote_code=True,
        use_auth_token=True,
    )

model.config.eos_token_id = tokenizer.eos_token_id


generation_config = GenerationConfig(
    max_new_tokens=100, 
    temperature=0.7,
    top_p=0.1,
    top_k=40,
    repetition_penalty=1.18,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Load LoRA and merge
merged_model = PeftModel.from_pretrained(model, adapter_path)
merged_model = merged_model.merge_and_unload()

merged_model.save_pretrained(save_to, safe_serialization=True, max_shard_size='4GB')
tokenizer.save_pretrained(save_to)
generation_config.save_pretrained(save_to)



In [202]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel
import torch

# model_path="../models/phi-2-mlb/"   

# model = AutoModelForCausalLM.from_pretrained(
#         model_path, 
#         quantization_config=bnb_config, 
#         device_map = 'auto',
#         trust_remote_code=True,
#         use_auth_token=True,
#     )

# model.config.eos_token_id = tokenizer.eos_token_id

prompt= "Instruct: what is the outcome of {\"input\": {\"pitcher\": {\"id\": 573204, \"name\": \"caleb thielbar\"}, \"batter\": {\"id\": 488726, \"name\": \"michael brantley\"}}}? \n"

input_tokens = tokenizer(prompt, return_tensors="pt").to("cuda")
output_tokens = merged_model.generate(**input_tokens, max_new_tokens=512)

output = tokenizer.decode(
    output_tokens[0][len(input_tokens[0]):],
    skip_special_tokens=True
    )               

print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



# The output should be:
# {
#     "input": {
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#             "id": "farth",
#             "name": "farth"
#         },
#         "farth": {
#  