In [10]:
import sys

# Make the parent directory importable so that the `src.llm.phi` helper used
# in the inference section can be found. The membership check keeps the cell
# idempotent: re-running it no longer stacks duplicate ".." entries on sys.path.
if ".." not in sys.path:
    sys.path.append("..")

## Falcon-7b

## Phi-1.5

### Fine-tuning

#### Model and Tokenizer

In [1]:
import copy
import os

import torch
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

In [2]:
# Checkpoint identifier passed to both `AutoTokenizer.from_pretrained` and
# `AutoModelForCausalLM.from_pretrained` in the following cells.
model_name = "microsoft/phi-1_5"

In [3]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token so that the
# `padding='max_length'` tokenization later in the notebook has a pad token.
# NOTE(review): presumably this tokenizer ships without a pad token -- confirm.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Reference copy of the checkpoint. It is never handed to the Trainer, so the
# inference section can compare it against the fine-tuned weights.
# NOTE(review): trust_remote_code=True lets transformers run model code that
# ships with the checkpoint repository -- confirm the source is trusted.
# NOTE(review): weights are loaded in float16 while the Trainer is later
# configured with fp16=False -- confirm pure half-precision training is intended.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
    # load_in_8bit=True,  # alternative that was tried and left disabled
)

# Independent copy of the weights; this is the instance that gets fine-tuned.
model = copy.deepcopy(base_model)

#### Dataset Preparation

In [5]:
from datasets import load_dataset

In [6]:
# Token length that every prompt is padded or truncated to when tokenized
# (passed as `max_length` to the tokenizer in `generate_and_tokenize_prompt`).
cutoff_len = 256

def generate_prompt(data_point):
  """Render one QA record as a single training prompt.

  Args:
    data_point: Mapping with a "Question" entry and an "Answer" entry.

  Returns:
    The question, a blank line, then "Answer: " followed by the answer, with
    leading and trailing whitespace of the whole text stripped.
  """
  question = data_point["Question"]
  answer = data_point["Answer"]
  prompt = f"{question}\n\nAnswer: {answer}"
  # strip() trims the outer edges of the rendered text, which includes any
  # whitespace at the start of the question or the end of the answer.
  return prompt.strip()


def generate_and_tokenize_prompt(data_point):
  """Build the prompt for one record and tokenize it to a fixed length.

  Args:
    data_point: Record accepted by `generate_prompt`.

  Returns:
    The tokenizer output for the prompt, padded or truncated to `cutoff_len`
    tokens, with the raw prompt text stored under the extra key 'data'.
  """
  prompt_text = generate_prompt(data_point)
  encoded = tokenizer(
    prompt_text,
    max_length=cutoff_len,
    truncation=True,
    padding='max_length',
  )
  # Keep the untokenized prompt next to the token ids for later inspection.
  encoded['data'] = prompt_text
  return encoded

In [7]:
# Fixed seed so the shuffled order -- and therefore which record later cells
# see as `dataset[0]` -- is the same on every run of the notebook.
SHUFFLE_SEED = 42

# Read the QA pairs from the local JSON file as a single "train" split.
raw_dataset = load_dataset('json', data_files='../assets/qa_gpt4.json', split="train")

# Shuffle reproducibly, then add the tokenized prompt columns to every record.
dataset = raw_dataset.shuffle(seed=SHUFFLE_SEED).map(generate_and_tokenize_prompt)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

In [8]:
# Fetch the first record once instead of re-indexing the dataset per line.
sample = dataset[0]

# Columns available after the map step.
print(sample.keys())

# The original cell also contained a bare `dataset[0]['data']` expression in
# the middle of the cell; a non-final expression is silently discarded, so it
# has been removed. Use `print(sample['data'])` to inspect the rendered prompt.

# Number of token ids in the record (expected to equal `cutoff_len`).
len(sample['input_ids'])

dict_keys(['Answer', 'Question', 'input_ids', 'attention_mask', 'data'])


256

#### Training

In [15]:
# Directory for checkpoints. It can be overridden with the FINETUNE_OUTPUT_DIR
# environment variable so the notebook is not tied to one machine's filesystem;
# the previously hard-coded location is kept as the default.
OUTPUT_DIR = os.environ.get(
    "FINETUNE_OUTPUT_DIR",
    "/root/hongyu/JupyterNotebooksFinetuning/models/phi1.5",
)

training_args = transformers.TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Batch sizing: the requested per-device size is a starting point, since
    # auto_find_batch_size lets the Trainer retry with a smaller batch when the
    # requested one does not fit in memory.
    per_device_train_batch_size=64,
    gradient_accumulation_steps=4,
    auto_find_batch_size=True,
    # Schedule and optimizer settings.
    num_train_epochs=1,
    learning_rate=1e-6,  # 2e-5 was the previous setting
    weight_decay=0.1,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
    # warmup_steps=100,
    # optim='adamw_torch',
    # Mixed precision is left off in the Trainer (bf16=True was also considered).
    fp16=False,
    # Logging and checkpointing: log every step, save once per epoch, and keep
    # at most three checkpoints on disk.
    logging_steps=1,
    save_strategy='epoch',
    save_total_limit=3,
)

In [16]:
# Collator for causal language modelling: mlm=False turns off masked-LM
# token masking when batches are assembled.
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Wire the copy of the model, the tokenized dataset and the training
# configuration together.
trainer = transformers.Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=lm_collator,
)

In [17]:
# Disable `use_cache` on the model config before training.
# NOTE(review): presumably done to avoid cache-related warnings/overhead while
# training -- confirm. The flag is not restored before the inference section.
model.config.use_cache = False
# Run fine-tuning; as the last expression of the cell, the returned TrainOutput
# is displayed below the step/loss table.
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


Step,Training Loss
1,2.0066
2,1.9094
3,1.8583
4,1.8155


TrainOutput(global_step=4, training_loss=1.897437334060669, metrics={'train_runtime': 9.338, 'train_samples_per_second': 14.029, 'train_steps_per_second': 0.428, 'total_flos': 258227526696960.0, 'train_loss': 1.897437334060669, 'epoch': 0.94})

### Inference

In [21]:
from src.llm.phi import phi_1_5_inference

In [22]:
# Compare the reference answer, the untouched checkpoint and the fine-tuned
# copy on one record.
# NOTE(review): `dataset` is also the training set passed to the Trainer, so
# this record was seen during fine-tuning -- this is not a held-out evaluation.
sample = dataset[0]
question = sample["Question"]
divider = "#################################"

# Answer from the original checkpoint.
base_output = phi_1_5_inference(base_model, tokenizer, question=question)

print("Question: " + question)
print(divider)
print("GPT-4: " + sample["Answer"])
print(divider)
print("Base Model: " + base_output)

# Answer from the fine-tuned copy, appended to the same report.
ft_output = phi_1_5_inference(model, tokenizer, question=question)
print(divider)
print("Fine-tuned Model: " + ft_output)

Question: Can you tell me about Jordan Grumet's presence in the FIRE world?
#################################
GPT-4: In the Financial Independence, Retire Early (FIRE) community, Jordan Grumet is known as Doc G. He is recognized not only for his work as a hospice doctor but also as a host of the Earn & Invest Podcast that discusses similar financial independence topics.
#################################
Base Model: Jordan Grumet is a renowned expert in the field of fire safety and prevention.
#################################
Fine-tuned Model: Jordan Grumet is a prominent figure in the Financial Independence, Retire Early (FIRE) community.
