## Model and Tokenizer

In [1]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "microsoft/phi-1_5"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    load_in_8bit=False,
    device_map="auto",
    trust_remote_code=True,
)
model = base_model

Downloading pytorch_model.bin: 100%|██████████| 2.84G/2.84G [00:54<00:00, 51.6MB/s]
Downloading (…)neration_config.json: 100%|██████████| 69.0/69.0 [00:00<00:00, 104kB/s]


## Dataset Preperation

In [11]:
import datasets
from datasets import load_dataset
from src import utils

In [6]:
def generate_prompt(data_point):
  return f"""
<Human>: {data_point["user"]}
<AI>: {data_point["AI"]}
  """.strip()

def generate_and_tokenize_prompt(data_point):
  full_prompt = generate_prompt(data_point)
  tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
  return tokenized_full_prompt

In [23]:
dataset = load_dataset('json', data_files='qa_gpt4.json', split="train")
dataset = dataset.shuffle().map(generate_and_tokenize_prompt)

Map: 100%|██████████| 131/131 [00:00<00:00, 3657.64 examples/s]


In [24]:
print(dataset['train'][0])
print(dataset.keys())

KeyError: "Column train not in the dataset. Current columns in the dataset: ['AI', 'user', 'input_ids', 'attention_mask']"

## Training

In [25]:
OUTPUT_DIR = "/root/hongyu/JupyterNotebooksFinetuning/models/phi1.5"
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    auto_find_batch_size=True,
    num_train_epochs=4,
    learning_rate=2e-4,
    bf16=True,
    save_total_limit=2,
    logging_steps=10,
    output_dir=OUTPUT_DIR,
    save_strategy='epoch'
)

In [26]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [27]:
model.config.use_cache = False

for name, param in model.named_parameters():
    param.requires_grad = True

trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=8, training_loss=145.66378784179688, metrics={'train_runtime': 30.0832, 'train_samples_per_second': 17.418, 'train_steps_per_second': 0.266, 'total_flos': 388885863874560.0, 'train_loss': 145.66378784179688, 'epoch': 3.56})