In [None]:
!pip install -qqq -U wandb --progress-bar off
import wandb
from huggingface_hub import login
from google.colab import userdata

# Log in to the Hugging Face Hub with the value stored under 'HF_TOKEN' in google.colab userdata.
login(userdata.get('HF_TOKEN'))

# Log in to Weights & Biases with the value stored under 'wandb' in google.colab userdata.
wb_token = userdata.get('wandb')
wandb.login(key=wb_token)

In [None]:
!pip install -q -U git+https://github.com/huggingface/transformers.git --progress-bar off
#!pip install -q transformers==4.29.2 --progress-bar off
!pip install accelerate --progress-bar off
!pip install datasets evaluate --progress-bar off
!pip install -q -U bitsandbytes --progress-bar off
!pip install -q -U git+https://github.com/huggingface/peft.git --progress-bar off

In [None]:
from datasets import load_dataset

# Load the "enriquesaou/mrqa-squadded-sample" dataset through datasets.load_dataset.
mrqa = load_dataset("enriquesaou/mrqa-squadded-sample")

In [None]:
# Show the loaded dataset object as this cell's output.
mrqa

# Training

In [None]:
# Identifier of the base checkpoint; passed to both the model and the tokenizer loaders.
base_model_id = "microsoft/Phi-3-mini-4k-instruct"

In [None]:
from transformers import AutoModelForCausalLM
import torch
from transformers import BitsAndBytesConfig

# Quantization settings handed to from_pretrained below.
# An alternative 4-bit NF4 configuration, currently disabled, kept for reference:
#
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type='nf4',
#       bnb_4bit_compute_dtype='float16',
#       bnb_4bit_use_double_quant=False,
#   )
#
# Active configuration: 8-bit loading.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

# Load the base causal LM with the quantization config above and float16 as the
# requested torch dtype. trust_remote_code=True permits custom code shipped in
# the model repository to be executed while loading.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

In [None]:
#print(model)

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    add_eos_token=True,
    # Left padding; rationale discussed at
    # https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa
    padding_side='left',
)

# Reuse the EOS token as the padding token, following
# https://kaitchup.substack.com/p/phi-2-a-small-model-easy-to-fine
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def format_cqa(context, question):
    """Build the extractive-QA prompt for one context/question pair.

    Args:
        context: Passage text inserted after "Context: ".
        question: Question text inserted after "Question: ".

    Returns:
        The prompt string. It ends with "Answer: " so that an answer can be
        appended directly after it.
    """
    pieces = (
        "Answer the question extracting from the context below.\nContext: ",
        context,
        "\nQuestion: ",
        question,
        "\nAnswer: ",
    )
    return "".join(pieces)

In [None]:
def tokenize_prompt(data_point):
    """Tokenize one example as prompt + first answer, without truncation or padding.

    NOTE: a later cell in this notebook defines `tokenize_prompt` again (with
    truncation and padding); once that cell runs, it replaces this version.

    Args:
        data_point: Mapping with 'context', 'question' and 'answers' entries,
            where data_point['answers']['text'] is indexable and its first
            element is used as the answer.

    Returns:
        The tokenizer output with an added "labels" entry holding a copy of
        "input_ids".
    """
    prompt = format_cqa(data_point['context'], data_point['question'])
    answer = data_point['answers']['text'][0]

    encoded = tokenizer(prompt + answer)
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

#tokenized_dataset = mrqa.map(tokenize_prompt,
#                             remove_columns=mrqa['train'].column_names)

In [None]:
#mrqa, tokenized_dataset

In [None]:
# check that tokenization is correct
#untokenized_text = tokenizer.decode(tokenized_dataset['train'][0]['input_ids'], skip_special_tokens=True)
#print(untokenized_text)

In [None]:
import matplotlib.pyplot as plt

def plot_data_lengths(tok_dataset):
    """Print and plot how long the tokenized examples are.

    Prints the number of examples followed by the full list of per-example
    `input_ids` lengths, then shows a 20-bin histogram of those lengths.

    Args:
        tok_dataset: Iterable of examples, each exposing an 'input_ids' sequence.
    """
    token_counts = []
    for example in tok_dataset:
        token_counts.append(len(example['input_ids']))

    print(len(token_counts))
    print(token_counts)

    # Histogram of sequence lengths
    plt.figure(figsize=(10, 6))
    plt.hist(token_counts, bins=20, alpha=0.7, color='blue')
    plt.title('Distribution of Lengths of input_ids')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.show()


In [None]:
#plot_data_lengths(tokenized_dataset['train']), plot_data_lengths(tokenized_dataset['test'])

In [None]:
# Token budget per example: tokenize_prompt below truncates and pads every sequence to this length.
max_length = 1200

def tokenize_prompt(data_point):
    """Tokenize one example as prompt + first answer, fixed to `max_length` tokens.

    The sequence is truncated to the module-level `max_length` and padded up to
    it, so every returned example has the same length.

    NOTE(review): the answer is the last part of the text; if the tokenizer
    truncates on the right, examples longer than `max_length` lose their
    answer -- confirm the tokenizer's truncation side.

    Args:
        data_point: Mapping with 'context', 'question' and 'answers' entries,
            where data_point['answers']['text'] is indexable and its first
            element is used as the answer.

    Returns:
        The tokenizer output with an added "labels" entry holding a copy of
        "input_ids".
    """
    prompt = format_cqa(data_point['context'], data_point['question'])
    answer = data_point['answers']['text'][0]

    encoded = tokenizer(
        prompt + answer,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Apply tokenize_prompt across mrqa and drop the original columns
# (column names are taken from the 'train' split).
tokenized_dataset = mrqa.map(tokenize_prompt,
                             remove_columns=mrqa['train'].column_names)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings for the Phi-3 base model.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Switch the quantized base model to training mode and run PEFT's k-bit
# training preparation on it.
model.train()
model = prepare_model_for_kbit_training(model)

# Wrap the prepared model with the adapter defined above.
model = get_peft_model(model, config)

#model.config.pretraining_tp = 1

In [None]:
# Passed as output_dir to TrainingArguments in the training cell.
my_model_id = "enriquesaou/phi-3-mrqa"

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling


training_arguments = TrainingArguments(
    output_dir=my_model_id,
    # --- batch size and schedule ---
    per_device_train_batch_size=4,  # lower this to reduce memory use
    gradient_accumulation_steps=1,  # per-device effective batch size: 4 * 1
    max_steps=300,
    learning_rate=3e-5,
    warmup_ratio=0.03,
    # --- precision and optimizer ---
    fp16=True,
    optim="paged_adamw_8bit",
    # --- logging, checkpointing and evaluation cadence (in steps) ---
    logging_steps=50,
    save_strategy="steps",
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    # do_eval=True,
    report_to="wandb",
)

# NOTE(review): evaluation reads tokenized_dataset['validation'], while a
# commented-out plotting cell earlier refers to tokenized_dataset['test'] --
# confirm which splits the dataset actually provides.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Turn off the model's generation cache before training to silence warnings.
model.config.use_cache = False
trainer.train()

In [None]:
# Upload the trainer's model to the Hugging Face Hub via Trainer.push_to_hub.
trainer.push_to_hub()