Imports

In [1]:
import os
# Hide warnings
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import warnings
import torch
from keras import backend as K
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset,Dataset
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
import bitsandbytes as bnb
import transformers

torch.cuda.empty_cache() # Emptying CUDA cache in order to free important space
torch.cuda.is_available() # Checking CUDA status

K.clear_session() # Same but for Keras

2024-06-13 13:36:58.084912: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


Preconfigs

In [2]:
transformers.set_seed(123)
pd.set_option('display.max_colwidth', None)
warnings.filterwarnings('ignore')

Fine-tuning

-Data loading

In [3]:
# Data Loading
data_path='./model/dataset/databricks-dolly-15k.jsonl'
raw_data= pd.read_json(data_path, lines=True)
raw_data.head(2)

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.[3] It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.[4]","Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.",closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification


Templating

In [4]:
def format_dolly(sample):
    instruction = f"### Instruction\n{sample['instruction']}"
    # add context if it is available
    context = f"### Context\n{sample['context']}" if len(sample["context"]) > 0 else None
    response = f"### Answer\n{sample['response']}"
    # join all the parts together
    prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None])
    return prompt

Testing format

In [5]:
from random import randrange
random_idx = randrange(len(raw_data))
print(format_dolly(raw_data.iloc[random_idx]))

### Instruction
What is a Ferocactus

### Context
Ferocactus is a genus of large barrel-shaped cacti, mostly with large spines and small flowers. There are about 30 species included in the genus. They are found in the southwestern United States and northwestern Mexico.

### Answer
Ferocactus is a genus of large barrel-shaped cacti, mostly with large spines and small flowers. There are about 30 species included in the genus. They are found in the southwestern United States and northwestern Mexico.


Applying template

In [6]:
data = pd.DataFrame(raw_data.apply(format_dolly, axis=1), columns=["text"])
data.head()

Unnamed: 0,text
0,"### Instruction\nWhen did Virgin Australia start operating?\n\n### Context\nVirgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.[3] It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.[4]\n\n### Answer\nVirgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route."
1,### Instruction\nWhich is a species of fish? Tope or Rope\n\n### Answer\nTope
2,### Instruction\nWhy can camels survive for long without water?\n\n### Answer\nCamels use the fat in their humps to keep them filled with energy and hydration for long periods of time.
3,"### Instruction\nAlice's parents have three daughters: Amy, Jessy, and what’s the name of the third daughter?\n\n### Answer\nThe name of the third daughter is Alice"
4,"### Instruction\nWhen was Tomoaki Komorida born?\n\n### Context\nKomorida was born in Kumamoto Prefecture on July 10, 1981. After graduating from high school, he joined the J1 League club Avispa Fukuoka in 2000. Although he debuted as a midfielder in 2001, he did not play much and the club was relegated to the J2 League at the end of the 2001 season. In 2002, he moved to the J2 club Oita Trinita. He became a regular player as a defensive midfielder and the club won the championship in 2002 and was promoted in 2003. He played many matches until 2005. In September 2005, he moved to the J2 club Montedio Yamagata. In 2006, he moved to the J2 club Vissel Kobe. Although he became a regular player as a defensive midfielder, his gradually was played less during the summer. In 2007, he moved to the Japan Football League club Rosso Kumamoto (later Roasso Kumamoto) based in his local region. He played as a regular player and the club was promoted to J2 in 2008. Although he did not play as much, he still played in many matches. In 2010, he moved to Indonesia and joined Persela Lamongan. In July 2010, he returned to Japan and joined the J2 club Giravanz Kitakyushu. He played often as a defensive midfielder and center back until 2012 when he retired.[1]\n\n### Answer\nTomoaki Komorida was born on July 10,1981."


Creating training data

In [7]:
train_data = Dataset.from_pandas(data)
len(train_data)

15015

--Fine-tuning

LoRA

In [8]:
lora_config=LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

Model loading using BitsAndBytes

In [9]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))


True
0
NVIDIA GeForce GTX 1080 Ti


In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_path='./model/Gemma-11-2b-it/'

tokenizer = AutoTokenizer.from_pretrained(model_path, device_map='auto')
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map= "auto")#low_cpu_mem_usage = True,

# KV cache is useless during training(Finetune), It only works for inference.
model.config.use_cache = False # enable this in inference mode
model.config.pretraining_tp = 1  

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]


Inference before fine-tuning

In [11]:
instruction="What should I do on a trip to Europe?"
text = f"### Instruction\n{instruction}\n\n### Answer\n"
device = "cuda"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

### Instruction
What should I do on a trip to Europe?

### Answer
**1. Research and Planning:**

- Determine your interests and desired destinations.
- Research and compare travel options, including flights, accommodation, and activities.
- Create a detailed itinerary that includes must-see sights, must-do experiences,


Clearing cuda cache

In [12]:
torch.cuda.empty_cache()
import gc
gc.collect()

20

Training args

In [13]:
training_args = TrainingArguments(
    output_dir= "./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps= 4,
    learning_rate= 2e-4,
    fp16=True,
    num_train_epochs= 1,
    max_steps= 300,
    optim="paged_adamw_8bit",
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
)

Map: 100%|██████████| 15015/15015 [00:01<00:00, 10400.68 examples/s]
max_steps is given, it will override any value given in num_train_epochs


Training

In [14]:
trainer.train()

100%|██████████| 300/300 [14:48<00:00,  2.96s/it]

{'train_runtime': 888.4931, 'train_samples_per_second': 1.351, 'train_steps_per_second': 0.338, 'train_loss': 2.0789699300130207, 'epoch': 0.08}





TrainOutput(global_step=300, training_loss=2.0789699300130207, metrics={'train_runtime': 888.4931, 'train_samples_per_second': 1.351, 'train_steps_per_second': 0.338, 'total_flos': 2407690597048320.0, 'train_loss': 2.0789699300130207, 'epoch': 0.07992007992007992})

Model saving

In [15]:
trainer.save_model('Gemma-1.1-2b-instruct-dollyfinetuned')

Model merging

In [16]:
merge_model = PeftModel.from_pretrained(model,'Gemma-1.1-2b-instruct-dollyfinetuned')
tuned_model = merge_model.merge_and_unload()

model_dir = "./'Gemma-1.1-2b-instruct-dollyfinetuned'"
tuned_model.save_pretrained(model_dir, safe_serialization=True)
tokenizer.save_pretrained(model_dir)

("./'Gemma-1.1-2b-instruct-dollyfinetuned'/tokenizer_config.json",
 "./'Gemma-1.1-2b-instruct-dollyfinetuned'/special_tokens_map.json",
 "./'Gemma-1.1-2b-instruct-dollyfinetuned'/tokenizer.model",
 "./'Gemma-1.1-2b-instruct-dollyfinetuned'/added_tokens.json",
 "./'Gemma-1.1-2b-instruct-dollyfinetuned'/tokenizer.json")