Checking if CUDA is enabled first.

In [223]:
# Check if cuda is enabled 
import torch
import os
import tensorflow as tf
torch.cuda.empty_cache()   # Emptying Cuda cache in order to free important space
os.environ["KERAS_BACKEND"]="tensorflow" # options 'torch' / 'jax' / 'tensorflow'
torch.cuda.is_available()

True

Dataset for fine-tuning<br>
Dataset type: Instruction - Response

In [224]:
# Loading dataset
import json
data=[]
datasetlocation='./model/dataset/databricks-dolly-15k.jsonl'    # Change accordingly, might want to change function below as well
with open(datasetlocation) as file:
    for line in file:
        features=json.loads(line)
        # if features['context']:    # Uncomment to skip 'context' column for simplicity
        #    continue
        template="Instruction:\n{instruction}\n\nResponse:\n{response}" 
        data.append(template.format(**features))

# Alternative to above
# data='databricks/databricks-dolly-15k' or something else
# from datasets import load_dataset
# dataset=load_dataset(data)
########################################################

# Need permission from Gemma first (form completion)
# Loading Huggingface Token and using login function

with open("token.json", "r") as json_file:
    token_dict = json.load(json_file)

access_token = token_dict["token"]
from huggingface_hub import login
login(token=access_token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/devy/.var/app/com.visualstudio.code/cache/huggingface/token
Login successful


Using Google's Gemma 2B from HuggingFace


In [225]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 
core='google/gemma-2b'   # Model name, theoretically code should work for a different model as long as it is not too different from gemma-2b
cache_dir='./model/gemma'# Cache location where the model will be stored
#

# Gemma chat/prompt template: 
# <bos><start_of_turn>user
# Message <end_of_turn>
# <start_of_turn>model

PYTORCH_CUDA_ALLOC_CONF=expandable_segments=True

# BitsAndBytesConfig 4bit, alternative: 8-bit
bnb_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4', 
    bnb_4bit_compute_dtype=torch.bfloat16
    )
# Model to be used
model = AutoModelForCausalLM.from_pretrained(
    core,
    cache_dir=cache_dir,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
# Tokenizer 
tokenizer = AutoTokenizer.from_pretrained(core,cache_dir=cache_dir)
tokenizer.padding_side='right' # to prevent warnings

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]


In [226]:
import tkinter as tk
def send_message(event=None):
    message = entry.get()
    if message.lower() == 'exit':
        root.quit()
        return
    # Generate response
    inputs = tokenizer(message, return_tensors="pt")
    outputs = model.generate(inputs.input_ids, max_length=500, do_sample=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Display message and response
    chat_area.insert(tk.END, f"You: {message}\n")
    chat_area.insert(tk.END, f"Bot: {response}\n\n")
    entry.delete(0, tk.END)

# Create GUI
root = tk.Tk()
root.title("Chatbot")

chat_area = tk.Text(root, width=50, height=20)
scrollbar = tk.Scrollbar(root, command=chat_area.yview)
chat_area.configure(yscrollcommand=scrollbar.set)
entry = tk.Entry(root, width=50)
entry.bind("<Return>", send_message)

chat_area.pack(side=tk.TOP, fill=tk.BOTH, padx=5, pady=5)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
entry.pack(side=tk.BOTTOM, fill=tk.X, padx=5, pady=5)

entry.focus_set()

root.mainloop()



Fine-Tuning<br>
Option 1: LoRA - Low rank adaptation

In [227]:
from peft import LoraConfig, get_peft_model

# LoRA config
lora_config = LoraConfig(
    lora_alpha= 32,
    lora_dropout=0.05,
    # bias=
    r=64,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"], # linear layers
    task_type="CAUSAL_LM",
)
model=get_peft_model(model,lora_config)

In [228]:
trainable, total =model.get_nb_trainable_parameters()
print(f"Trainable parameters: {trainable} | Total parameters: {total} | Percentage: {trainable/total*100:.4f}%" )

Trainable parameters: 78446592 | Total parameters: 2584619008 | Percentage: 3.0351%


Training arguments

In [229]:
from transformers import TrainingArguments
args = TrainingArguments(
    output_dir='Gemma-2B-Dolly-FT',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    warmup_steps=2,
    optim='adamw_torch_fused',
    logging_steps=1,
    save_strategy='epoch',
    bf16=False, # current gpu does not support this
    tf32=False, # current gpu does not support this either
    fp16=True,
    learning_rate=2e-4,                                     # Based on QLoRA paper
    max_grad_norm=0.3,                                      # Based on QLoRA paper
    warmup_ratio=0.03,                                      # Based on QLoRA paper
    lr_scheduler_type='constant',
    push_to_hub=False,
    report_to='tensorboard',
    )

In [230]:
from trl import SFTTrainer
import transformers

def formatting_func(example):
    output_texts = []
    for i in range(len(example)):
        text = f"Instruction: {example['Instruction'][i]}\nResponse: {example['response'][i]}"
        output_texts.append(text)
    return output_texts


trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt"
    peft_config=lora_config,
    formatting_func=formatting_func,
    tokenizer=tokenizer,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token":False,
    }
)

# Training the model
trainer.train()

# .. and then saving it
trainer.save_model(core+'/Gemma2bFT')

SyntaxError: invalid syntax (2352635195.py, line 18)

Response

In [None]:
text = "Hello, how are you?"
input = tokenizer(text, return_tensors="pt").to("cuda")

outputs = model.generate(**input)
print(tokenizer.decode(outputs[0],skip_special_tokens=True))

Alternative Response 

In [None]:
%%script echo skipping
prompt=template.format(
    instruction="Can you introduce yourself?",
    response="",
)
print(model.generate(prompt,max_length=256))