<a href="https://colab.research.google.com/github/fedecanzo/LLM-finetuning/blob/main/NotebookLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning Llama 2

# Training


In [None]:
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
!pip install -q accelerate peft bitsandbytes transformers trl sentencepiece wandb chromadb

In [None]:
# Should not be needed: checkpoints are saved to the Hugging Face Hub
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import time
import torch
import shutil
from trl import SFTTrainer
from google.colab import userdata
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, PeftModel,get_peft_model,prepare_model_for_kbit_training,PeftConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    TrainerCallback,
    pipeline,
    logging,
)

## After installing some libraries the locale gets reset, so force UTF-8
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# CONFIG VARIABLES
# Used both as the Trainer output directory and as the Weights & Biases project name.
OUTPUT_DIR = "Llama2-to-SQL"

# Secrets are read from the Colab `userdata` store.
TOKEN_WANDB = userdata.get('WANDB_TOKEN')
TOKEN_HF    = userdata.get('HF_TOKEN')

# Weights & Biases is configured through environment variables.
os.environ['WANDB_API_KEY']   = TOKEN_WANDB
os.environ["WANDB_PROJECT"]   = OUTPUT_DIR
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [None]:
# Log in to Weights & Biases
!wandb login

In [None]:
def get_prompt_format(type_format):
    """Return the training prompt template for the given prompt family.

    The template contains the placeholders ``{c}`` (schema/context),
    ``{q}`` (question) and ``{a}`` (expected SQL answer).

    Raises:
        Exception: If ``type_format`` is not a known prompt family.
    """
    known_formats = (
        (
            "meta-llama",
            "<s> [INST] <<SYS>> You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA. {c} <</SYS>>. Question is: {q}. [/INST] Answer is: {a} </s>",
        ),
        (
            "chatml",
            "<|im_start|>system You are a helpful bot, your job it to convert input question into its respective SQL command <|im_end|> <|im_start|>user Context: {c} Question: {q} <|im_end|> <|im_start|>assistant {a} ",
        ),
        (
            "mistral",
            "<s> [INST] You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA. {c} Question is: {q}.[/INST] Answer is: {a} </s>",
        ),
    )

    for family, template in known_formats:
        if type_format == family:
            return template

    raise Exception(f"Prompt format not supported: {type_format}")

# UTILITY FUNCTION
def get_prompt_template(c,q,a,type_format):
    """Fill the training template of ``type_format`` with context, question and answer."""
    return get_prompt_format(type_format).format(c=c, q=q, a=a)

# Inference
def get_prompt_template_inference(c,q,type_format):
    """Build the inference prompt (no answer part) for the given prompt family.

    Args:
        c: Schema/context text inserted into the prompt.
        q: Natural-language question.
        type_format: One of ``"meta-llama"``, ``"chatml"`` or ``"mistral"``.

    Returns:
        The prompt string, ending where the model is expected to start generating.

    Raises:
        Exception: If ``type_format`` is not a known prompt family.
    """
    if type_format=="meta-llama":
        return f"<s>[INST] <<SYS>> You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA. {c} <</SYS>>. Question is: {q}. [/INST] "
    elif type_format=="chatml":
        return f"<|im_start|>system You are a helpful bot, your job it to convert input question into its respective SQL command <|im_end|> <|im_start|>user Context: {c} Question: {q} <|im_end|>  "
    elif type_format=="mistral":
        # Mirrors the "mistral" training template up to the end of the instruction,
        # so every family accepted for training can also be used for inference.
        return f"<s> [INST] You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA. {c} Question is: {q}.[/INST] "
    else:
        raise Exception(f"Prompt format not supported: {type_format}")


def generate_inference(model, tokenizer, input_str, device, time_track  ):
    """Run ``model.generate`` on ``input_str`` and return the decoded first sequence.

    When ``time_track`` is truthy the decoded output and the elapsed
    wall-clock time are printed as well.
    """
    started_at = time.time()

    encoded = tokenizer(input_str, return_tensors="pt", padding=True)
    encoded = encoded.to(device)
    generated = model.generate(**encoded)
    first_sequence = generated[0]
    output_str = tokenizer.decode(first_sequence)

    if time_track:
        print(f"Output: {output_str}, Tempo: {time.time()-started_at}")

    return output_str

def forward_inference(model, tokenizer, input_str, device, time_track):
    """Run one forward pass and decode the arg-max token of every position.

    When ``time_track`` is truthy the decoded output and the elapsed
    wall-clock time are printed as well.
    """
    started_at = time.time()

    encoded = tokenizer(input_str, return_tensors="pt").to(device)
    logits = model(**encoded).logits
    # Most likely token id at each position of the first sequence in the batch.
    predicted_ids = logits.argmax(axis=-1)[0]
    output_str = tokenizer.decode(predicted_ids)

    if time_track:
        print(f"Output: {output_str}, Tempo: {time.time()-started_at}")

    return output_str

def print_trainable_parameters(model,str_info):
    """Print the trainable and total parameter counts of ``model``.

    ``str_info`` is a label placed at the start of the printed line.
    """
    # (element count, requires_grad) for every named parameter.
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(f"{str_info} -- Trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" )


In [None]:
def load_model(model_id,device_map,type_quantization):
    """Load a causal language model, optionally quantized, and prepare it for k-bit training.

    Args:
        model_id: Hub id or path passed to ``AutoModelForCausalLM.from_pretrained``.
        device_map: Value forwarded as ``device_map`` when loading the model.
        type_quantization: ``"8bit"``, ``"4bit"`` or ``None`` (no bitsandbytes config).

    Returns:
        The model returned by ``prepare_model_for_kbit_training``.

    Raises:
        ValueError: If ``type_quantization`` is not one of the accepted values.
    """
    if type_quantization is None:
        bnb_config = None
    elif type_quantization == "8bit":
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
        )
    elif type_quantization == "4bit":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    else:
        # ValueError is a subclass of Exception, so existing `except Exception` callers still work.
        raise ValueError(f"4bit o 8bit expected instead receiving: {type_quantization}")

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=device_map,
        token=TOKEN_HF,
        quantization_config=bnb_config,
    )

    model.config.pretraining_tp = 1
    # The generation cache is switched off for training.
    model.config.use_cache = False
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    return model

def load_tokenizer(model_id):
    """Load the tokenizer for ``model_id``, padding on the right with the EOS token."""
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    tok.padding_side = "right"
    # The EOS token doubles as the padding token.
    tok.pad_token = tok.eos_token
    return tok

# Adds the extra (adapter) parameters to the model (original author's note: this is an in-place operation)
def get_trainable_model(model, peft_config):
    """Return ``model`` wrapped by ``get_peft_model`` using ``peft_config``."""
    return get_peft_model(model, peft_config)

DEVICE_MAP="auto"
TYPE_QUANTIZATION="4bit"

# Base checkpoint to fine-tune; previously tried alternatives are kept below.
MODEL_NAME="meta-llama/Llama-2-7b-chat-hf"
# MODEL_NAME="meta-llama/Llama-2-7b-hf"
# MODEL_NAME="teknium/OpenHermes-2.5-Mistral-7B" # behaves strangely
# MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.2"
# MODEL_NAME="codellama/CodeLlama-7b-hf"
# codellama is a fine-tuning of llama2 on code and instructions

model = load_model(MODEL_NAME, DEVICE_MAP, TYPE_QUANTIZATION)
tokenizer_llama = load_tokenizer(MODEL_NAME)
# The trainer, evaluation and pipeline cells below refer to the tokenizer as
# `tokenizer`; bind that name here so a top-to-bottom run does not hit a NameError.
tokenizer = tokenizer_llama

# print_trainable_parameters(model,"Modello normale")

In [None]:
print_trainable_parameters(model, "Modello normale") #0%

In [None]:
def load_dataset_from_hf(dataset_id,test_size=0.1):
    """Load ``dataset_id`` and split its ``train`` split into train/test parts.

    ``test_size`` is forwarded to ``train_test_split``.
    """
    train_split = load_dataset(dataset_id)['train']
    return train_split.train_test_split(test_size=test_size)

def get_dataset_with_prompt(dataset,type_format_prompt):
    """Add a ``text`` column holding the full training prompt to both splits.

    The prompt is built from each row's ``context``, ``question`` and
    ``answer`` columns with the template selected by ``type_format_prompt``.
    """
    def add_text_column(batch):
        # `batch` maps column names to lists because the map below is batched.
        rows = zip(batch["context"], batch['question'], batch['answer'])
        return {'text' : [ get_prompt_template(c,q,a,type_format_prompt) for c,q,a in rows]}

    with_prompt = DatasetDict()
    for split in ('train', 'test'):
        with_prompt[split] = dataset[split].map(add_text_column, batched=True )
    return with_prompt

# FOR TESTING
def get_reduced_train_test_dataset(dataset,reduced_size):
    """Return a smaller train/test ``DatasetDict`` for quick experiments.

    The first ``reduced_size`` training rows and the first
    ``int(reduced_size*0.25)`` test rows are kept, then each split is
    shuffled with a fixed seed. Requested sizes are capped at the number
    of rows available so that ``select`` is never given an index past
    the end of a split.

    Args:
        dataset: Mapping with ``'train'`` and ``'test'`` splits.
        reduced_size: Number of training rows to keep.
    """
    train_rows = min(reduced_size, len(dataset['train']))
    test_rows = min(int(reduced_size*0.25), len(dataset['test']))

    reduced = DatasetDict()
    reduced['train'] = dataset['train'].select(range(train_rows)).shuffle(seed=42)
    reduced['test'] = dataset['test'].select(range(test_rows)).shuffle(seed=42)
    return reduced

# Prompt family used to render the training text for every row.
type_format_prompt="meta-llama"
# Load the text-to-SQL dataset and hold out 10% of it as the test split.
dataset = load_dataset_from_hf("b-mc2/sql-create-context", test_size=0.1)
# Add the 'text' column containing the fully rendered prompt.
dataset = get_dataset_with_prompt(dataset,type_format_prompt)
# Keep only a small subset (1000 train rows, 250 test rows) for quick runs.
dataset = get_reduced_train_test_dataset(dataset,reduced_size=1000)

print(dataset)

In [None]:
print(dataset['train']['text'][0])

In [None]:
def get_lora_config(r, alpha, dropout):
    """Build the ``LoraConfig`` used for fine-tuning.

    ``r``, ``alpha`` and ``dropout`` are forwarded as ``r``, ``lora_alpha``
    and ``lora_dropout``; the target modules, bias mode and task type are fixed.
    """
    adapted_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
    return LoraConfig(
        r=r,
        lora_alpha=alpha,
        lora_dropout=dropout,
        target_modules=adapted_modules,
        bias="none",
        task_type="CAUSAL_LM",
    )

# Values forwarded to LoraConfig (lora_alpha, lora_dropout, r) through get_lora_config.
LORA_ALPHA = 128
LORA_DROPOUT= 0.1
LORA_R = 128
# https://arxiv.org/abs/2106.09685 LoRA paper
peft_config = get_lora_config(LORA_R, LORA_ALPHA, LORA_DROPOUT)

In [None]:
model = get_trainable_model(model,peft_config)
print_trainable_parameters(model,"MODELLO CON PARAMETRI EXTRA")

In [None]:
# Save the adapter files to Google Drive (NO LONGER NEEDED)
class SaveOnGDriveCallback(TrainerCallback):
    """Trainer callback that copies each saved checkpoint directory to Google Drive."""

    def on_save(self, args, state, control, logs=None, **kwargs):
        checkpoint_name = f"checkpoint-{state.global_step}"
        source = f"/content/{OUTPUT_DIR}/{checkpoint_name}"
        destination = f"/content/drive/MyDrive/test-{checkpoint_name}"
        # dirs_exist_ok lets a re-saved step overwrite an earlier copy.
        shutil.copytree(source, destination, dirs_exist_ok=True)

# Test EvalCallback
class EvalTest(TrainerCallback):
    """Debug callback that prints the evaluation dataloader, its attributes and its dataset."""

    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        # print(self,dir(self),kwargs)
        # print(kwargs['model'], kwargs['tokenizer'])
        eval_dataloader = kwargs['eval_dataloader']
        print(eval_dataloader, dir(eval_dataloader), eval_dataloader.dataset)


In [None]:
# Set training parameters
NUM_TRAIN_EPOCHS = 2
PER_DEVICE_TRAIN_BATCH_SIZE = 4
# PER_DEVICE_TRAIN_BATCH_SIZE = 8
# PER_DEVICE_TRAIN_BATCH_SIZE = 32 # Cuda OOM

PER_DEVICE_EVAL_BATCH_SIZE = 4

# With a per-device batch of 4 this gives 4 * 3 = 12 samples per optimizer step on each device.
GRADIENT_ACCUMULATION_STEPS = 3
# OPTIM = "paged_adamw_32bit"
OPTIM = "paged_adamw_8bit"
# OPTIM = "adamw_torch"

# Save a checkpoint every 25 steps.
SAVE_STEPS = 25
SAVE_STRATEGY = "steps"

# Evaluate every 5 steps.
EVAL_STEPS = 5
EVAL_STRATEGY = "steps"

# Log every step.
LOGGING_STEPS = 1

LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.001
# NOTE(review): with a plain "constant" schedule the warmup_ratio passed below
# may have no effect - confirm against the transformers scheduler documentation.
LR_SCHEDULER_TYPE = "constant"
MAX_SEQ_LENGTH = 1024
PACKING = False

training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim=OPTIM,

    save_steps=SAVE_STEPS,
    save_strategy=SAVE_STRATEGY,

    logging_steps=LOGGING_STEPS,

    evaluation_strategy=EVAL_STRATEGY,
    eval_steps=EVAL_STEPS,

    # NOTE(review): the 4-bit config in load_model computes in bfloat16 while
    # training here uses fp16 - confirm this combination is intended.
    fp16=True,

    max_grad_norm=0.3,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    # -1 leaves the run length to num_train_epochs.
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    # Checkpoints are pushed to the Hugging Face Hub and metrics are reported to Weights & Biases.
    hub_strategy = "checkpoint",
    push_to_hub = True,
    report_to="wandb",
    overwrite_output_dir = True,
)
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    # Column that holds the fully rendered prompt for each row.
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    # Use the tokenizer that was loaded together with the model; the bare name
    # `tokenizer` is only bound by a later cell, so it is undefined at this point
    # when the notebook is run top to bottom.
    tokenizer=tokenizer_llama,
    args=training_arguments,
    packing=PACKING,

    neftune_noise_alpha=5,
    # callbacks=[EvalTest],
)

In [None]:
# Longest rendered prompt, in characters, for the train split and then the test split.
for split_name in ('train', 'test'):
    print(max(len(text) for text in dataset[split_name]['text']))

In [None]:
# trainer.train(resume_from_checkpoint=False)
# Resume only when a checkpoint actually exists: resume_from_checkpoint=True raises
# on a first run, when the output directory holds no checkpoint yet.
from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = (
    get_last_checkpoint(training_arguments.output_dir)
    if os.path.isdir(training_arguments.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)


In [None]:

from tqdm import tqdm

# Exact-match evaluation on a random sample of the test split.
model.eval()
number_of_eval_samples = 10
# Never request more rows than the test split contains.
eval_subset_size = min(number_of_eval_samples, len(dataset['test']))
equal=0
infer= []
eval_records = dataset['test'].shuffle().select(range(eval_subset_size))
for record in tqdm(eval_records):
  prompt_infer = get_prompt_template_inference(c=record['context'],q=record['question'],type_format="meta-llama")
  # success_rate.append(evaluate(prompt_infer, trainer.model, tokenizer, "cuda:0"))
  input_str=prompt_infer
  # The tokenizer loaded with the model is `tokenizer_llama`; the bare name
  # `tokenizer` is only bound by a later cell.
  input_tokenized = tokenizer_llama(input_str, return_tensors="pt",padding=True,add_special_tokens=False).to("cuda:0")

  # NOTE(review): do_sample=True makes the measured accuracy vary between runs.
  output_to_decode = model.generate( **input_tokenized, do_sample=True, max_new_tokens = 64 )
  output_str = tokenizer_llama.decode(output_to_decode[0])
  output_label = get_prompt_template(c=record['context'],q=record['question'],type_format="meta-llama", a=record['answer'])
  print("Input: ", input_str ,"\nInference: ", output_str, "\nGroundTruth: ", output_label,"\n######\n" )
  infer.append([output_str,output_label])
  # NOTE(review): this compares the whole decoded sequence (prompt included) with
  # the full training-format string, which is a very strict notion of a match.
  if output_str.strip() == output_label.strip():
    equal+=1

# Divide by the number of samples actually evaluated, not by the size of the whole test split.
print("Accuracy: ", equal/len(infer) if infer else 0.0)

In [None]:
# !pip install sentence_transformers

from sentence_transformers import SentenceTransformer,util

# Load a sentence embedding model
model_sentence = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Two nearly identical queries: they differ in the cartridge value (.375 vs .325)
# and in the trailing "</s>" token of the second one.
a =  """SELECT source FROM table_name_62 WHERE cartridge = ".375 remington ultra magnum" """
b =  """SELECT source FROM table_name_62 WHERE cartridge = ".325 remington ultra magnum" </s>"""

a_vector = model_sentence.encode(a)
b_vector = model_sentence.encode(b)

# Calculate semantic similarity
# similarity = model.similarity(apple_sentence, orange_sentence)
# Print the result of three sentence-transformers utilities for the same pair of embeddings.
print(util.pytorch_cos_sim(a_vector, b_vector))
print(util.cos_sim(a_vector, b_vector))
print(util.semantic_search(a_vector, b_vector))

In [None]:

context="CREATE TABLE table_name_35 (mountain_range VARCHAR, rank VARCHAR)"
question="Which mountain range has a rank of 200?"
# Reuse the prompt family the model was fine-tuned with. The previous hard-coded
# "mistral" value is not handled by get_prompt_template_inference and made this cell raise.
type_format=type_format_prompt
device="cuda:0"

# trainer.model.eval()
#
# `tokenizer_llama` is the tokenizer loaded with the model; the bare name
# `tokenizer` is only bound by a later cell.
print(generate_inference(trainer.model, tokenizer_llama, get_prompt_template_inference(c=context, q=question,type_format=type_format),device,True))
# print(forward_inference(merged_model, tokenizer, get_prompt_template_inference(c=context, q=question),"cuda:0"))

In [None]:
from transformers import pipeline
# Use the tokenizer loaded with the model; the bare name `tokenizer` is only bound by a later cell.
generator=pipeline(task="text-generation",model=model, tokenizer=tokenizer_llama )

In [None]:
# Cleanup: check whether this frees the VRAM (if it does not, the notebook has to be restarted)
import gc
# del trainer
# del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Test: load a dataset from a JSON file and push it to the Hub

from datasets import load_dataset
# Records are read from the "data" field of dataset.json.
datasets = load_dataset("json", data_files="dataset.json", field="data")

# Upload as a private dataset repository.
datasets.push_to_hub("test_dataset-Private",private=True)

In [None]:
# Loads both the base model and the adapter
def load_model_and_adapter_from_hf(base_id, adapter_id):
    """Load the 8-bit base model ``base_id`` and attach the PEFT adapter ``adapter_id``.

    Args:
        base_id: Hub id or path of the base causal language model.
        adapter_id: Hub id or path of the adapter passed to ``PeftModel.from_pretrained``.

    Returns:
        The ``PeftModel`` wrapping the base model.
    """
    # Pass the quantization settings through `quantization_config`, as load_model does,
    # instead of the bare `load_in_8bit` keyword.
    base_model = AutoModelForCausalLM.from_pretrained(
        base_id,
        token=TOKEN_HF,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )
    peft_model = PeftModel.from_pretrained(base_model, adapter_id )
    return peft_model

# Adapter repository to load and the base model it is attached to.
adapter_id="Federic/lora-fine-tuning-llama2-SQL-lora-100-dataset-size"
base_id ="meta-llama/Llama-2-7b-chat-hf"

peft_model=load_model_and_adapter_from_hf(base_id, adapter_id)
# Binds the module-level name `tokenizer` to the base model's tokenizer.
tokenizer = load_tokenizer(base_id)

## Test ChromaDB

In [None]:
!pip install chromadb

In [None]:
import chromadb
chroma_client = chromadb.Client()

In [None]:
collection = chroma_client.create_collection(name="external_documents")

In [None]:
# Example (question, sql, ddl) triples stored in the vector collection.
# The first question no longer carries stray trailing whitespace, so the stored
# document and any prompt built from it stay clean.
records = [
    (
        "How many heads of the departments are older than 56 ?",
        "SELECT COUNT(*) FROM head WHERE age > 56",
        "CREATE TABLE head (age INTEGER)",
    ),
    (
        "List the name, born state and age of the heads of departments ordered by age.",
        "SELECT name, born_state, age FROM head ORDER BY age",
        "CREATE TABLE head (name VARCHAR, born_state VARCHAR, age VARCHAR)",
    ),
    (
        "List the creation year, name and budget of each department.",
        "SELECT creation, name, budget_in_billions FROM department",
        "CREATE TABLE department (creation VARCHAR, name VARCHAR, budget_in_billions VARCHAR)",
    ),
    (
        "What are the maximum and minimum budget of the departments?",
        "SELECT MAX(budget_in_billions), MIN(budget_in_billions) FROM department",
        "CREATE TABLE department (budget_in_billions INTEGER)",
    ),
]

# Parallel lists: position i of each list belongs to the same record.
questions = [question for question, _, _ in records]
sqls = [sql for _, sql, _ in records]
ddls = [ddl for _, _, ddl in records]


In [None]:
# Store questions, DDL statements and SQL queries in the collection. Each document
# is tagged with its source, and ids follow "<source>-<1-based position>" so that
# entries of the same record share the same numeric suffix.
for source_name, source_docs in (("question", questions), ("ddl", ddls), ("sql", sqls)):
    collection.add(
        documents=source_docs,
        metadatas=[{"source": source_name} for _ in source_docs],
        ids=[f"{source_name}-{position}" for position, _ in enumerate(source_docs, start=1)],
    )


In [None]:
# Fetch the n most similar stored questions
def get_relevant_document(prompt,n_results):
    """Return ``(question, sql_documents)`` pairs for the stored questions closest to ``prompt``.

    Only entries tagged with source "question" are searched; the SQL entry of each
    hit is looked up through the numeric suffix shared by its id.
    """
    only_questions = {"source": {"$eq": "question"}}
    res = collection.query(
        query_texts=[prompt],
        n_results=n_results,
        where=only_questions,
    )

    matched_sqls = []
    for question_id in res['ids'][0]:
        # "question-3" -> "sql-3"
        suffix = question_id.split("question-")[1]
        matched_sqls.append(collection.get("sql-" + suffix)['documents'])

    return list(zip(res['documents'][0], matched_sqls))

def get_prev_docs(docs):
    """Render ``(question, answers)`` pairs as one ``Input: ..., Output: ... -- `` string.

    Only the first element of each ``answers`` sequence is used; the result
    always ends with a final ".".
    """
    rendered = [f"Input: {question}, Output: {answers[0]} -- " for question, answers in docs]
    return "".join(rendered) + "."


In [None]:
# collection.delete(collection.get()['ids'])
# collection.get()['ids']
#
#

In [None]:
# Build an inference prompt whose context is enriched with similar stored examples.
type_format="meta-llama"
q="What are the distinct ages of the heads who are acting?"
# Retrieve the two stored questions closest to `q`, together with their SQL.
docs=get_relevant_document(q,2)
previous_docs = f"Here some similar couple (input, output): {get_prev_docs(docs)} "
# The context is the schema followed by the retrieved examples.
c=f"CREATE TABLE head (age VARCHAR, head_id VARCHAR); CREATE TABLE management (head_id VARCHAR, temporary_acting VARCHAR) -- {previous_docs} "
prompt=get_prompt_template_inference(c=c, q=q, type_format=type_format)
print(prompt)
