<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/FAISS_FINETUNING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Pytorch & other libraries
!pip install torch tensorboard --quiet

# Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

#FlashAttention only supports Ampere GPUs or newer. #NEED A100 , L4  IN GOOGLE COLAB
!pip install -U flash-attn --no-build-isolation --quiet


! pip install peft --quiet
! pip install ninja packaging --quiet

!pip install diffusers safetensors  --quiet
!pip install colab-env --quiet

!pip install mistral_inference -q

!pip install trl==0.8.6 -q

In [None]:
# Retrieval dependencies: sentence-transformers provides the embedding model and
# FAISS (GPU build) provides the vector index used for schema lookup below.
!pip install sentence_transformers -q
!pip install faiss-gpu -q

In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig, EarlyStoppingCallback
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, TaskType
from sentence_transformers import SentenceTransformer
import faiss
import os
import evaluate
import numpy as np

In [2]:
# colab_env is imported for its side effects (the cell output shows the Drive mount);
# presumably it also populates the environment variables read below — TODO confirm.
import colab_env
import os
from huggingface_hub import login

# Write-scoped Hugging Face token taken from the environment (None when the variable is unset).
access_token_write = os.environ.get("HUGGINGFACE_ACCESS_TOKEN_WRITE")

# Authenticate this session and also store the token in the git credential helper.
login(token=access_token_write, add_to_git_credential=True)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:

import torch
import os
import sys
import json
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer

In [None]:
from datasets import load_dataset

# Convert dataset to OAI messages
# System prompt template. Later cells recover the schema from the text that follows
# the "SCHEMA:" line, so keep this layout stable.
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""


def create_conversation(sample):
    """Convert one raw sample into a three-turn chat record.

    Args:
        sample: mapping with the keys "context" (schema text), "question"
            and "answer".

    Returns:
        A dict with a single "messages" key holding the system, user and
        assistant turns, in that order.
    """
    system_turn = {
        "role": "system",
        "content": system_message.format(schema=sample["context"]),
    }
    user_turn = {"role": "user", "content": sample["question"]}
    assistant_turn = {"role": "assistant", "content": sample["answer"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Fixed seed so the sampled subset and the train/test split (and therefore the
# JSON files written below) are the same on every Restart & Run All.
SEED = 42
N_SAMPLES = 12500
N_TEST = 2500

# Load dataset from the hub and keep a random subset of N_SAMPLES rows
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle(seed=SEED).select(range(N_SAMPLES))

# Convert dataset to OAI messages, dropping the original columns so only "messages" remains
dataset = dataset.map(create_conversation, remove_columns=dataset.column_names, batched=False)

# split dataset into 10,000 training samples and 2,500 test samples
dataset = dataset.train_test_split(test_size=N_TEST, seed=SEED)

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 2500
    })
})

In [6]:
from datasets import load_dataset

# Read the training records back from the JSON file in the working directory.
dataset = load_dataset(
    "json",
    data_files="train_dataset.json",
    split="train",
)

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
dataset

Dataset({
    features: ['messages'],
    num_rows: 10000
})

In [8]:
# 1. Load Dataset and Vector Database
dataset = load_dataset("json", data_files="train_dataset.json", split="train")
# A single data file is exposed under the "train" split, hence split="train" for the test file too.
dataset_test = load_dataset("json", data_files="test_dataset.json", split="train")

# Load sentence transformer
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# Build the FAISS index from the schema text embedded in each sample's system message.
# The system message ends with "SCHEMA:\n<schema>", so take everything after that marker
# instead of relying on a hard-coded character offset into the prompt text.
SCHEMA_MARKER = "SCHEMA:\n"
schema_corpus = [
    sample["messages"][0]["content"].partition(SCHEMA_MARKER)[2]
    for sample in dataset
]

# One embedding per schema; the index dimensionality is taken from the embedding width.
schema_embeddings = sentence_transformer.encode(schema_corpus)
faiss_index = faiss.IndexFlatL2(schema_embeddings.shape[1])
faiss_index.add(schema_embeddings)

Generating train split: 0 examples [00:00, ? examples/s]

In [15]:
schema_corpus[9999]

'CREATE TABLE table_name_32 (team VARCHAR, rider VARCHAR)'

In [10]:
dataset

Dataset({
    features: ['messages'],
    num_rows: 10000
})

In [11]:
dataset_test

Dataset({
    features: ['messages'],
    num_rows: 2500
})

In [None]:
# 2. Model and Tokenizer Configuration
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit NF4 quantization with double quantization.
# The compute dtype is bfloat16 so it agrees with torch_dtype below and with the
# bf16=True training setting, rather than mixing float16 compute into a bfloat16 run.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    quantization_config=bnb_config,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# Reuse the EOS token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [16]:
from trl import setup_chat_format
# Set the chat template to OAI ChatML; remove this step if you start from a fine-tuned model.
# NOTE(review): model_id points at an "Instruct" checkpoint — confirm that replacing
# its chat template here is intended.
model, tokenizer = setup_chat_format(model, tokenizer)

In [18]:
schema_corpus[9999]

'CREATE TABLE table_name_32 (team VARCHAR, rider VARCHAR)'

In [19]:
# 3. Augment Input with Context (Only for the training dataset)
def augment_with_context(sample, k=3, encoder=None, index=None, corpus=None):
    """Build a retrieval-augmented prompt/completion pair from one chat sample.

    The schema is taken from the system message (the text after "SCHEMA:\\n"),
    the question from the user turn and the answer from the assistant turn.
    The question is embedded and used to look up the ``k`` nearest entries of
    the schema index; those snippets are appended to the prompt.

    Args:
        sample: record with a "messages" list of system, user and assistant
            turns, each a dict with a "content" string.
        k: number of neighbours to retrieve (default 3).
        encoder: object with ``encode(text)``; defaults to the notebook-level
            ``sentence_transformer``.
        index: object with ``search(queries, k)``; defaults to the
            notebook-level ``faiss_index``.
        corpus: sequence of snippets addressed by the returned ids; defaults
            to the notebook-level ``schema_corpus``.

    Returns:
        ``{"prompt": <schema + question + retrieved snippets>, "completion": <answer>}``.
    """
    # Resolve the notebook-level defaults lazily so the function also works stand-alone.
    encoder = sentence_transformer if encoder is None else encoder
    index = faiss_index if index is None else index
    corpus = schema_corpus if corpus is None else corpus

    messages = sample["messages"]
    # Locate the schema via its marker rather than a fixed character offset,
    # so edits to the system prompt wording cannot corrupt the extraction.
    schema_query = messages[0]["content"].partition("SCHEMA:\n")[2]
    question = messages[1]["content"]
    original_answer = messages[2]["content"]

    query_embedding = encoder.encode(question)
    _, indices = index.search(query_embedding.reshape(1, -1), k=k)
    # FAISS pads missing neighbours with -1; skip them instead of silently
    # picking the last corpus entry through negative indexing.
    context_snippets = [corpus[i] for i in indices[0] if i >= 0]

    # Create new input string with schema, question, and additional context
    augmented_prompt = schema_query + "\nQuestion: " + question + "\n" + " ".join(context_snippets)
    return {"prompt": augmented_prompt, "completion": original_answer}


# Drop the original "messages" column so the result holds only "prompt" and "completion".
# NOTE(review): if "messages" is left in place, SFTTrainer's automatic dataset-format
# detection is expected to format from that chat column, so the retrieval-augmented
# prompt would never reach training — verify against the installed trl version.
augmented_train_dataset = dataset.map(
    augment_with_context,
    remove_columns=dataset.column_names,
    batched=False,
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
augmented_train_dataset

In [35]:
augmented_train_dataset['prompt'][0]

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_name_27 (silver INTEGER, nation VARCHAR, gold VARCHAR)',
  'role': 'system'},
 {'content': 'What kind of Silver has a Nation of norway and a Gold smaller than 3?',
  'role': 'user'},
 {'content': 'SELECT SUM(silver) FROM table_name_27 WHERE nation = "norway" AND gold < 3',
  'role': 'assistant'}]

In [47]:
dataset['messages'][0]

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_name_27 (silver INTEGER, nation VARCHAR, gold VARCHAR)',
  'role': 'system'},
 {'content': 'What kind of Silver has a Nation of norway and a Gold smaller than 3?',
  'role': 'user'},
 {'content': 'SELECT SUM(silver) FROM table_name_27 WHERE nation = "norway" AND gold < 3',
  'role': 'assistant'}]

In [49]:
augmented_train_dataset['completion'][0]

'SELECT SUM(silver) FROM table_name_27 WHERE nation = "norway" AND gold < 3'

In [38]:
dataset

Dataset({
    features: ['messages'],
    num_rows: 10000
})

In [67]:
!pip install trl -q

In [None]:
# 4. Apply PEFT (LoRA) Configuration
from peft import LoraConfig

# LoRA hyperparameters (following the QLoRA paper & Sebastian Raschka's experiments):
# adapters on every linear layer, rank 256 with alpha 128, 5% dropout, no bias training.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
)


#model = get_peft_model(model, peft_config)
#model.print_trainable_parameters()

# 5. Define Compute Metrics Function
# Assuming you have a way to evaluate generated SQL against ground truth
def compute_metrics(eval_preds):
    """Compute accuracy from raw logits and integer labels.

    For token-level inputs (labels with two or more dimensions) the logits at
    position ``t`` are scored against the label at position ``t + 1``, which
    is how a causal language model's outputs line up with its labels.
    Positions labelled ``-100`` (the ignore index) are excluded. For
    one-dimensional labels no shift is applied.

    Args:
        eval_preds: pair ``(logits, labels)``; ``logits`` may also be a tuple
            whose first element holds the logits.

    Returns:
        ``{"accuracy": float}``; 0.0 when no position is scorable.
    """
    logits, labels = eval_preds
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    labels = np.asarray(labels)
    predictions = np.argmax(np.asarray(logits), axis=-1)

    if labels.ndim >= 2:
        # Next-token alignment: drop the last prediction and the first label.
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]

    # Ignore padded / masked positions.
    scorable = labels != -100
    if not scorable.any():
        return {"accuracy": 0.0}

    accuracy = float(np.mean(predictions[scorable] == labels[scorable]))
    return {"accuracy": accuracy}


# 6. Fine-tuning with Training Arguments
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="Mistral-7B-text-to-sql-flash-attention-2-FAISS",    # directory to save and repository id

    # --- optimisation ---
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=8,          # micro-batches accumulated before each optimizer update
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    weight_decay=0.01,
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision

    # --- schedule ---
    # Only warmup_steps is given: setting warmup_ratio as well is redundant because
    # an explicit step count overrides the ratio.
    # NOTE(review): the plain "constant" schedule does not apply warmup at all;
    # switch to "constant_with_warmup" if these 15 steps are meant to take effect.
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    warmup_steps=15,

    # --- logging ---
    logging_steps=10,                       # log every 10 steps
    report_to="tensorboard",                # report metrics to tensorboard
    logging_dir="/content/gdrive/MyDrive/model/Mistral-7B-text-to-sql-flash-attention-2-FAISS/logs",

    # --- evaluation / checkpointing ---
    # Evaluate and save on the same 10-step cadence so the best checkpoint
    # (lowest eval loss) can be restored at the end of training.
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss",

    # --- hub ---
    push_to_hub=True,                       # push model to hub
    hub_token=access_token_write,           # write-scoped token used for the push
)

# The generation cache is not needed while training; turn it off and
# enable gradient checkpointing on the model.
model.config.use_cache = False
model.gradient_checkpointing_enable()

# 7. Create Trainer

from trl import SFTTrainer

max_seq_length = 2048  # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    train_dataset=augmented_train_dataset,
    eval_dataset=dataset_test,
    max_seq_length=max_seq_length,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,   # We template with special tokens
        "append_concat_token": False,  # No need to add additional separator token
        #"dataset_text_field": "text",  # Specify the text field in the dataset
    },
)

# Early stopping with a patience of 3.
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

# Start Training, then save the resulting model
trainer.train()
trainer.save_model()

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
