<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Evaluator_Mistral_7B_text_to_sql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://medium.com/@frankmorales_91352/fine-tuning-the-llm-mistral-7b-instruct-v0-3-249c1814ceaf

# Dependencies

In [None]:
!nvidia-smi

Wed Jun 26 13:55:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              46W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Install PyTorch and TensorBoard.
# NOTE(review): none of the packages in this cell are version-pinned, so a later
# re-run may resolve different releases than the ones that produced the saved outputs.
!pip install torch tensorboard --quiet

# Install / upgrade the Hugging Face libraries used by the notebook.
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

# FlashAttention only supports Ampere GPUs or newer; in Google Colab this means an A100 or L4 runtime.
!pip install -U flash-attn --no-build-isolation --quiet

# Provides the `colab_env` module imported in the Hugging Face setup cell.
!pip install colab-env --quiet

# NOTE(review): `mistral_inference` is not imported anywhere in the visible notebook — confirm it is still needed.
!pip install mistral_inference -q

# PEFT supplies the adapter-loading classes (AutoPeftModelForCausalLM, PeftModel) used below.
!pip install peft -q

# Hugging Face Setup

In [None]:
import colab_env
import os
from huggingface_hub import login


# Read the write-scoped Hub token from the environment so it never appears in the notebook.
access_token_write = os.environ.get("HUGGINGFACE_ACCESS_TOKEN_WRITE")

# Authenticate this session; the token is also handed to the git credential helper.
login(token=access_token_write, add_to_git_credential=True)


#from huggingface_hub import notebook_login
#notebook_login(write_permission=True)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Load the Fine Tuned Model

In [None]:
import torch
import os
import sys
import json
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)

In [None]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Hub repository holding the fine-tuned adapter; the tokenizer is loaded from the same repo below.
peft_model_id = "frankmorales2020/Mistral-7B-text-to-sql-flash-attention-2-dataeval"

# 4-bit quantization settings: NF4 quant type with double quantization,
# and bfloat16 as the compute dtype.
# `BitsAndBytesConfig` is imported in the previous cell, which must run first.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the model together with its PEFT adapter, quantized with the config above
# and using the FlashAttention-2 attention implementation.
model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

# Tokenizer stored alongside the adapter.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# Wrap model and tokenizer in a text-generation pipeline; the inference
# evaluation cell reads it through the global name `pipe`.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# TensorBoard Setup

In [None]:
import colab_env

/content/gdrive/MyDrive/model/POC-Mistral-7B-text-to-sql-flash-attention-2-dataeval/logs

In [None]:
# Enable the TensorBoard notebook extension.
%load_ext tensorboard

# The log directory below lives on the author's mounted Google Drive, so this
# only works in the author's personal dev environment.
# NOTE(review): the path printed by the previous cell contains
# "POC-Mistral-7B-..." while this one uses "Mistral-7B-..." — confirm which
# directory actually holds the logs.
%tensorboard --logdir /content/gdrive/MyDrive/model/Mistral-7B-text-to-sql-flash-attention-2-dataeval/logs

# Dataset Settings

In [None]:
from datasets import load_dataset

# Template for the system turn; `{schema}` is replaced with each sample's "context" field.
# The wording is reproduced verbatim because it is part of the prompt text itself.
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""


def create_conversation(sample):
    """Convert one dataset row into an OpenAI-style chat record.

    Args:
        sample: Mapping with the keys "context", "question" and "answer".

    Returns:
        A dict with a single "messages" key holding the system, user and
        assistant turns, in that order.
    """
    system_turn = {"role": "system", "content": system_message.format(schema=sample["context"])}
    user_turn = {"role": "user", "content": sample["question"]}
    assistant_turn = {"role": "assistant", "content": sample["answer"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")

# Fixed seeds make the 12,500-row sample and the train/test split reproducible,
# so the evaluation numbers below can be regenerated after a kernel restart.
# NOTE(review): the adapter was fine-tuned outside this notebook; this split is
# not guaranteed to be disjoint from its training data — confirm before
# treating the scores as held-out results.
dataset = dataset.shuffle(seed=42).select(range(12500))

# Convert dataset to OAI messages, dropping the original columns so only
# "messages" remains.
dataset = dataset.map(create_conversation, remove_columns=dataset.column_names, batched=False)

# split dataset into 10,000 training samples and 2,500 test samples
dataset = dataset.train_test_split(test_size=2500/12500, seed=42)


# save datasets to disk (relative to the current working directory)
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

In [None]:
# Load our test dataset
eval_dataset = load_dataset("json", data_files="/content/test_dataset.json", split="train")

# Model Evaluation - Inference

In [None]:
from tqdm import tqdm
from random import randint
from datasets import load_dataset


def evaluate(sample):
    """Generate SQL for one chat sample and score it by exact string match.

    Relies on the notebook-level `pipe` text-generation pipeline. The first
    two messages of the sample form the prompt; the third message is the
    reference answer.

    Args:
        sample: Dict whose "messages" list holds at least three entries, each
            with a "content" field.

    Returns:
        1 when the stripped generated text equals the reference exactly,
        otherwise 0. Both answers are printed either way.
    """
    chat_tokenizer = pipe.tokenizer
    prompt = chat_tokenizer.apply_chat_template(
        sample["messages"][:2], tokenize=False, add_generation_prompt=True
    )

    # Sampling is enabled, so the generated text can differ between runs.
    generation_kwargs = dict(
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=chat_tokenizer.eos_token_id,
        pad_token_id=chat_tokenizer.pad_token_id,
    )
    outputs = pipe(prompt, **generation_kwargs)

    # Drop the first len(prompt) characters so only the continuation is compared.
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()
    original_answer = sample["messages"][2]["content"]

    is_match = predicted_answer == original_answer
    if is_match:
        banner, reference_line = 'SUCCESS!', f'Original Answer: {original_answer}'
    else:
        banner, reference_line = 'NO - SUCCESS!', f' Original Answer: {original_answer}'

    report = ["", "", banner, "", f'Generated Answer: {predicted_answer}', reference_line, ""]
    print("\n".join(report))
    return 1 if is_match else 0

number_of_eval_samples = 10

# Score the first `number_of_eval_samples` rows of the eval dataset;
# each entry is 1 for an exact match and 0 otherwise.
success_rate = [
    evaluate(eval_dataset[index])
    for index in tqdm(range(number_of_eval_samples))
]

# Fraction of exact matches among the evaluated samples.
accuracy = sum(success_rate) / len(success_rate)

 10%|█         | 1/10 [00:05<00:47,  5.30s/it]



SUCCESS!

Generated Answer: SELECT laps FROM table_name_41 WHERE driver = "jean-christophe boullion"
Original Answer: SELECT laps FROM table_name_41 WHERE driver = "jean-christophe boullion"



 20%|██        | 2/10 [00:09<00:37,  4.66s/it]



SUCCESS!

Generated Answer: SELECT MIN(rank) FROM table_name_37 WHERE year > 2005 AND moving_from = "nancy"
Original Answer: SELECT MIN(rank) FROM table_name_37 WHERE year > 2005 AND moving_from = "nancy"



 30%|███       | 3/10 [00:12<00:28,  4.06s/it]



SUCCESS!

Generated Answer: SELECT 1 AS st_leg FROM table_name_6 WHERE team_1 = "everton"
Original Answer: SELECT 1 AS st_leg FROM table_name_6 WHERE team_1 = "everton"



 40%|████      | 4/10 [00:15<00:21,  3.52s/it]



SUCCESS!

Generated Answer: SELECT finish FROM table_name_18 WHERE player = "jack nicklaus"
Original Answer: SELECT finish FROM table_name_18 WHERE player = "jack nicklaus"



 50%|█████     | 5/10 [00:18<00:16,  3.23s/it]



NO - SUCCESS!

Generated Answer: SELECT period FROM table_name_67 WHERE year = "1896"
 Original Answer: SELECT period FROM table_name_67 WHERE year = 1896



 60%|██████    | 6/10 [00:21<00:12,  3.23s/it]



NO - SUCCESS!

Generated Answer: SELECT MAX(round) FROM table_name_40 WHERE player = "joe taylor"
 Original Answer: SELECT SUM(round) FROM table_name_40 WHERE player = "joe taylor"



 70%|███████   | 7/10 [00:24<00:09,  3.28s/it]



SUCCESS!

Generated Answer: SELECT COUNT(rank) FROM table_name_18 WHERE total < 5 AND bronze < 0
Original Answer: SELECT COUNT(rank) FROM table_name_18 WHERE total < 5 AND bronze < 0



 80%|████████  | 8/10 [00:30<00:08,  4.14s/it]



SUCCESS!

Generated Answer: SELECT AVG(position) FROM table_name_90 WHERE goals_against < 59 AND goals_for > 32 AND draws > 9 AND points > 35
Original Answer: SELECT AVG(position) FROM table_name_90 WHERE goals_against < 59 AND goals_for > 32 AND draws > 9 AND points > 35



 90%|█████████ | 9/10 [00:34<00:03,  3.99s/it]



SUCCESS!

Generated Answer: SELECT candidates FROM table_1342233_24 WHERE district = "Mississippi 6"
Original Answer: SELECT candidates FROM table_1342233_24 WHERE district = "Mississippi 6"



100%|██████████| 10/10 [00:38<00:00,  3.86s/it]



SUCCESS!

Generated Answer: SELECT season FROM table_25214321_1 WHERE third_place = "Raquel Pacheco"
Original Answer: SELECT season FROM table_25214321_1 WHERE third_place = "Raquel Pacheco"






In [None]:
print()
#print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Accuracy (Eval dataset and predict) for a sample of {number_of_eval_samples}: {accuracy*100:.2f}%")


Accuracy (Eval dataset and predict) for a sample of 10: 80.00%


# Model Evaluation - Kernel

In [None]:
# Count hidden layers and estimate "neurons" (before evaluation).
# Use `model.base_model` when that attribute exists, otherwise the model itself.
llama_model = getattr(model, 'base_model', model)

# Number of hidden layers as reported by the model configuration.
num_hidden_layers = llama_model.config.num_hidden_layers

# Very rough size estimate: layer count times the configured hidden size.
num_neurons = llama_model.config.hidden_size * num_hidden_layers

print(
    f"Number of hidden layers in the model: {num_hidden_layers}",
    f"Approximate number of neurons (simplified): {num_neurons}",
    sep="\n",
)


Number of hidden layers in the model: 32
Approximate number of neurons (simplified): 131072


In [None]:
torch.cuda.empty_cache()

In [None]:
# Load our test dataset
eval_dataset = load_dataset("json", data_files="/content/test_dataset.json", split="train")
reduced_size = 10
eval_dataset = eval_dataset.shuffle(seed=42).select(range(reduced_size))

In [None]:
eval_dataset

Dataset({
    features: ['messages'],
    num_rows: 10
})

In [None]:
eval_dataset[0]["messages"][0]['content']

'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE campuses (campus VARCHAR, county VARCHAR, YEAR VARCHAR)'

In [None]:
# Every example is truncated or padded to exactly this many tokens.
max_length = 10

# Tokenize each example on its own: the contents of all of its messages are
# joined with single spaces into one string first.
encodings = [
    tokenizer(
        " ".join([msg['content'] for msg in item['messages']]),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    for item in eval_dataset
]

# Each encoding carries a leading batch dimension of 1; keep only the row.
all_input_ids = [encoding["input_ids"][0] for encoding in encodings]
all_attention_masks = [encoding["attention_mask"][0] for encoding in encodings]

# Stack the per-example rows into (num_examples, max_length) tensors.
input_ids = torch.stack(all_input_ids)
attention_masks = torch.stack(all_attention_masks)

In [None]:
# Now you have input_ids, attention_masks, and labels as tensors with compatible shapes
print(input_ids.shape)
print(attention_masks.shape)

torch.Size([10, 10])
torch.Size([10, 10])


In [None]:
torch.cuda.empty_cache()

In [None]:
import gc

# Deleting only `model` and `tokenizer` frees nothing: the pipeline object
# (`pipe`) and `llama_model` from the layer-count cell still reference the
# same weights. Drop every notebook-level reference, tolerating names that
# were never defined (e.g. when a cell was skipped or this cell is re-run).
for _name in ("pipe", "llama_model", "model", "tokenizer"):
    globals().pop(_name, None)

# Collect the now-unreferenced objects before asking CUDA to release its cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from peft import PeftModel
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Constants
BATCH_SIZE = 8
# NOTE(review): with truncation=True below, every example is cut to its first
# 10 tokens. The concatenated text starts with the system message, so the
# perplexity is computed almost entirely on the start of the prompt rather
# than on the SQL answer — confirm this is intended.
MAX_LENGTH = 10
peft_model_id = "frankmorales2020/Mistral-7B-text-to-sql-flash-attention-2-dataeval"
data_files = "/content/test_dataset.json"
reduced_size = 10

# Load tokenizer (using tokenizer from the PEFT model)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# Load base model in float16 (no quantization in this cell)
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.3",
    torch_dtype=torch.float16,
    device_map='auto'
)

# Resize the token embeddings to match the PEFT vocabulary
base_model.resize_token_embeddings(len(tokenizer))

# Load PEFT model (using the base_model object) and switch to inference mode
model = PeftModel.from_pretrained(base_model, peft_model_id)
model.eval()

# Ensure model is on the GPU (falls back to CPU when CUDA is unavailable).
# `device` is also read as a global by the evaluation function below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the test dataset and keep a fixed-seed sample of `reduced_size` rows
eval_dataset = load_dataset("json", data_files=data_files, split="train")
eval_dataset = eval_dataset.shuffle(seed=42).select(range(reduced_size))

# Tokenization and Tensor Creation
all_input_ids = []
all_attention_masks = []
for item in eval_dataset:
    messages = item['messages']
    # Concatenate the 'content' of all messages into a single string
    text = " ".join([msg['content'] for msg in messages])

    # Pad/truncate to exactly MAX_LENGTH tokens so all rows can be stacked
    tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    all_input_ids.append(tokenized["input_ids"][0])
    all_attention_masks.append(tokenized["attention_mask"][0])

input_ids = torch.stack(all_input_ids)
attention_masks = torch.stack(all_attention_masks)

# Create TensorDataset from your tensors.
# Note: this rebinds `eval_dataset`; the message dicts are no longer reachable
# under that name after this line.
eval_dataset = TensorDataset(input_ids, attention_masks)

# Create DataLoader (fixed order, batches of BATCH_SIZE)
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Evaluation function (Manually calculating perplexity)
def evaluate_model(model, eval_dataloader):
    """Compute token-level perplexity of `model` over `eval_dataloader`.

    Each batch must be a sequence whose first two items are the `input_ids`
    and `attention_mask` tensors. Tensors are moved to the notebook-level
    `device` before the forward pass.

    Padding positions (attention mask 0) are excluded from the loss by setting
    their labels to -100. Each batch's mean loss is weighted by its number of
    scored target tokens, so a smaller final batch does not skew the average.

    Args:
        model: Causal language model accepting `input_ids`, `attention_mask`
            and `labels`, and returning an object with a `.loss` attribute.
        eval_dataloader: Iterable of batches as described above.

    Returns:
        A 0-dim float tensor with the perplexity (NaN when there were no
        target tokens to score, inf on overflow), or None when CUDA ran out of
        memory.

    Raises:
        RuntimeError: Any runtime error other than an out-of-memory condition.
    """
    model.eval()
    total_nll = 0.0
    total_targets = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        try:
            # Convert batch to device (a list/tuple of tensors)
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask = batch[0], batch[1]

            # -100 is the ignore index of the loss: padding must not be scored.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # The model shifts labels by one position, so the first column is
            # never a prediction target.
            n_targets = int((labels[:, 1:] != -100).sum().item())
            if n_targets == 0:
                continue
            total_nll += outputs.loss.item() * n_targets
            total_targets += n_targets

        except RuntimeError as e:
            if "out of memory" in str(e):
                print("WARNING: Ran out of memory. Consider reducing batch size or model complexity.")
                return None  # Exit early if out of memory
            raise

    if total_targets == 0:
        print("WARNING: No target tokens were evaluated; perplexity is undefined.")
        return torch.tensor(float("nan"))

    perplexity = torch.exp(torch.tensor(total_nll / total_targets))
    # torch.exp saturates to inf instead of raising OverflowError, so check explicitly.
    if torch.isinf(perplexity):
        print("WARNING: Overflow error while calculating perplexity. Loss values might be too large.")
    return perplexity


# Perform Evaluation
results = evaluate_model(model, eval_dataloader)


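The perplexity of 10.40 measured on the evaluation sample is consistent with the fine-tuned Mistral-7B model having a reasonable understanding of natural language and SQL syntax. Note, however, that this figure is computed on only 10 examples, each truncated to its first 10 tokens (`reduced_size = 10`, `MAX_LENGTH = 10`), so it mostly reflects the beginning of the system prompt rather than the generated SQL. Further evaluation using task-specific metrics is necessary to fully assess the model's effectiveness in real-world scenarios.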

In [None]:
# `evaluate_model` returns None when it ran out of memory; formatting None
# with ":.2f" would raise a TypeError, so report that case explicitly.
if results is None:
    print("Perplexity: unavailable (evaluation did not complete)")
else:
    print(f"Perplexity: {results:.2f}")

Perplexity: 10.40


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from peft import PeftModel
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import sqlite3

# Constants
BATCH_SIZE = 8
# NOTE(review): truncating to 10 tokens means perplexity is computed on the
# start of the system message only — confirm this is intended.
MAX_LENGTH = 10
peft_model_id = "frankmorales2020/Mistral-7B-text-to-sql-flash-attention-2-dataeval"
data_files = "/content/test_dataset.json"
reduced_size = 10

# Load the tokenizer stored with the adapter and resize the base model's
# embeddings to it, exactly as the previous perplexity cell does, so that the
# vocabulary matches the one the adapter was saved with.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.3",
                                            torch_dtype=torch.float16,
                                            device_map='auto'
                                            )
model.resize_token_embeddings(len(tokenizer))

# Load PEFT model directly (assuming it's a LoRA-based model)
model = PeftModel.from_pretrained(model, peft_model_id)
model.eval()

# Ensure model is on the GPU (falls back to CPU when CUDA is unavailable).
# `device` is read as a global by the evaluation functions below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the test dataset and keep a fixed-seed sample of `reduced_size` rows.
# `eval_dataset` keeps the message dicts: `evaluate_accuracy` needs them.
eval_dataset = load_dataset("json", data_files=data_files, split="train")
eval_dataset = eval_dataset.shuffle(seed=42).select(range(reduced_size))

# Tokenization and Tensor Creation
all_input_ids = []
all_attention_masks = []
for item in eval_dataset:
    messages = item['messages']
    # Concatenate the 'content' of all messages into a single string
    text = " ".join([msg['content'] for msg in messages])

    tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    all_input_ids.append(tokenized["input_ids"][0])
    all_attention_masks.append(tokenized["attention_mask"][0])

input_ids = torch.stack(all_input_ids)
attention_masks = torch.stack(all_attention_masks)

# Tensor view of the same samples for the perplexity pass. It gets its own
# name so it does not overwrite `eval_dataset`, which would leave
# `evaluate_accuracy` iterating over tensor tuples instead of message dicts.
eval_tensor_dataset = TensorDataset(input_ids, attention_masks)

# Create DataLoader (fixed order, batches of BATCH_SIZE)
eval_dataloader = DataLoader(eval_tensor_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Evaluation functions (Perplexity and Execution Accuracy)
def evaluate_perplexity(model, eval_dataloader):
    """Compute token-level perplexity of `model` over `eval_dataloader`.

    Each batch must be a sequence whose first two items are the `input_ids`
    and `attention_mask` tensors. Tensors are moved to the notebook-level
    `device` before the forward pass.

    Padding positions (attention mask 0) are excluded from the loss by setting
    their labels to -100. Each batch's mean loss is weighted by its number of
    scored target tokens, so a smaller final batch does not skew the average.

    Args:
        model: Causal language model accepting `input_ids`, `attention_mask`
            and `labels`, and returning an object with a `.loss` attribute.
        eval_dataloader: Iterable of batches as described above.

    Returns:
        A 0-dim float tensor with the perplexity (NaN when there were no
        target tokens to score, inf on overflow), or None when CUDA ran out of
        memory.

    Raises:
        RuntimeError: Any runtime error other than an out-of-memory condition.
    """
    model.eval()
    total_nll = 0.0
    total_targets = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        try:
            # Convert batch to device (a list/tuple of tensors)
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask = batch[0], batch[1]

            # -100 is the ignore index of the loss: padding must not be scored.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # The model shifts labels by one position, so the first column is
            # never a prediction target.
            n_targets = int((labels[:, 1:] != -100).sum().item())
            if n_targets == 0:
                continue
            total_nll += outputs.loss.item() * n_targets
            total_targets += n_targets

        except RuntimeError as e:
            if "out of memory" in str(e):
                print("WARNING: Ran out of memory. Consider reducing batch size or model complexity.")
                return None  # Exit early if out of memory
            raise

    if total_targets == 0:
        print("WARNING: No target tokens were evaluated; perplexity is undefined.")
        return torch.tensor(float("nan"))

    perplexity = torch.exp(torch.tensor(total_nll / total_targets))
    # torch.exp saturates to inf instead of raising OverflowError, so check explicitly.
    if torch.isinf(perplexity):
        print("WARNING: Overflow error while calculating perplexity. Loss values might be too large.")
    return perplexity


# Function to execute SQL query and get results
def execute_query(query, db_path):
    """Run `query` against the SQLite database at `db_path` and return all rows.

    Args:
        query: SQL text to execute.
        db_path: Path handed to `sqlite3.connect` (":memory:" also works).

    Returns:
        The list of rows from `fetchall()`, or None when anything goes wrong
        (the error is printed, not raised).

    Note:
        NOTE(review): callers in this notebook pass model-generated SQL, which
        is executed as-is; point `db_path` at a disposable database.
    """
    from contextlib import closing  # local import keeps this definition self-contained

    try:
        # sqlite3's own context manager only commits or rolls back; it does
        # not close the connection. closing() releases the handle every time.
        with closing(sqlite3.connect(db_path)) as conn:
            with conn:  # commit on success, roll back on error
                cursor = conn.cursor()
                cursor.execute(query)
                results = cursor.fetchall()
        return results
    except Exception as e:
        print(f"Error executing query: {e}")
        return None

# Function to evaluate accuracy
def evaluate_accuracy(model, eval_dataset, db_path, tokenizer):
    """Measure execution accuracy of generated SQL against reference SQL.

    For every item the prompt is built from all messages except the last one,
    which holds the reference query. The model's continuation is executed with
    `execute_query` and its result rows are compared with those of the
    reference query. Inputs are moved to the notebook-level `device`.

    Args:
        model: Model exposing `generate(**inputs, ...)`.
        eval_dataset: Iterable of dicts with a "messages" list of
            {"content": ...} entries; the last entry is the reference query.
        db_path: SQLite database path passed to `execute_query`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.

    Returns:
        Percentage (0-100) of items whose generated query ran successfully and
        returned the same rows as the reference query; 0 for an empty dataset.
        Items that raise, or whose generated query fails, count as incorrect.
    """
    correct = 0
    total = 0
    for item in tqdm(eval_dataset, desc="Evaluating"):
        try:
            messages = item['messages']
            reference_query = messages[-1]['content']  # Last message is the reference query

            # Build the prompt WITHOUT the reference answer; feeding it in
            # would hand the model the solution it is being scored on.
            # NOTE(review): the inference cell formats prompts with the chat
            # template instead of plain concatenation — confirm which format
            # this evaluation should use.
            text = " ".join([msg['content'] for msg in messages[:-1]])
            inputs = tokenizer(text, return_tensors="pt").to(device)

            # max_new_tokens bounds only the continuation; max_length would
            # also count the prompt tokens.
            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=200, num_return_sequences=1)

            # The returned sequence starts with the prompt tokens; decode only
            # what was generated after them.
            prompt_length = inputs["input_ids"].shape[1]
            generated_query = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

            # Execute the generated query and the reference query
            generated_results = execute_query(generated_query, db_path)
            reference_results = execute_query(reference_query, db_path)

            # execute_query returns None on failure; two failures must not
            # compare equal and be scored as a match.
            if generated_results is not None and generated_results == reference_results:
                correct += 1
        except Exception as e:
            print(f"Error evaluating example: {e}")
        total += 1

    accuracy = (correct / total) * 100 if total > 0 else 0
    return accuracy

# Perform Evaluation
perplexity = evaluate_perplexity(model, eval_dataloader)
# evaluate_perplexity returns None when it ran out of memory; formatting None
# with ":.2f" would raise a TypeError.
if perplexity is None:
    print("Perplexity: unavailable (evaluation did not complete)")
else:
    print(f"Perplexity: {perplexity:.2f}")

# Add your database path here, e.g. db_path = "your_database.db".
# A value defined in an earlier cell is picked up; otherwise the accuracy step
# is skipped instead of failing with a NameError.
db_path = globals().get("db_path")
if db_path is None:
    print("Skipping execution accuracy: set db_path to a SQLite database that contains the tables referenced by the queries.")
else:
    accuracy = evaluate_accuracy(model, eval_dataset, db_path, tokenizer)
    print(f"Accuracy (Eval dataset and predict): {accuracy:.2f}%")

