<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/upload_model_GNN_T2SQLipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install PyTorch and TensorBoard
!pip install torch tensorboard --quiet

# Install / upgrade the Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

# FlashAttention only supports Ampere GPUs or newer (an A100 is needed in Google Colab).
# NOTE(review): the flash-attn install below runs unconditionally, whatever GPU is attached — confirm this is intended.
#!pip install -U transformers
!pip install -U flash-attn --no-build-isolation --quiet


# PEFT (adapter loading) plus TRL and the build helpers (ninja, packaging)
! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

# Uncomment only if you're using A100 GPU
# NOTE(review): flash-attn is already installed a few lines above, so this commented-out line looks redundant.
#!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet
!pip install colab-env --quiet

# Hub client used by the login cell
!pip install huggingface_hub -q

In [None]:
# colab_env is imported before the token is read.
# NOTE(review): presumably it populates os.environ with the notebook's secrets — confirm.
import colab_env
import os

from huggingface_hub import login

# Write-scoped Hugging Face token taken from the environment (None when the variable is unset)
access_token_write = os.environ.get("HUGGINGFACE_ACCESS_TOKEN_WRITE")

# Authenticate against the Hub and also store the token in the git credential helper
login(token=access_token_write, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)

from peft import PeftModel # PeftModel is now correctly imported from peft


import logging
from tqdm.auto import tqdm
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
import os

In [None]:
# Earlier adapter repo, kept for reference (not loaded by this cell):
#model_name = "frankmorales2020/Mistral-7B-text-to-sql-flash-attention-2" # 04/03/2024

# Load Models and Tokenizer
# This cell defines the globals used by the later cells: `tokenizer`, `model` and `pipe`.

# Hub id of the PEFT adapter that is loaded first
PEFT_MODEL_ID = "frankmorales2020/Mistral-7B-text-to-sql-flash-attention-2-dataeval"
#model = AutoPeftModelForCausalLM.from_pretrained(PEFT_MODEL_ID)

print('\n')
print("Loading Mistral-T2SQL Model...")
# No dtype / device arguments are passed here, so the library defaults apply.
mistral_model = AutoPeftModelForCausalLM.from_pretrained(PEFT_MODEL_ID)
print('\n')

print('\n')
print("Loading Mistral Tokenizer...")
# The tokenizer is loaded from the same repo id as the adapter
tokenizer = AutoTokenizer.from_pretrained(PEFT_MODEL_ID)
print('\n')

print('\n')
print("Loading GNNT2SQL Model...")
# Local checkpoint directory on Google Drive.
# NOTE(review): this notebook never mounts /content/gdrive explicitly — presumably colab_env does; confirm.
model_name2 ='/content/gdrive/MyDrive/model/GNNT2SQL/checkpoint-1950/'


# Use PeftModel to load the model, pass the model object and model_id as arguments
# NOTE(review): mistral_model was itself created by AutoPeftModelForCausalLM, so this applies the
# checkpoint on top of an already adapter-wrapped model — confirm that stacking is intended.
model = PeftModel.from_pretrained(mistral_model, model_name2)
print('\n')

# set device
# NOTE(review): `device` is assigned here but never passed to the model or the pipeline in this notebook.
device = 'cuda'

# Tokenizer (earlier variant kept for reference; `model_name` is only defined in a commented-out line)
#tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# We redefine the pad_token and pad_token_id with out of vocabulary token (unk_token)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id


# load into pipeline
# Later cells call `pipe(...)` and `pipe.tokenizer` for generation and prompt building.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# System prompt template; `{schema}` is filled with each sample's "context" field.
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""


def create_conversation(sample):
    """Turn one dataset row into a chat-style record.

    Args:
        sample: mapping with "context", "question" and "answer" entries.

    Returns:
        A dict with a single "messages" key holding three role/content dicts,
        in the order system, user, assistant.
    """
    system_turn = {"role": "system", "content": system_message.format(schema=sample["context"])}
    user_turn = {"role": "user", "content": sample["question"]}
    assistant_turn = {"role": "assistant", "content": sample["answer"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Load dataset from the hub
# jtjt520j/CSpider_sql_create_context
# b-mc2/sql-create-context
#dataset = load_dataset("jtjt520j/CSpider_sql_create_context", split="train")

# Fixed seed so the 12,500-row sample and the train/test split are the same on every run;
# without it the saved test set (and every accuracy computed from it) changes per execution.
SPLIT_SEED = 42

dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle(seed=SPLIT_SEED).select(range(12500))

# Convert dataset to OAI messages (the original columns are dropped, only "messages" remains)
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
# split dataset into 10,000 training samples and 2,500 test samples
dataset = dataset.train_test_split(test_size=2500/12500, seed=SPLIT_SEED)

# Sanity check: show one converted training example
print(dataset["train"][345]["messages"])

# save datasets to disk; the evaluation cells read test_dataset.json back in
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

In [None]:
from datasets import load_dataset
from random import randint


# Load the test split that the dataset-preparation cell wrote to test_dataset.json
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
# randint() includes its upper bound, so the last valid index is len - 1;
# using len(eval_dataset) could pick an out-of-range index.
rand_idx = randint(0, len(eval_dataset) - 1)

In [None]:
# Build the chat-formatted prompt from the first two messages (system + user) of the sampled row
prompt = tokenizer.apply_chat_template(
    eval_dataset[rand_idx]["messages"][:2],
    tokenize=False,
    add_generation_prompt=True,
)
# Request payload wrapping the prompt with generation parameters.
# NOTE(review): `request` is not referenced by any other cell in this notebook.
request = {
    "inputs": prompt,
    "parameters": {"temperature": 0.2, "top_p": 0.95, "max_new_tokens": 256},
}

Parameter-Efficient Fine-Tuning (PEFT)

In [None]:
from datasets import load_dataset
from random import randint


# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
# randint() includes its upper bound, so cap it at len - 1 to always get a valid index
rand_idx = randint(0, len(eval_dataset) - 1)

# Test on sample: prompt is built from the system + user messages only
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
# Greedy decoding (do_sample=False). temperature / top_k / top_p only apply when sampling,
# so they are no longer passed alongside do_sample=False.
outputs = pipe(
    prompt,
    max_new_tokens=256,
    do_sample=False,
    eos_token_id=pipe.tokenizer.eos_token_id,
    pad_token_id=pipe.tokenizer.pad_token_id,
)


Query:
Name the transit passengers for 171078
Original Answer:
SELECT MAX(transit_passengers) FROM table_13836704_8 WHERE freight__metric_tonnes_ = 171078
Generated Answer:
SELECT MAX(transit_passengers) FROM table_13836704_8 WHERE freight__metric_tonnes_ = 171078

In [None]:
# Show the question, the reference SQL and the model's SQL for the sampled row
sample_messages = eval_dataset[rand_idx]['messages']
# generated_text starts with the prompt, so slice it off and keep only the completion
generated_sql = outputs[0]['generated_text'][len(prompt):].strip()
print(f"Query:\n{sample_messages[1]['content']}")
print(f"Original Answer:\n{sample_messages[2]['content']}")
print(f"Generated Answer:\n{generated_sql}")

Query:
What was the average money when the score was 68-69-68-72=277?
Original Answer:
SELECT AVG(money___) AS $__ FROM table_name_97 WHERE score = 68 - 69 - 68 - 72 = 277
Generated Answer:
SELECT AVG(money___) FROM table_name_97 WHERE score = 68 - 69 - 68 - 72 = 277


https://huggingface.co/frankmorales2020

In [None]:
from tqdm import tqdm

# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
# randint() includes its upper bound, so cap it at len - 1 to always get a valid index
rand_idx = randint(0, len(eval_dataset) - 1)

def evaluate(sample):
    """Generate SQL for one chat sample, print the outcome and score it by exact match.

    Args:
        sample: mapping whose "messages" list holds the system, user and assistant
            turns (indices 0, 1 and 2); the assistant content is the reference SQL.

    Returns:
        1 when the stripped completion equals the reference SQL exactly, otherwise 0.
    """
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)
    # Sampling is enabled here, so repeated calls on the same sample may give different completions
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
    # generated_text starts with the prompt; keep only what follows it
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()
    expected_answer = sample["messages"][2]["content"]
    print()
    print()
    print('Question: %s\n'%sample["messages"][1]["content"])
    print()
    if predicted_answer == expected_answer:
        print('Success Answer: %s'%expected_answer)
        return 1
    # The labels used to be swapped: the model output was printed as "Real Answer"
    # and the dataset reference as "Failed Answer".
    print('Predicted Answer: %s'%predicted_answer)
    print('Expected Answer: %s'%expected_answer)
    return 0

number_of_eval_samples = 10

# Score a randomly drawn subset of the eval dataset, one 0/1 result per sample
eval_subset = eval_dataset.shuffle().select(range(number_of_eval_samples))
success_rate = [evaluate(s) for s in tqdm(eval_subset)]

# Accuracy is the fraction of exact matches
accuracy = sum(success_rate) / len(success_rate)

print(f"Accuracy: {accuracy*100:.2f}%")

 10%|█         | 1/10 [00:12<01:54, 12.74s/it]



Question: How many different nationalities do conductors have?


Success Answer: SELECT COUNT(DISTINCT Nationality) FROM conductor


 20%|██        | 2/10 [00:27<01:53, 14.15s/it]



Question: Name the finish for a 39-31 record


Success Answer: SELECT finish FROM table_name_64 WHERE record = "39-31"


 30%|███       | 3/10 [00:49<02:02, 17.53s/it]



Question: Which name has a pressure of 985hpa (29.09inhg)?


Success Answer: SELECT name FROM table_name_56 WHERE pressure = "985hpa (29.09inhg)"


 40%|████      | 4/10 [01:03<01:36, 16.05s/it]



Question: What status is shown for Cadeby?


Success Answer: SELECT status FROM table_name_17 WHERE name = "cadeby"


 50%|█████     | 5/10 [01:12<01:08, 13.64s/it]



Question: What are the distinct states and create time of all votes?


Success Answer: SELECT DISTINCT state, created FROM votes


 60%|██████    | 6/10 [01:23<00:51, 12.80s/it]



Question: Show all student ids who are older than 20.


Success Answer: SELECT StuID FROM Student WHERE age > 20


 70%|███████   | 7/10 [01:42<00:44, 14.81s/it]



Question: On what surface was the Australian Open (6) played on?


Success Answer: SELECT surface FROM table_29163303_1 WHERE championship = "Australian Open (6)"


 80%|████████  | 8/10 [02:04<00:33, 17.00s/it]



Question: What is every value for Russian when value for Bulgarian is пес, куче?


Success Answer: SELECT russian FROM table_25008327_8 WHERE bulgarian = "пес, куче"


 90%|█████████ | 9/10 [02:41<00:23, 23.17s/it]



Question: List the names of wrestlers and the teams in elimination in descending order of days held.


Real Answer: SELECT T2.Name, T1.Team FROM wrestler AS T1 JOIN elimination AS T2 ON T1.Wrestler_ID = T2.Wrestler_ID ORDER BY T1.Days_held DESC
Failed Answer: SELECT T2.Name, T1.Team FROM elimination AS T1 JOIN wrestler AS T2 ON T1.Wrestler_ID = T2.Wrestler_ID ORDER BY T2.Days_held DESC


100%|██████████| 10/10 [03:02<00:00, 18.21s/it]



Question: What player belongs to the Chicago Blackhawks?


Success Answer: SELECT player FROM table_2781227_2 WHERE nhl_team = "Chicago Blackhawks"
Accuracy: 90.00%





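When evaluated on 10 samples from the evaluation dataset, our model produced SQL that exactly matched the reference for 9 of them, an accuracy of 90.00%. With only 10 samples this figure is a rough indication rather than a reliable estimate.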


For comparison, a 1000-sample run with peft_model_id = "frankmorales2020/Mistral-7B-text-to-sql-flash-attention-2" (04/03/2024 and 10/03/2024) reported: `100%|██████████| 1000/1000 [33:17<00:00,  2.00s/it]` Accuracy: 82.60%

In [None]:
from tqdm import tqdm

# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
# randint() includes its upper bound, so cap it at len - 1 to always get a valid index
rand_idx = randint(0, len(eval_dataset) - 1)

def evaluate(sample):
    """Score one chat sample by exact match between generated and reference SQL.

    Args:
        sample: mapping whose "messages" list holds the system, user and assistant
            turns (indices 0, 1 and 2); the assistant content is the reference SQL.

    Returns:
        1 when the stripped completion equals the reference exactly, otherwise 0.
    """
    chat_tokenizer = pipe.tokenizer
    prompt = chat_tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)
    generation_kwargs = dict(
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=chat_tokenizer.eos_token_id,
        pad_token_id=chat_tokenizer.pad_token_id,
    )
    generated = pipe(prompt, **generation_kwargs)
    # generated_text starts with the prompt; keep only what follows it
    completion = generated[0]['generated_text'][len(prompt):].strip()
    reference = sample["messages"][2]["content"]
    return 1 if completion == reference else 0

number_of_eval_samples = 1000

# Score a randomly drawn subset of the eval dataset, one 0/1 result per sample
eval_subset = eval_dataset.shuffle().select(range(number_of_eval_samples))
success_rate = [evaluate(s) for s in tqdm(eval_subset)]

# Accuracy is the fraction of exact matches
accuracy = sum(success_rate) / len(success_rate)

print(f"Accuracy: {accuracy*100:.2f}%")

100%|██████████| 1000/1000 [5:48:09<00:00, 20.89s/it]

Accuracy: 75.60%






When evaluated on 1000 samples from the evaluation dataset, our model achieved an exact-match accuracy of 75.60%. There is still room for improvement: techniques such as few-shot learning, retrieval-augmented generation (RAG), and self-healing of the generated SQL query could enhance the model's performance.