<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/NEW_GRAPH_Evaluator_ChromaDB_Post_Trainining_synthetic_text_to_sql_gretelai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## LIBRARIES

In [None]:
!pip install -q datasets
!pip install -q chromadb
!pip install -q faiss-gpu
!pip install peft  -q

!pip install bitsandbytes -q
!pip pip install accelerate -q

!pip install -U flash-attn --no-build-isolation --quiet

!pip install colab-env --quiet

!pip install mistral_inference -q

!pip install -q evaluate sentence_transformers

In [2]:
!nvidia-smi

Fri Sep 13 02:07:03 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   49C    P8              17W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import torch
import colab_env
import os
import sys
import json
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)

# Environment Settings

In [5]:
import colab_env
import os

from huggingface_hub import login

# Write-scoped Hugging Face token, looked up in the process environment.
# NOTE(review): presumably the environment is populated by `import colab_env` above — confirm.
access_token_write = os.environ.get("HUGGINGFACE_ACCESS_TOKEN_WRITE")

# Authenticate with the Hub and also hand the token to the git credential helper.
login(token=access_token_write, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Load Models and Tokenizer, and Set Up ChromaDB

In [None]:
import logging
from tqdm.auto import tqdm
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
import os

from peft import PeftModel # PeftModel is now correctly imported from peft

from sentence_transformers import SentenceTransformer
import chromadb

# Logging Setup
logging.basicConfig(level=logging.INFO)

# 1. Configurable Parameters

#gretelai/synthetic_text_to_sql

#DATASET_FILE = "/content/gdrive/MyDrive/datasets/test_dataset.json"

DATASET_FILE = "/content/gdrive/MyDrive/datasets/gretelai_test_dataset.json"

NUM_SAMPLES_TO_PROCESS = int(os.getenv("NUM_SAMPLES", 25))
GENERATION_PARAMS = {
    "max_new_tokens": 256, "do_sample": True, "temperature": 0.7, "top_k": 50, "top_p": 0.95
}
SIMILARITY_THRESHOLD = 0.85


# 2. Load Evaluation Dataset
eval_dataset = load_dataset("json", data_files=DATASET_FILE, split="train")
if NUM_SAMPLES_TO_PROCESS > 0:
    eval_dataset = eval_dataset.select(range(NUM_SAMPLES_TO_PROCESS))
logging.info(f"Processing {len(eval_dataset)} samples from the dataset.")


# 3. Load Models and Tokenizer

def _announce(message):
    """Print ``message`` preceded by the blank-line spacer used in this cell."""
    print('\n')
    print(message)


PEFT_MODEL_ID = "frankmorales2020/Mistral-7B-text-to-sql-flash-attention-2-dataeval"

# Step 1: the PEFT model published on the Hugging Face Hub.
# NOTE(review): no dtype, device or quantization arguments are passed here, so the
# library defaults apply — confirm the default load fits the runtime's memory.
_announce("Loading Mistral-T2SQL Model...")
mistral_model = AutoPeftModelForCausalLM.from_pretrained(PEFT_MODEL_ID)
print('\n')

# Step 2: tokenizer taken from the same Hub repository.
_announce("Loading Mistral Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(PEFT_MODEL_ID)
print('\n')

# Step 3: wrap the model loaded above with the adapter checkpoint stored on Drive.
_announce("Loading GNNT2SQL Model...")
model_id ='/content/gdrive/MyDrive/model/GNNT2SQL/checkpoint-1950/'
logging.info(f"Loading fine-tuned PEFT model from: {model_id}")
model = PeftModel.from_pretrained(mistral_model, model_id)
print('\n')

### ONLY WITH HF MODEL ######
# Alternative kept for reference: evaluate the Hub model without the Drive checkpoint.
#print('\n')
#print("Loading Mistral-T2SQL Model...")
#PEFT_MODEL_ID = "frankmorales2020/Mistral-7B-text-to-sql-flash-attention-2-dataeval"
#model = AutoPeftModelForCausalLM.from_pretrained(PEFT_MODEL_ID)
#tokenizer = AutoTokenizer.from_pretrained(PEFT_MODEL_ID)
#print('\n')

# Step 4: text-generation pipeline that uses the sampling settings as its defaults.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, **GENERATION_PARAMS)
logging.info("Model and tokenizer loaded successfully!")

# 4. ChromaDB Setup
client = chromadb.PersistentClient(path='db')  # Store embeddings on disk
collection = client.get_or_create_collection(name="sql_queries_and_embeddings")

print('\n')
print("Loading ChromaDB queries...")
# Index the reference SQL of every sample: messages[2]["content"] is the same
# field evaluate() reads as the original answer. Samples without it are skipped.
embedding_model = SentenceTransformer("all-mpnet-base-v2")
original_sql_queries = [
    item['messages'][2]['content']
    for item in eval_dataset if len(item['messages']) > 2 and item['messages'][2].get('content')
]

if original_sql_queries:
    sql_embeddings = embedding_model.encode(original_sql_queries).tolist()
    # upsert instead of add: the collection lives on disk and the ids are
    # deterministic ("original_<i>"), so re-running this cell (or running it with a
    # different dataset) must overwrite existing entries rather than collide with them.
    # NOTE(review): ids beyond the current sample count left by an earlier, larger
    # run are not removed here — delete the 'db' directory for a clean index.
    collection.upsert(
        embeddings=sql_embeddings,
        metadatas=[{"original_sql": query} for query in original_sql_queries],
        ids=[f"original_{i}" for i in range(len(original_sql_queries))]  # Unique IDs
    )
else:
    # Nothing to index; skip the write instead of passing empty lists to ChromaDB.
    sql_embeddings = []
    logging.warning("No reference SQL queries found in the evaluation dataset; nothing was indexed.")
print('\n')

# PostgreSQL Setup

In [None]:
#ADDED By FM 01/06/2024
!apt-get update -y
!apt-get install postgresql-14 -y

!service postgresql restart
!sudo apt install postgresql-server-dev-all

In [None]:
# PostgreSQL settings: run two statements through psql as the `postgres` OS user.
# NOTE(review): psql is already connecting as `postgres` here, so the CREATE USER
# statement presumably just reports that the role exists — confirm it is needed.
!sudo -u postgres psql -c "CREATE USER postgres WITH SUPERUSER"
# Set the password that the connection settings later in this notebook use.
!sudo -u postgres psql -c "ALTER USER postgres PASSWORD 'postgres'"

In [9]:
# Sample DDL statement; passed to table_creator() in a later cell.
QUERY_create='CREATE TABLE table_name_24 (score VARCHAR, date VARCHAR)'

In [10]:
# Sample SELECT statement. It targets table_name_50, not the table_name_24 created
# by QUERY_create, and is not used in the visible part of this notebook.
QUERY_select='SELECT 2009 FROM table_name_50 WHERE 2011 = "a"'

In [11]:
def table_creator(query):
    """Execute a schema statement against the local PostgreSQL database.

    Opens a fresh connection, runs ``query`` verbatim, commits on success and
    rolls back on failure. The outcome is printed; errors are reported rather
    than raised. The connection is always closed before returning.

    Args:
        query: SQL text to execute (for example ``CREATE TABLE ...``, optionally
            followed by ``INSERT`` statements). It is executed as-is, so it must
            come from a trusted source.

    Returns:
        None.

    Raises:
        psycopg2.Error: if the connection itself cannot be established.
    """
    import psycopg2 as ps

    DB_NAME = "postgres"
    DB_USER = "postgres"
    DB_PASS = "postgres"
    DB_HOST = "localhost"
    DB_PORT = "5432"

    conn = ps.connect(database=DB_NAME,
                      user=DB_USER,
                      password=DB_PASS,
                      host=DB_HOST,
                      port=DB_PORT)

    try:
        # The cursor context manager closes the cursor even when execute() fails.
        with conn.cursor() as cur:
            cur.execute(query)
        conn.commit()
        print("Table Created successfully")
    except Exception as e:
        conn.rollback()  # Leave the database untouched when the statement fails
        print("Error creating table:", e)
    finally:
        # Always release the connection, including when commit/rollback raise.
        conn.close()

In [12]:
import os
import psycopg2 as ps
import pandas as pd

# Connection settings for the local PostgreSQL server; read by table_select().
DB_NAME = "postgres"
DB_USER = "postgres"
DB_PASS = "postgres"
DB_HOST = "localhost"
DB_PORT = "5432"

In [13]:
import os
import psycopg2 as ps
import pandas as pd

def table_select(query):
    """Execute ``query`` on the local PostgreSQL database and report whether it ran.

    Connects with the module-level ``DB_*`` settings, executes the statement,
    prints the number of returned rows followed by the rows themselves, and
    commits. If execution fails, the error is printed and the transaction is
    rolled back. The connection is always closed before returning.

    Args:
        query: SQL text to execute verbatim.

    Returns:
        1 if the statement executed without error, 0 otherwise.

    Raises:
        psycopg2.Error: if the connection itself cannot be established.
    """
    conn = ps.connect(database=DB_NAME,
                      user=DB_USER,
                      password=DB_PASS,
                      host=DB_HOST,
                      port=DB_PORT)
    print("Database connected successfully")

    try:
        with conn.cursor() as cur:
            cur.execute(query)
            # cursor.description is None for statements that return no rows
            # (INSERT/UPDATE/DELETE/DDL); calling fetchall() on those raises, which
            # previously made a valid statement count as a failed execution.
            rows = cur.fetchall() if cur.description is not None else []
        conn.commit()
    except Exception as e:
        # Roll back instead of committing so a failed statement leaves no
        # partial changes behind.
        conn.rollback()
        print("Error executing query:", e)
        return 0
    finally:
        conn.close()

    print('\n')
    print('Record(s): %s \n'%len(rows))
    for row in rows:
        print(row)

    return 1

In [14]:
# Run the sample DDL through table_creator().
table_creator(QUERY_create)

Table Created successfully


# Model Evaluator

In [15]:
# 6. Evaluation Function (exact match first, then ChromaDB nearest-neighbour match)
def evaluate(sample):
    """Generate SQL for one sample, score it, and try to execute it.

    The first two messages of the sample are rendered with the chat template and
    sent to the generation pipeline. The generated text counts as a match when it
    equals the reference answer exactly, or when the entry in the ChromaDB
    collection nearest to its embedding is the sample's own reference answer.
    On a match the sample's schema is created with ``table_creator`` and a query
    is executed with ``table_select``.

    Args:
        sample: mapping with a ``"messages"`` list whose items have a
            ``"content"`` field: index 0 holds the prompt containing the schema,
            index 1 the question, index 2 the reference SQL.

    Returns:
        Tuple ``(match, eqc)``: ``match`` is 1 for an exact or nearest-neighbour
        match and 0 otherwise; ``eqc`` is the value returned by ``table_select``
        (1 executed, 0 failed) and stays 0 when nothing was executed.
    """
    # Offset at which the schema DDL starts inside messages[0]["content"].
    # NOTE(review): assumes a fixed 153-character preamble in the prompt —
    # confirm against the prompt template used to build the dataset.
    schema_offset = 153

    eqc = 0
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)
    # Reuse the shared sampling settings instead of repeating their values here.
    outputs = pipe(
        prompt,
        **GENERATION_PARAMS,
        eos_token_id=pipe.tokenizer.eos_token_id,
        pad_token_id=pipe.tokenizer.pad_token_id,
    )
    # The pipeline output starts with the prompt; keep only the continuation.
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()

    question = sample["messages"][1]["content"]
    original_answer = sample["messages"][2]["content"]
    schema_query = sample["messages"][0]['content'][schema_offset:]

    if predicted_answer == original_answer:

        print("\n")
        print('MATCH')
        print("\n")

        print(f'Question: {question}')
        print(f'Original Answer: {original_answer}')

        print("\n")
        print(f'SCHEMA QUERY: {schema_query}')
        table_creator(schema_query)
        print("\n")
        print(f'Generated Answer: {predicted_answer}')
        eqc = table_select(predicted_answer)
        print(eqc)
        print("\n")
        if int(eqc) == 1:
            print('Good Query execution')
        else:
            print('Bad Query execution')

        return 1, eqc

    # Not an exact match: look up the nearest stored reference query in ChromaDB.
    predicted_embedding = embedding_model.encode([predicted_answer]).tolist()[0]
    results = collection.query(
        query_embeddings=[predicted_embedding],
        n_results=1,
        include=["distances", "metadatas"]
    )
    if not results['distances'] or not results['distances'][0]:
        # Empty collection: there is nothing to compare against.
        print('NO MATCH')
        return 0, eqc

    closest_distance = results['distances'][0][0]
    most_similar_query = results['metadatas'][0][0]['original_sql']
    print("\n")
    print(f'Closest Distance by ChromaDB: {closest_distance}')

    # The distance is only reported; the match criterion is that the nearest
    # stored query is this sample's own reference answer.
    if most_similar_query != original_answer:
        print('NO MATCH')
        return 0, eqc

    print("\n")
    print('MATCH (Semantically Similar by ChromaDB)')
    print("\n")

    print("\n")
    print(f'Question: {question}')
    print(f'SCHEMA: {schema_query}')
    print(f'Original Answer: {original_answer}')
    print("\n\n")
    print(f'Generated Answer: {predicted_answer}')
    print("\n")

    print("\n")
    print(f'SCHEMA QUERY: {schema_query}')
    table_creator(schema_query)
    print("\n")

    print('Similar Query:', most_similar_query)
    # NOTE(review): this executes the stored reference query (equal to
    # original_answer in this branch), not predicted_answer, so eqc here reflects
    # the reference SQL rather than the generated SQL — confirm this is intended.
    eqc = table_select(most_similar_query)
    print("\n")
    if int(eqc) == 1:
        print('Good Query execution')
    else:
        print('Bad Query execution')
    print("\n")
    return 1, eqc

# 7. Main Evaluation Loop
# success_rate collects one (match, query_ok) tuple per sample that evaluate()
# handled without raising; success_rate_query is declared but not filled here.
success_rate = []
success_rate_query = []

for i, s in enumerate(tqdm(eval_dataset)):
    print()
    print(f"EVALUATING SAMPLE: {i}")
    try:
        outcome = evaluate(s)
    except Exception as e:
        # A failing sample is logged and left out of the results.
        logging.error(f"Error evaluating sample {i}: {e}")
    else:
        success_rate.append(outcome)



# 8. Compute and Print Accuracy
# NOTE: both ratios are taken over the samples that evaluated without error,
# not over the full dataset.
if success_rate:
    # First tuple element: 1 when the sample counted as a match.
    match_successes = [pair[0] for pair in success_rate]
    accuracy = sum(match_successes) / len(success_rate)
    print(f"\nMatch Accuracy: {accuracy:.2%}\n")

    # Second tuple element: 1 when the executed query ran without error.
    query_successes = [pair[1] for pair in success_rate]
    accuracy = sum(query_successes) / len(query_successes)
    print(f"\nQuery Successes: {accuracy:.2%}\n")

else:
    print("\nNo samples were successfully evaluated. Check the dataset and evaluation logic.\n")


# 8. Compute and Print Accuracy
#if len(success_rate) > 0:
#    accuracy = sum(success_rate) / len(success_rate)
#    print(f"\nMatch Accuracy: {accuracy:.2%}\n")
#else:
#    print("\nNo samples were successfully evaluated. Check the dataset and evaluation logic.\n")

  0%|          | 0/25 [00:00<?, ?it/s]


EVALUATING SAMPLE: 0


Closest Distance by ChromaDB: 0.23844483000562397


MATCH (Semantically Similar by ChromaDB)




Question: What is the percentage of successful open data initiatives in the education sector?
SCHEMA: CREATE TABLE open_data_initiatives (id INT, sector VARCHAR(20), status VARCHAR(10)); INSERT INTO open_data_initiatives (id, sector, status) VALUES (1, 'justice', 'open'), (2, 'transportation', 'open'), (3, 'education', 'closed'), (4, 'education', 'open');
Original Answer: SELECT 100.0 * COUNT(CASE WHEN status = 'open' THEN 1 END) / COUNT(*) FROM open_data_initiatives WHERE sector = 'education';



Generated Answer: SELECT COUNT(*) FROM open_data_initiatives WHERE sector = 'education' AND status = 'open' DIVIDE SELECT COUNT(*) FROM open_data_initiatives WHERE sector = 'education'




SCHEMA QUERY: CREATE TABLE open_data_initiatives (id INT, sector VARCHAR(20), status VARCHAR(10)); INSERT INTO open_data_initiatives (id, sector, status) VALUES (1, 'justice', 'open'), (2