<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/AGENTEVAL_DEMO_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install peft -q
!pip install bitsandbytes -q
!pip install transformers -q
!pip install sentence-transformers -q
!pip install datasets -q
!pip install tqdm -q

!pip install colab-env -q
import colab_env

In [2]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
from datasets import load_dataset
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used to compare generated answers with reference answers.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [3]:
class EvaluationAgent:
    """Score a fine-tuned PEFT causal LM by semantic similarity to reference answers.

    For every evaluated sample the agent builds a chat prompt from the first two
    entries of ``sample["messages"]``, generates a completion with a
    ``text-generation`` pipeline, embeds both the completion and the reference
    answer (``sample["messages"][2]["content"]``) and counts the sample as a
    success when their cosine similarity reaches ``similarity_threshold``.

    The class relies on the module-level names ``embedding_model`` (must expose
    ``encode(text, convert_to_tensor=True)``), ``util`` (``cos_sim``) and
    ``tqdm`` being defined before its methods are called.
    """

    # Default cosine-similarity cut-off above which an answer counts as correct.
    similarity_threshold = 0.8

    def __init__(self, model_id, eval_dataset, number_of_eval_samples, similarity_threshold=None):
        """Load tokenizer, PEFT model and generation pipeline.

        Args:
            model_id: Path or hub id of the PEFT adapter checkpoint to evaluate.
            eval_dataset: Dataset of samples; must support ``len()``,
                ``shuffle()`` and ``select(indices)``.
            number_of_eval_samples: How many randomly chosen samples to score.
            similarity_threshold: Optional override of the class-level default
                cosine-similarity cut-off (0.8).
        """
        self.model_id = model_id
        self.eval_dataset = eval_dataset
        self.number_of_eval_samples = number_of_eval_samples
        if similarity_threshold is not None:
            self.similarity_threshold = similarity_threshold

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        on_gpu = self.device.type == "cuda"

        # 1. Load the tokenizer first.
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

        # 2. Then load the model on the detected device (previously "cuda" was
        #    hard-coded, which fails on CPU-only runtimes). Half precision is
        #    only used on GPU; CPU falls back to float32.
        self.model = AutoPeftModelForCausalLM.from_pretrained(
            model_id,
            device_map=self.device.type,
            torch_dtype=torch.float16 if on_gpu else torch.float32,
        )

        # 3. Wrap both in a text-generation pipeline.
        self.pipe = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)

    def evaluate(self, sample):
        """Generate an answer for one sample and compare it with the reference.

        Args:
            sample: Mapping with a ``"messages"`` list of at least three
                ``{"role", "content"}`` dicts; the first two form the prompt
                and the third holds the reference answer.

        Returns:
            1 if the cosine similarity between the generated and the reference
            answer is >= ``similarity_threshold``, otherwise 0.

        Note:
            Generation uses sampling (``do_sample=True``), so repeated calls on
            the same sample may yield different results.
        """
        tokenizer = self.pipe.tokenizer
        prompt = tokenizer.apply_chat_template(
            sample["messages"][:2], tokenize=False, add_generation_prompt=True
        )

        # Some tokenizers define no pad token; fall back to EOS instead of passing None.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id

        outputs = self.pipe(
            prompt,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=pad_token_id,
        )
        # The pipeline echoes the prompt; keep only the newly generated part.
        predicted_answer = outputs[0]["generated_text"][len(prompt):].strip()
        reference_answer = sample["messages"][2]["content"]

        predicted_embedding = embedding_model.encode(predicted_answer, convert_to_tensor=True)
        reference_embedding = embedding_model.encode(reference_answer, convert_to_tensor=True)
        cosine_similarity = util.cos_sim(predicted_embedding, reference_embedding).item()

        return 1 if cosine_similarity >= self.similarity_threshold else 0

    def evaluation_loop(self):
        """Evaluate a random subset of the agent's dataset and print the accuracy.

        The subset is drawn from ``self.eval_dataset`` (not from a notebook
        global) and its size is capped at the dataset length.

        Returns:
            The fraction of samples judged semantically similar (0.0 - 1.0).

        Raises:
            ValueError: If there is no sample to evaluate (empty dataset or a
                non-positive ``number_of_eval_samples``).
        """
        sample_count = min(self.number_of_eval_samples, len(self.eval_dataset))
        if sample_count <= 0:
            raise ValueError(
                "No evaluation samples available: check eval_dataset and number_of_eval_samples."
            )

        subset = self.eval_dataset.shuffle().select(range(sample_count))
        outcomes = [self.evaluate(sample) for sample in tqdm(subset)]

        accuracy = sum(outcomes) / len(outcomes)

        print('\n')
        print(f"Accuracy: {accuracy*100:.2f}%")
        return accuracy

In [None]:
from datasets import load_dataset


# System prompt template; `{schema}` is filled with each sample's table definition.
system_message = (
    "You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\n"
    "SCHEMA:\n"
    "{schema}"
)


def create_conversation(sample):
    """Turn one raw sample into an OpenAI-style chat record.

    Reads the ``context``, ``question`` and ``answer`` fields of ``sample`` and
    returns ``{"messages": [system, user, assistant]}``.
    """
    system_turn = {"role": "system", "content": system_message.format(schema=sample["context"])}
    user_turn = {"role": "user", "content": sample["question"]}
    assistant_turn = {"role": "assistant", "content": sample["answer"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Tunable settings for the data preparation below.
SEED = 42                      # fixes the shuffle and the split so reruns evaluate the same samples
NUM_SAMPLES = 12_500           # size of the random subset drawn from the hub dataset
NUM_TEST_SAMPLES = 2_500       # held-out samples; the remaining 10,000 form the train split
TEST_DATASET_PATH = "test_dataset.json"

# Load dataset from the hub and draw a reproducible random subset
raw_dataset = load_dataset("b-mc2/sql-create-context", split="train")
sampled_dataset = raw_dataset.shuffle(seed=SEED).select(range(NUM_SAMPLES))

# Convert dataset to OAI messages, dropping the original columns
chat_dataset = sampled_dataset.map(
    create_conversation,
    remove_columns=sampled_dataset.column_names,
    batched=False,
)

# Split into 10,000 training samples and 2,500 test samples
dataset = chat_dataset.train_test_split(test_size=NUM_TEST_SAMPLES, seed=SEED)
assert len(dataset["test"]) == NUM_TEST_SAMPLES

print(dataset["train"][345]["messages"])

# Save the test split to disk (the train split is not persisted here)
dataset["test"].to_json(TEST_DATASET_PATH, orient="records")

# Load our test dataset
eval_dataset = load_dataset("json", data_files=TEST_DATASET_PATH, split="train")

# NOTE(review): the split is drawn in this notebook; if the adapter below was fine-tuned on a
# different draw of the same hub dataset, some test samples may have been seen in training — confirm.

# Fine-tuned PEFT adapter to evaluate
initial_peft_model_id = "/content/gdrive/MyDrive/model/Mistral-7B-text-to-sql-flash-attention-2"
# Alternative checkpoint:
#initial_peft_model_id = "/content/gdrive/MyDrive/model/results-MODEL/checkpoint-1250"

number_of_eval_samples=5
agenteval=EvaluationAgent(initial_peft_model_id,eval_dataset,number_of_eval_samples)

In [5]:
# Score the randomly selected test samples with the fine-tuned model and print the accuracy.
agenteval.evaluation_loop()

100%|██████████| 5/5 [00:14<00:00,  2.88s/it]



Accuracy: 100.00%





NEW