<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/AGENTEVAL_DEMO_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install peft -q
!pip install bitsandbytes -q
!pip install transformers -q
!pip install sentence-transformers -q
!pip install datasets -q
!pip install tqdm -q

!pip install colab-env -q
import colab_env

In [4]:
!nvidia-smi

Sat Feb 22 09:28:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             51W /  400W |   19887MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## EvaluationAgent - SIMPLE

In [2]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
from datasets import load_dataset
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained sentence transformer model
embedding_model = SentenceTransformer('all-mpnet-base-v2')

In [3]:
class EvaluationAgent:
    def __init__(self, model_id, eval_dataset, number_of_eval_samples):
        self.model_id = model_id
        self.eval_dataset = eval_dataset
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

         # 1. Load Tokenizer first
        self.tokenizer = AutoTokenizer.from_pretrained(model_id) # Initialize tokenizer here

        self.number_of_eval_samples=number_of_eval_samples

        # 2. Then Load Model (using AutoPeftModelForCausalLM)
        self.model = AutoPeftModelForCausalLM.from_pretrained(
            model_id,
            device_map="cuda",
            torch_dtype=torch.float16,
        )

        # load into pipeline
        self.pipe = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)

    def evaluate(self, sample):
        prompt =  self.pipe.tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)
        outputs = self.pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, eos_token_id=self.pipe.tokenizer.eos_token_id, pad_token_id=self.pipe.tokenizer.pad_token_id)
        predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()

        # Generate embeddings using the embedding_model
        embedding1 = embedding_model.encode(predicted_answer, convert_to_tensor=True)
        embedding2 = embedding_model.encode(sample["messages"][2]["content"], convert_to_tensor=True)


        #print(f"Query:\n{sample['messages'][1]['content']}")
        #print(f"Original Answer:\n{sample['messages'][2]['content']}")
        #print(f"Generated Answer:\n{predicted_answer}")

        # Calculate cosine similarity
        cosine_similarity = util.cos_sim(embedding1, embedding2).item()

        # Define a threshold for semantic similarity (e.g., 0.8)
        threshold = 0.8

        # Check if similarity is above the threshold
        if cosine_similarity >= threshold:
            return 1  # Semantically similar
        else:
            return 0  # Semantically dissimilar


    def evaluation_loop(self):
        success_rate = []
        number_of_eval_samples = self.number_of_eval_samples
        # iterate over eval dataset and predict
        for s in tqdm(eval_dataset.shuffle().select(range(number_of_eval_samples))):
            success_rate.append(self.evaluate(s))

        # compute accuracy
        accuracy = sum(success_rate)/len(success_rate)

        print('\n')
        #print(f"Success rate: {success_rate}")
        print(f"Accuracy: {accuracy*100:.2f}%")

In [None]:
from datasets import load_dataset


# Convert dataset to OAI messages
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle().select(range(12500))

# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)

# split dataset into 10,000 training samples and 2,500 test samples
dataset = dataset.train_test_split(test_size=2500/12500)

print(dataset["train"][345]["messages"])

# save datasets to disk
dataset["test"].to_json("test_dataset.json", orient="records")

# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")

# Example usage
#fine tune model
initial_peft_model_id = "/content/gdrive/MyDrive/model/Mistral-7B-text-to-sql-flash-attention-2"
#initial_peft_model_id = "/content/gdrive/MyDrive/model/results-MODEL/checkpoint-1250

number_of_eval_samples=5
agenteval=EvaluationAgent(initial_peft_model_id,eval_dataset,number_of_eval_samples)

In [5]:
agenteval.evaluation_loop()

100%|██████████| 5/5 [00:14<00:00,  2.88s/it]



Accuracy: 100.00%





## EvaluationAgent - OODA

In [None]:
# install necessary libraries
!pip install peft -q
!pip install bitsandbytes -q
!pip install transformers -q
!pip install sentence-transformers -q
!pip install datasets -q
!pip install tqdm -q
!pip install colab-env -q

import colab_env
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
from datasets import load_dataset
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained sentence transformer model
embedding_model = SentenceTransformer('all-mpnet-base-v2')

class EvaluationAgent:
    def __init__(self, model_id, eval_dataset,batch_size=16): #Added batch size
        self.model_id = model_id
        self.eval_dataset = eval_dataset
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoPeftModelForCausalLM.from_pretrained(
            model_id,
            device_map="cuda",
            torch_dtype=torch.float16,
        )
        self.pipe = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)

        #self.number_of_eval_samples = number_of_eval_samples
        self.embedding_model = embedding_model #changed to the object outside the class

        self.batch_size=batch_size #add the batch size to our object

    def observe(self, sample):
        # Gather data and observations
        # corrected the prompt to pass in a list of messages
        prompt = self.pipe.tokenizer.apply_chat_template(sample['messages'][:2], tokenize=False, add_generation_prompt=True)
        return prompt

    def orient(self, prompt):
        # Analyze and synthesize information
        # (In this example, orientation might involve analyzing the prompt)
        return

    def decide(self, prompt):
        # Make decisions based on available information
        outputs = self.pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.5, top_k=50, top_p=0.95, eos_token_id=self.pipe.tokenizer.eos_token_id, pad_token_id=self.pipe.tokenizer.pad_token_id) #added all arguments that were previously in the code
        predicted_answer = outputs[0]['generated_text'][len(prompt):].strip() #output is a list of one object so select the first object
        return predicted_answer

    def act(self, predicted_answer, sample):
        # Take action based on decisions
        # (In this case, compare the predicted answer with the original)
        embedding1 = self.embedding_model.encode(predicted_answer, convert_to_tensor=True)
        embedding2 = self.embedding_model.encode(sample['messages'][2]['content'], convert_to_tensor=True) #pass in third object of list for comparison

        cosine_similarity = util.cos_sim(embedding1, embedding2).item()
        threshold = 0.8

        if cosine_similarity >= threshold:
            return 1  # Semantically similar
        else:
            return 0  # Semantically dissimilar

    def evaluate(self, sample):
        # OODA loop within the evaluation process
        prompt = self.observe(sample)
        self.orient(prompt)
        predicted_answer = self.decide(prompt)
        return self.act(predicted_answer, sample)

    def evaluation_loop(self):
        success_rate = []
        # Iterate over data in batches
        for i in tqdm(range(0, len(self.eval_dataset), self.batch_size)): # use self.eval_dataset and the class variable
            batch = self.eval_dataset.select(range(i, min(i + self.batch_size, len(self.eval_dataset))))

            # Generate predictions for the batch in parallel
            # Changed the following line to only pass the first two messages
            prompts = [self.pipe.tokenizer.apply_chat_template(sample['messages'][:2], tokenize=False, add_generation_prompt=True) for sample in batch]
            outputs = self.pipe(prompts, max_new_tokens=256, do_sample=True, temperature=0.5)

            # Process each sample in the batch
            for j, sample in enumerate(batch):
                predicted_answer = outputs[j][0]['generated_text'][len(prompts[j]):].strip() #outputs[j] is now a list, not a dict so we must index into it first
                success_rate.append(self.act(predicted_answer, sample))

        accuracy = sum(success_rate) / len(success_rate)
        print('\n')
        print(f"Accuracy: {accuracy*100:.2f}%")

# Convert dataset to OAI messages
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle().select(range(12500))

# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features, batched=False)

# split dataset into 10,000 training samples and 2,500 test samples
train_test_dataset = dataset.train_test_split(test_size=2500/12500)

# Get the test dataset
eval_dataset = train_test_dataset["test"]

# Example usage
initial_peft_model_id = "/content/gdrive/MyDrive/model/Mistral-7B-text-to-sql-flash-attention-2"
batch_size=16 # 2,500 test samples / 16 = 157

agenteval = EvaluationAgent(initial_peft_model_id, eval_dataset,batch_size=batch_size) #added batch_size

* You seem to be using the pipelines sequentially on GPU: https://stackoverflow.com/questions/77159136/efficiently-using-hugging-face-transformers-pipelines-on-gpu-with-large-datasets

In [3]:
import time

start_time = time.time()
agenteval.evaluation_loop()
end_time = time.time()

processing_time = end_time - start_time
print("Processing time:", processing_time)

  6%|▋         | 10/157 [07:05<1:51:49, 45.65s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 157/157 [1:44:39<00:00, 40.00s/it]



Accuracy: 95.60%
Processing time: 6279.85951757431



