In [19]:
# required imports/libraries
import os
import torch
import pandas as pd
from torch import tensor
from dotenv import load_dotenv
from openai import OpenAI
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util

In [20]:
# load .env file values 
load_dotenv()

# insert OpenAI API key in here from .env
openai_api_key = os.getenv('OPENAI_KEY')

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
autoModel = AutoModel.from_pretrained("bert-base-uncased")
# model = SentenceTransformer("all-MiniLM-L6-v2")

In [21]:
# instantiate OpenAI client with API key
client = OpenAI(
    api_key=openai_api_key
)

In [22]:
# Function to encode a single string
def encode_text(text):
    # Tokenize the input text
    encoded_input = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    
    # Get the model output
    with torch.no_grad():
        output = autoModel(**encoded_input)
    
    # Use the mean pooling of token embeddings as the sentence embedding
    sentence_embedding = output.last_hidden_state.mean(dim=1)
    return sentence_embedding

In [23]:
# tests accuracy of chosen model against unique prompt and data
def evaluation(data, evaluation_prompt, model_name):

    # Create a DataFrame to store results
    test = pd.DataFrame(columns=['Question', 'Expected Answer', 'GPT Answer', 'Similarity'])

    # Iterate through dataset and generate GPT answers
    for index, row in data.iterrows():
        # Extract the question
        question = row['Question']
        expected_answer = row['Answer']

        # Generate response from GPT model
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": evaluation_prompt},
                {"role": "user", "content": question}
            ]
        )
        gpt_answer = completion.choices[0].message.content.strip()

        # Add results to DataFrame
        test.loc[index] = [question, expected_answer, gpt_answer, None]

    return test

In [24]:
# retrieves data to test accuracy against model of choice
df = pd.read_csv('../../data/evaluation_data.csv')
df = df.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,Question,Answer
0,Abraham Lincoln sixteenth President United States,yes
1,Lincoln sign National Banking Act 1863,yes
2,mother die pneumonia,
3,many long Lincolns formal education,18 months
4,Lincoln begin political career,1832
...,...,...
913,Wilson president American Political Science As...,Yes
914,cast ballot John Palmer presidential candidate...,Yes
915,Wilson spend 1914 beginning 1917 trying keep A...,Yes
916,Wilson staunch opponent antisemitism sympathet...,Yes


In [25]:
# test chat bot with general model or fine-tuning model of choice
# *model* should include available OpenAI models for evaluation from link above
model = "gpt-3.5-turbo-0125"
evaluation_prompt = "Answer questions from this wikipedia dataset"

In [26]:
# execute function for testing model against unique samples
result = evaluation(df, evaluation_prompt, model)

In [27]:
result

Unnamed: 0,Question,Expected Answer,GPT Answer,Similarity
0,Abraham Lincoln sixteenth President United States,yes,Abraham Lincoln was the 16th President of the ...,
1,Lincoln sign National Banking Act 1863,yes,The National Banking Act of 1863 was signed in...,
2,mother die pneumonia,,I'm sorry to hear that. Pneumonia is a serious...,
3,many long Lincolns formal education,18 months,Abraham Lincoln had very little formal educati...,
4,Lincoln begin political career,1832,Abraham Lincoln started his political career i...,
...,...,...,...,...
913,Wilson president American Political Science As...,Yes,Woodrow Wilson served as the president of the ...,
914,cast ballot John Palmer presidential candidate...,Yes,"I'm sorry, I couldn't find any information spe...",
915,Wilson spend 1914 beginning 1917 trying keep A...,Yes,It seems like there might be a typo in your qu...,
916,Wilson staunch opponent antisemitism sympathet...,Yes,It seems like you are referring to Woodrow Wil...,


In [28]:
# Clean and preprocess the data
result['Expected Answer'] = result['Expected Answer'].fillna("").astype(str)
result['GPT Answer'] = result['GPT Answer'].fillna("").astype(str)

In [29]:
# Generate embeddings for 'Expected Answer'
dataset_embeddings = torch.stack([encode_text(answer) for answer in result['Expected Answer'].tolist()])

# Generate embeddings for 'GPT Answer'
gpt_embeddings = torch.stack([encode_text(answer) for answer in result['GPT Answer'].tolist()])

In [30]:
# Example comparison (cosine similarity)
cosine_sim = torch.nn.functional.cosine_similarity(dataset_embeddings, gpt_embeddings)
print("Cosine Similarities:", cosine_sim)

Cosine Similarities: tensor([[-1.,  1.,  1.,  ..., -1., -1.,  1.],
        [-1.,  1.,  1.,  ..., -1., -1.,  1.],
        [ 1.,  1., -1.,  ..., -1.,  1., -1.],
        ...,
        [-1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [-1.,  1.,  1.,  ..., -1., -1.,  1.],
        [-1., -1., -1.,  ...,  1.,  1., -1.]])


In [31]:
result['Similarity'] = cosine_sim

# Compute accuracy based on similarity threshold (e.g., 0.8)
threshold = 0.8
correct = result[result['Similarity'] > threshold].shape[0]
total = result.shape[0]
accuracy = correct / total * 100

In [32]:
accuracy

42.70152505446623

In [33]:
# accuracy result 43.4640522875817
print(f"Accuracy: {accuracy * 1:.2f}%")

Accuracy: 42.70%
