In [2]:
# required imports/libraries
import os
import re
import torch
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from torch import tensor
from rouge_score import rouge_scorer
from dotenv import load_dotenv
from openai import OpenAI
from transformers import AutoTokenizer, AutoModel
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from sentence_transformers import SentenceTransformer, util

In [3]:
# load .env file values 
load_dotenv()

# insert OpenAI API key in here from .env
openai_api_key = os.getenv('OPENAI_KEY')

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
autoModel = AutoModel.from_pretrained("bert-base-uncased")
# model = SentenceTransformer("all-MiniLM-L6-v2")

In [4]:
# instantiate OpenAI client with API key
client = OpenAI(
    api_key=openai_api_key
)

In [5]:
def bleu_average(data):
    bleu_scores = []
    weights = (0.1, 0, 0, 0)  # Weights for uni-gram, bi-gram, tri-gram, and 4-gram
    smooth_fn = SmoothingFunction().method1
    references = np.array(data['Expected Answer'])
    predictions = np.array(data['GPT Answer'])
    
    for index in range(len(references)):
        
        reference = references[index].split()
        prediction = predictions[index].split()

        score = sentence_bleu(reference, prediction, weights=weights, smoothing_function=smooth_fn)
        # print(f'Score: {score}')
        bleu_scores.append(score)

    return np.mean(np.array(bleu_scores))

In [6]:
def rouge_average(data):
    rougeL_scores = []
    rouge1_scores = []
    weights = (0.1, 0, 0, 0)  # Weights for uni-gram, bi-gram, tri-gram, and 4-gram
    smooth_fn = SmoothingFunction().method1
    references = np.array(data['Expected Answer'])
    predictions = np.array(data['GPT Answer'])
    
    for index in range(len(references)):
        
        reference = references[index]
        prediction = predictions[index]

        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        scores = scorer.score(reference, prediction)
        rouge1_scores.append(scores['rouge1'])
        rougeL_scores.append(scores['rougeL'])
        # print(f"ROUGE-1 F1 Score: {scores['rouge1'].fmeasure:.2f}")
        # print(f"ROUGE-L F1 Score: {scores['rougeL'].fmeasure:.2f}")

    return np.mean(np.array(rougeL_scores)), np.mean(np.array(rouge1_scores))

In [7]:
# Function to encode a single string
def encode_text(text):
    # Tokenize the input text
    encoded_input = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    
    # Get the model output
    with torch.no_grad():
        output = autoModel(**encoded_input)
    
    # Use the mean pooling of token embeddings as the sentence embedding
    sentence_embedding = output.last_hidden_state.mean(dim=1)
    return sentence_embedding

In [8]:
# tests accuracy of chosen model against unique prompt and data
def evaluation(data, evaluation_prompt, model_name):

    # Create a DataFrame to store results
    test = pd.DataFrame(columns=['Question', 'Expected Answer', 'GPT Answer', 'Similarity'])

    # Iterate through dataset and generate GPT answers
    for index, row in data.iterrows():
        # Extract the question
        question = row['Question']
        expected_answer = row['Answer']
        print(f"Starting Completion at Index: {index}")

        # Generate response from GPT model
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": evaluation_prompt},
                {"role": "user", "content": question}
            ]
        )
        gpt_answer = completion.choices[0].message.content.strip()
        print(f'Finished Completion {index}!')
        # Add results to DataFrame
        test.loc[index] = [question, expected_answer, gpt_answer, None]

    return test

In [10]:
# retrieves data to test accuracy against model of choice
df = pd.read_csv('../../../data/validation/lingjoor_evaluation_data.csv')
df = df.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,Question,Answer
0,Write response veteran interested renting house,Hello Thank inquiry nice meet Wed love keep ho...
1,evergreen garden,order evergreen garden crucial buy right plant...
2,select tennis racket Im completely new sport,youre beginner tennis player pick racket help ...
3,country highest life expectancy,Life expectancy humans doubled last century th...
4,could Sunday,people work Sundays free pursue leisure activi...
...,...,...
995,Tom Billeter,Tom Billeter born February 12 1961 American co...
996,pros cons charging Tesla home,convenient way charge EV charge home overnight...
997,reasons people like visit Brazil,Brazil South American country diverse landscap...
998,species fish Tetra Quart,Tetra


In [11]:
# test chat bot with general model or fine-tuning model of choice
# *model* should include available OpenAI models for evaluation from link above
model = "gpt-3.5-turbo-0125"
evaluation_prompt = "Answer the following question concisely and accurately"

In [12]:
# execute function for testing model against unique samples
result = evaluation(df, evaluation_prompt, model)

Starting Completion at Index: 0
Finished Completion 0!
Starting Completion at Index: 1
Finished Completion 1!
Starting Completion at Index: 2
Finished Completion 2!
Starting Completion at Index: 3
Finished Completion 3!
Starting Completion at Index: 4
Finished Completion 4!
Starting Completion at Index: 5
Finished Completion 5!
Starting Completion at Index: 6
Finished Completion 6!
Starting Completion at Index: 7
Finished Completion 7!
Starting Completion at Index: 8
Finished Completion 8!
Starting Completion at Index: 9
Finished Completion 9!
Starting Completion at Index: 10
Finished Completion 10!
Starting Completion at Index: 11
Finished Completion 11!
Starting Completion at Index: 12
Finished Completion 12!
Starting Completion at Index: 13
Finished Completion 13!
Starting Completion at Index: 14
Finished Completion 14!
Starting Completion at Index: 15
Finished Completion 15!
Starting Completion at Index: 16
Finished Completion 16!
Starting Completion at Index: 17
Finished Completio

In [13]:
result

Unnamed: 0,Question,Expected Answer,GPT Answer,Similarity
0,Write response veteran interested renting house,Hello Thank inquiry nice meet Wed love keep ho...,"Sure thing. ""Thank you for your service. I'm i...",
1,evergreen garden,order evergreen garden crucial buy right plant...,An evergreen garden is a type of garden that c...,
2,select tennis racket Im completely new sport,youre beginner tennis player pick racket help ...,When selecting a tennis racket as a beginner i...,
3,country highest life expectancy,Life expectancy humans doubled last century th...,"As of 2021, the country with the highest life ...",
4,could Sunday,people work Sundays free pursue leisure activi...,"I'm sorry, could you please clarify your quest...",
...,...,...,...,...
995,Tom Billeter,Tom Billeter born February 12 1961 American co...,Tom Billeter is the founder and CEO of reqalla...,
996,pros cons charging Tesla home,convenient way charge EV charge home overnight...,Pros of charging a Tesla at home:\n1. Convenie...,
997,reasons people like visit Brazil,Brazil South American country diverse landscap...,People like to visit Brazil for its diverse cu...,
998,species fish Tetra Quart,Tetra,There seems to be a misunderstanding in your q...,


In [14]:
# Clean and preprocess the data
result['Expected Answer'] = result['Expected Answer'].fillna("").astype(str)
result['GPT Answer'] = result['GPT Answer'].fillna("").astype(str)

In [15]:
# Generate embeddings for 'Expected Answer'
dataset_embeddings = torch.stack([encode_text(answer) for answer in result['Expected Answer'].tolist()])

# Generate embeddings for 'GPT Answer'
gpt_embeddings = torch.stack([encode_text(answer) for answer in result['GPT Answer'].tolist()])

In [16]:
# Example comparison (cosine similarity)
cosine_sim = torch.nn.functional.cosine_similarity(dataset_embeddings, gpt_embeddings)
print("Cosine Similarities:", cosine_sim)

Cosine Similarities: tensor([[ 1., -1.,  1.,  ..., -1.,  1., -1.],
        [ 1.,  1.,  1.,  ...,  1.,  1., -1.],
        [ 1.,  1.,  1.,  ..., -1.,  1.,  1.],
        ...,
        [ 1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [ 1.,  1., -1.,  ..., -1., -1., -1.],
        [ 1.,  1.,  1.,  ..., -1.,  1., -1.]])


In [17]:
result['Similarity'] = cosine_sim

# Compute accuracy based on similarity threshold (e.g., 0.8)
threshold = 0.5
correct = result[result['Similarity'] > threshold].shape[0]
total = result.shape[0]
accuracy = correct / total * 100

In [18]:
accuracy

65.3

In [19]:
# cosine similarity accuracy
print(f"Accuracy: {accuracy * 1:.2f}%")

Accuracy: 65.30%


In [20]:
result

Unnamed: 0,Question,Expected Answer,GPT Answer,Similarity
0,Write response veteran interested renting house,Hello Thank inquiry nice meet Wed love keep ho...,"Sure thing. ""Thank you for your service. I'm i...",1.0
1,evergreen garden,order evergreen garden crucial buy right plant...,An evergreen garden is a type of garden that c...,1.0
2,select tennis racket Im completely new sport,youre beginner tennis player pick racket help ...,When selecting a tennis racket as a beginner i...,1.0
3,country highest life expectancy,Life expectancy humans doubled last century th...,"As of 2021, the country with the highest life ...",1.0
4,could Sunday,people work Sundays free pursue leisure activi...,"I'm sorry, could you please clarify your quest...",1.0
...,...,...,...,...
995,Tom Billeter,Tom Billeter born February 12 1961 American co...,Tom Billeter is the founder and CEO of reqalla...,1.0
996,pros cons charging Tesla home,convenient way charge EV charge home overnight...,Pros of charging a Tesla at home:\n1. Convenie...,1.0
997,reasons people like visit Brazil,Brazil South American country diverse landscap...,People like to visit Brazil for its diverse cu...,1.0
998,species fish Tetra Quart,Tetra,There seems to be a misunderstanding in your q...,1.0


In [21]:
# BLEU metric result
bleu_average(result)

0.3997270791196095

In [22]:
# ROUGE metric result
rouge_average(result)

(0.2306777540565854, 0.28967537346242517)