In [75]:
# Standard library imports
import os
from collections import Counter

# Third-party library imports
import torch
from torch import tensor
import pandas as pd
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
import tiktoken
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util

# LangChain-specific imports
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings


In [76]:
# load .env file values 
load_dotenv()

# Access the environment variables
openai_api_key = os.getenv('OPENAI_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_qa = os.getenv('PINECONE_QA')
pc_context = os.getenv('PINECONE_CONTEXT')

In [77]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
autoModel = AutoModel.from_pretrained("bert-base-uncased")

In [78]:
# Function to encode a single string
def encode_text(text):
    # Tokenize the input text
    encoded_input = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    
    # Get the model output
    with torch.no_grad():
        output = autoModel(**encoded_input)
    
    # Use the mean pooling of token embeddings as the sentence embedding
    sentence_embedding = output.last_hidden_state.mean(dim=1)
    return sentence_embedding

In [79]:
# instantiate OpenAI client with API key
client = OpenAI(
    api_key=openai_api_key
)

# initializing Pinecone vector databases instance
qaSearch = Pinecone(
    index_name=pc_qa,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key)
)

contextSearch = Pinecone(
    index_name=pc_context,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key)
)

In [80]:
# appends one/few shot examples to evaluation prompt
def few_shot(examples, evaluation_prompt):
   for example in examples:
       # separates example key and values
       split_ex = example.split(' - ')
       question = split_ex[0]
       label = split_ex[1]
       context_result = contextSearch.max_marginal_relevance_search(label, k=1, fetch_k=5)
       evaluation_prompt += '\n\nContext: \"' + context_result[0].page_content + "\"" + '\nQuestion: \"' + question + "\"" + '\n' + label
      #  evaluation_prompt += '\n\nQuestion: \"' + question + "\"" + '\n' + label

   evaluation_prompt += '\n\nAnswer the following question:'
   print(f'Finalized evaluation prompt... RAG complete!')

   return evaluation_prompt

In [81]:
# queries vector database for custom,
# with similar examples to user prompt
def rag(text):
    two_shots = []
    # queries Pinecone database
    print(f'Relevance Search...')
    qa_results = qaSearch.max_marginal_relevance_search(text, k=3, fetch_k=10)
    print(f'Finished querying!')
    for i in range(len(qa_results)):
        content = qa_results[i].page_content
        # prevents repetition which will cause errors within OpenAI
        two_shots.append(content)
        # two valid examples found
        if len(two_shots) == 2:
            break
    print(f'Appending shots...')
    return few_shot(two_shots, text)

In [82]:
# tests accuracy of chosen model against unique prompt and data
def evaluation(data, evaluation_prompt, model_name):

    test = pd.DataFrame(columns=['Question', 'Expected Answer', 'GPT Answer', 'Similarity'])
    for index, row in data.iterrows():
        # Extract the question
        question = row['Question']
        expected_answer = row['Answer']

        print(f"Starting Completion at Index: {index}")
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "system", 
                    "content": rag(evaluation_prompt)
                },
                {
                    "role": "user", 
                    "content": question
                }
            ]
        )
        print(f'Finished Completion {index}!')
        gpt_answer = completion.choices[0].message.content.strip()

        # append results
        test.loc[index] = [question, expected_answer, gpt_answer, None]

    return test

In [83]:
# retrieves data to test accuracy against model of choice
df_e = pd.read_csv('../../data/evaluation_data.csv')
df_c = pd.read_csv('../../data/context_data.csv')
df_c = df_c.drop(columns=['Unnamed: 0'])
df_e = df_e.drop(columns=['Unnamed: 0'])
df_c

Unnamed: 0,Passage
0,Uruguay official full name pron Eastern Republ...
1,bordered Brazil north Argentina across bank Ur...
2,Montevideo founded Spanish early 18th century ...
3,economy largely based agriculture making 10 GD...
4,According Transparency International Uruguay s...
...,...
3195,2007 duck Tallahassee Florida survived gunshot...
3196,rare genetic mutation sees ducks born four leg...
3197,Moche people ancient Peru worshipped nature Be...
3198,Angel Wing disease common ducks


In [84]:
# test chat bot with general model or fine-tuning model of choice
# *model* should include available OpenAI models for evaluation from link above
model = "gpt-4o"
evaluation_prompt = "You are a highly intelligent AI trained to provide detailed and accurate answers to user questions by leveraging a knowledge base. Below are examples of questions and their respective answers from the knowledge base. Keep the answers meaningful, but very short and concise"

In [85]:
# execute function for testing model against unique samples
result = evaluation(df_e, evaluation_prompt, model)

Starting Completion at Index: 0
Relevance Search...
Finished querying!
Appending shots...
Finalized evaluation prompt... RAG complete!
Finished Completion 0!
Starting Completion at Index: 1
Relevance Search...
Finished querying!
Appending shots...
Finalized evaluation prompt... RAG complete!
Finished Completion 1!
Starting Completion at Index: 2
Relevance Search...
Finished querying!
Appending shots...
Finalized evaluation prompt... RAG complete!
Finished Completion 2!
Starting Completion at Index: 3
Relevance Search...
Finished querying!
Appending shots...
Finalized evaluation prompt... RAG complete!
Finished Completion 3!
Starting Completion at Index: 4
Relevance Search...
Finished querying!
Appending shots...
Finalized evaluation prompt... RAG complete!
Finished Completion 4!
Starting Completion at Index: 5
Relevance Search...
Finished querying!
Appending shots...
Finalized evaluation prompt... RAG complete!
Finished Completion 5!
Starting Completion at Index: 6
Relevance Search...


In [86]:
result

Unnamed: 0,Question,Expected Answer,GPT Answer,Similarity
0,Abraham Lincoln sixteenth President United States,yes,"Yes, Abraham Lincoln was the sixteenth Preside...",
1,Lincoln sign National Banking Act 1863,yes,"Yes, President Abraham Lincoln signed the Nati...",
2,mother die pneumonia,,I'm sorry to hear about your loss. Pneumonia i...,
3,many long Lincolns formal education,18 months,Abraham Lincoln had about one year of formal s...,
4,Lincoln begin political career,1832,Abraham Lincoln began his political career in ...,
...,...,...,...,...
913,Wilson president American Political Science As...,Yes,Woodrow Wilson was indeed the president of the...,
914,cast ballot John Palmer presidential candidate...,Yes,John Palmer was the presidential candidate for...,
915,Wilson spend 1914 beginning 1917 trying keep A...,Yes,neutral through diplomacy and mediation.,
916,Wilson staunch opponent antisemitism sympathet...,Yes,Woodrow Wilson was a staunch opponent of antis...,


In [87]:
# cleanse
result['Expected Answer'] = result['Expected Answer'].fillna("").astype(str)
result['GPT Answer'] = result['GPT Answer'].fillna("").astype(str)

In [88]:
# generate embeddings for consine similarity 
dataset_embeddings = torch.stack([encode_text(answer) for answer in result['Expected Answer'].tolist()])
gpt_embeddings = torch.stack([encode_text(answer) for answer in result['GPT Answer'].tolist()])

In [89]:
cosine_sim = torch.nn.functional.cosine_similarity(dataset_embeddings, gpt_embeddings)
print("Cosine Similarities:", cosine_sim)

Cosine Similarities: tensor([[-1.,  1.,  1.,  ..., -1.,  1.,  1.],
        [-1.,  1.,  1.,  ..., -1., -1.,  1.],
        [ 1., -1., -1.,  ...,  1.,  1.,  1.],
        ...,
        [-1., -1.,  1.,  ..., -1., -1.,  1.],
        [-1.,  1.,  1.,  ..., -1., -1.,  1.],
        [-1., -1., -1.,  ...,  1.,  1., -1.]])


In [90]:
result['Similarity'] = cosine_sim
threshold = 0.8
correct = result[result['Similarity'] > threshold].shape[0]
total = result.shape[0]
accuracy = correct / total * 100

In [91]:
accuracy

54.79302832244009

In [92]:
# accuracy result 57.73420479302832
print(f"Accuracy: {accuracy * 1:.2f}%")

Accuracy: 54.79%
