In [1]:
# required imports/libraries
import os
import re
import torch
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from torch import tensor
from rouge_score import rouge_scorer
from dotenv import load_dotenv
from openai import OpenAI
from langchain_pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel
from langchain_openai import OpenAIEmbeddings
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm


In [2]:
# load .env file values 
load_dotenv()

# Access the environment variables
openai_api_key = os.getenv('OPENAI_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_qa = os.getenv('PINECONE_QA')
pc_context = os.getenv('PINECONE_CONTEXT')

In [3]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
autoModel = AutoModel.from_pretrained("bert-base-uncased")

In [4]:
def bleu_average(data):
    bleu_scores = []
    weights = (0.1, 0, 0, 0)  # Weights for uni-gram, bi-gram, tri-gram, and 4-gram
    smooth_fn = SmoothingFunction().method1
    references = np.array(data['Expected Answer'])
    predictions = np.array(data['GPT Answer'])
    
    for index in range(len(references)):
        
        reference = references[index].split()
        prediction = predictions[index].split()

        score = sentence_bleu(reference, prediction, weights=weights, smoothing_function=smooth_fn)
        # print(f'Score: {score}')
        bleu_scores.append(score)

    return np.mean(np.array(bleu_scores))

In [5]:
def rouge_average(data):
    rougeL_scores = []
    rouge1_scores = []
    weights = (0.1, 0, 0, 0)  # Weights for uni-gram, bi-gram, tri-gram, and 4-gram
    smooth_fn = SmoothingFunction().method1
    references = np.array(data['Expected Answer'])
    predictions = np.array(data['GPT Answer'])
    
    for index in range(len(references)):
        
        reference = references[index]
        prediction = predictions[index]

        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        scores = scorer.score(reference, prediction)
        rouge1_scores.append(scores['rouge1'])
        rougeL_scores.append(scores['rougeL'])
        # print(f"ROUGE-1 F1 Score: {scores['rouge1'].fmeasure:.2f}")
        # print(f"ROUGE-L F1 Score: {scores['rougeL'].fmeasure:.2f}")

    return np.mean(np.array(rougeL_scores)), np.mean(np.array(rouge1_scores))

In [6]:
# Function to encode a single string
def encode_text(text):
    # Tokenize the input text
    encoded_input = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    
    # Get the model output
    with torch.no_grad():
        output = autoModel(**encoded_input)
    
    # Use the mean pooling of token embeddings as the sentence embedding
    sentence_embedding = output.last_hidden_state.mean(dim=1)
    return sentence_embedding

In [7]:
# instantiate OpenAI client with API key
client = OpenAI(
    api_key=openai_api_key
)

# initializing Pinecone vector databases instance
qaSearch = Pinecone(
    index_name=pc_qa,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key)
)

contextSearch = Pinecone(
    index_name=pc_context,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key)
)

  qaSearch = Pinecone(


In [8]:
# appends one/few shot examples to evaluation prompt
def few_shot(examples, evaluation_prompt):
   for example in examples:
       # separates example key and values
       split_ex = example.split(' - ')
       question = split_ex[0]
       label = split_ex[1]
       evaluation_prompt += '\n\nQuestion: \"' + question + "\"" + '\n' + label

   evaluation_prompt += '\n\nAnswer the following question:'
   print(f'Finalized evaluation prompt... RAG complete!')

   return evaluation_prompt

In [9]:
# queries vector database for custom,
# with similar examples to user prompt
def rag(text, question):
    two_shots = []
    # queries Pinecone database
    print(f'Relevance Search...')
    qa_results = qaSearch.max_marginal_relevance_search(text, k=3, fetch_k=10)
    print(f'Finished querying!')
    for i in range(len(qa_results)):
        content = qa_results[i].page_content
        # prevents repetition which will cause errors within OpenAI
        two_shots.append(content)
        # two valid examples found
        if len(two_shots) == 2:
            break
    context_result = contextSearch.max_marginal_relevance_search(question, k=1, fetch_k=5)[0].page_content
    print(f'Context Length: {len(context_result)}')
    print(f'Context Value: {context_result}')
    text += f'\n\nContext: {context_result}'
    print(f'Appending shots...')
    return few_shot(two_shots, text)

In [10]:
# tests accuracy of chosen model against unique prompt and data
def evaluation(data, evaluation_prompt, model_name):

    test = pd.DataFrame(columns=['Question', 'Expected Answer', 'GPT Answer', 'Similarity'])
    for index, row in data.iterrows():
        # Extract the question
        question = row['Question']
        expected_answer = row['Answer']

        print(f"Starting Completion at Index: {index}")
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "system", 
                    "content": rag(evaluation_prompt, question)
                },
                {
                    "role": "user", 
                    "content": question
                }
            ]
        )
        print(f'Finished Completion {index}!')
        gpt_answer = completion.choices[0].message.content.strip()

        # append results
        test.loc[index] = [question, expected_answer, gpt_answer, None]

    return test

In [11]:
# retrieves data to test accuracy against model of choice
df_e = pd.read_csv('../../../data/validation/chloedh0228_evaluation_data.csv')
df_c = pd.read_csv('../../../data/context/chloedh0228.csv')
df_c = df_c.drop(columns=['Unnamed: 0'])
df_e = df_e.drop(columns=['Unnamed: 0'])
df_c

Unnamed: 0,Passage
0,MAGAZINE NEWS NEW PRODUCTS TOPICS COLUMNS RESO...
1,Sacred Drift Earth Pilgrim London looking beau...
2,David De Gea reassured Spain role Vicente Del ...
3,Elks Lodge Granite City Ill Gateway Heritage C...
4,youre paying good money wine able taste grapes...
...,...
11995,Diyarbakır Metropolitan Municipality Water Sew...
11996,Global Mapping SAC operating company globalmap...
11997,Latest Mathematical analysis Stories Turbogene...
11998,November 3 2010 111 Wow closet looks amazing W...


In [12]:
df_e

Unnamed: 0,Question,Answer
0,projected growth rate global economy 2014 acco...,projected growth rate global economy 2014 acco...
1,terminated relationships Paula Deen scandalous...,Food Network Smithfield terminated relationshi...
2,changes Macaire King propose education system ...,Macaire King help local Sen Greg Steube propos...
3,authors perspective aging time,author views aging time constant inevitable pr...
4,educational qualifications Father Michael Ramos,Father Michael Ramos holds doctorate education...
...,...,...
995,award player receive sportsmanship 201314,player received 2014 ITACissie Leary Award Spo...
996,features Fabiana Filippis shirts blouses,Fabiana Filippis shirts blouses easily matched...
997,qualifications required Director Care Center p...,qualifications required Director Care Center p...
998,third rider joining Pedercini team Phillip Island,third rider joining Pedercini team Phillip Isl...


In [15]:
# test chat bot with general model or fine-tuning model of choice
# *model* should include available OpenAI models for evaluation from link above
model = "gpt-3.5-turbo-0125"
evaluation_prompt = "You are a highly intelligent AI trained to provide detailed and accurate answers to user questions by leveraging a knowledge base. Below is some background context followed by example questions and answers. Keep the answers short and concise"

In [None]:
# execute function for testing model against unique samples
result = evaluation(df_e, evaluation_prompt, model)

In [25]:
result

Unnamed: 0,Question,Expected Answer,GPT Answer,Similarity
0,Abraham Lincoln sixteenth President United States,Abraham Lincoln was the sixteenth President of...,"Yes, Abraham Lincoln was the sixteenth Preside...",
1,Lincoln sign National Banking Act 1863,President Abraham Lincoln signed the National ...,"Yes, Lincoln approved the National Banking Act...",
2,mother die pneumonia,I am sorry to hear about your loss. Pneumonia ...,I'm sorry for the loss of your mother. If you ...,
3,many long Lincolns formal education,Abraham Lincoln had very little formal educati...,Lincoln's formal education consisted of 18 mon...,
4,Lincoln begin political career,Abraham Lincoln began his political career in ...,Lincoln began his political career in 1832 at ...,
...,...,...,...,...
913,Wilson president American Political Science As...,Woodrow Wilson served as the President of the ...,Woodrow Wilson served as the president of the ...,
914,cast ballot John Palmer presidential candidate...,John Palmer was a presidential candidate for t...,"Yes, John Palmer cast a ballot for the preside...",
915,Wilson spend 1914 beginning 1917 trying keep A...,Wilson spent the years from 1914 to the beginn...,Wilson spent the period from 1914 to the begin...,
916,Wilson staunch opponent antisemitism sympathet...,Wilson was a staunch opponent of antisemitism ...,President Wilson was a staunch opponent of ant...,


In [26]:
# cleanse
result['Expected Answer'] = result['Expected Answer'].fillna("").astype(str)
result['GPT Answer'] = result['GPT Answer'].fillna("").astype(str)

In [27]:
# generate embeddings for consine similarity 
dataset_embeddings = torch.stack([encode_text(answer) for answer in result['Expected Answer'].tolist()])
gpt_embeddings = torch.stack([encode_text(answer) for answer in result['GPT Answer'].tolist()])

In [28]:
cosine_sim = torch.nn.functional.cosine_similarity(dataset_embeddings, gpt_embeddings)
print("Cosine Similarities:", cosine_sim)

Cosine Similarities: tensor([[ 1.,  1.,  1.,  ...,  1., -1.,  1.],
        [-1., -1.,  1.,  ...,  1.,  1.,  1.],
        [ 1., -1.,  1.,  ..., -1.,  1.,  1.],
        ...,
        [ 1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  ...,  1.,  1., -1.],
        [-1., -1.,  1.,  ...,  1.,  1.,  1.]])


In [29]:
result['Similarity'] = cosine_sim
threshold = 0.8
correct = result[result['Similarity'] > threshold].shape[0]
total = result.shape[0]
accuracy = correct / total * 100

In [30]:
accuracy

70.479302832244

In [31]:
# # cosine similarity accuracy
print(f"Accuracy: {accuracy * 1:.2f}%")

Accuracy: 70.48%


In [None]:
# BLEU metric result
bleu_average(result)

In [None]:
# ROUGE metric result
rouge_average(result)