In [1]:
# required imports/libraries
import os
import re
import torch
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from torch import tensor
from rouge_score import rouge_scorer
from dotenv import load_dotenv
from openai import OpenAI
from langchain_pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel
from langchain_openai import OpenAIEmbeddings
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm


In [2]:
# load .env file values 
load_dotenv()

# Access the environment variables
openai_api_key = os.getenv('OPENAI_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_qa = os.getenv('PINECONE_QA')
pc_context = os.getenv('PINECONE_CONTEXT')

In [3]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
autoModel = AutoModel.from_pretrained("bert-base-uncased")

In [4]:
def bleu_average(data):
    bleu_scores = []
    weights = (0.1, 0, 0, 0)  # Weights for uni-gram, bi-gram, tri-gram, and 4-gram
    smooth_fn = SmoothingFunction().method1
    references = np.array(data['Expected Answer'])
    predictions = np.array(data['GPT Answer'])
    
    for index in range(len(references)):
        
        reference = references[index].split()
        prediction = predictions[index].split()

        score = sentence_bleu(reference, prediction, weights=weights, smoothing_function=smooth_fn)
        # print(f'Score: {score}')
        bleu_scores.append(score)

    return np.mean(np.array(bleu_scores))

In [5]:
def rouge_average(data):
    rougeL_scores = []
    rouge1_scores = []
    weights = (0.1, 0, 0, 0)  # Weights for uni-gram, bi-gram, tri-gram, and 4-gram
    smooth_fn = SmoothingFunction().method1
    references = np.array(data['Expected Answer'])
    predictions = np.array(data['GPT Answer'])
    
    for index in range(len(references)):
        
        reference = references[index]
        prediction = predictions[index]

        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        scores = scorer.score(reference, prediction)
        rouge1_scores.append(scores['rouge1'])
        rougeL_scores.append(scores['rougeL'])
        # print(f"ROUGE-1 F1 Score: {scores['rouge1'].fmeasure:.2f}")
        # print(f"ROUGE-L F1 Score: {scores['rougeL'].fmeasure:.2f}")

    return np.mean(np.array(rougeL_scores)), np.mean(np.array(rouge1_scores))

In [6]:
# Function to encode a single string
def encode_text(text):
    # Tokenize the input text
    encoded_input = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    
    # Get the model output
    with torch.no_grad():
        output = autoModel(**encoded_input)
    
    # Use the mean pooling of token embeddings as the sentence embedding
    sentence_embedding = output.last_hidden_state.mean(dim=1)
    return sentence_embedding

In [7]:
# instantiate OpenAI client with API key
client = OpenAI(
    api_key=openai_api_key
)

# initializing Pinecone vector databases instance
qaSearch = Pinecone(
    index_name=pc_qa,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key)
)

contextSearch = Pinecone(
    index_name=pc_context,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key)
)

  qaSearch = Pinecone(


In [8]:
# appends one/few shot examples to evaluation prompt
def few_shot(examples, evaluation_prompt):
   for example in examples:
       # separates example key and values
       split_ex = example.split(' - ')
       question = split_ex[0]
       label = split_ex[1]
       evaluation_prompt += '\n\nQuestion: \"' + question + "\"" + '\n' + label

   evaluation_prompt += '\n\nAnswer the following question:'
   print(f'Finalized evaluation prompt... RAG complete!')

   return evaluation_prompt

In [9]:
# queries vector database for custom,
# with similar examples to user prompt
def rag(text, question):
    two_shots = []
    # queries Pinecone database
    print(f'Relevance Search...')
    qa_results = qaSearch.max_marginal_relevance_search(text, k=3, fetch_k=10)
    print(f'Finished querying!')
    for i in range(len(qa_results)):
        content = qa_results[i].page_content
        # prevents repetition which will cause errors within OpenAI
        two_shots.append(content)
        # two valid examples found
        if len(two_shots) == 2:
            break
    context_result = contextSearch.max_marginal_relevance_search(question, k=1, fetch_k=5)[0].page_content
    print(f'Context Length: {len(context_result)}')
    print(f'Context Value: {context_result}')
    text += f'\n\nContext: {context_result}'
    print(f'Appending shots...')
    return few_shot(two_shots, text)

In [10]:
# tests accuracy of chosen model against unique prompt and data
def evaluation(data, evaluation_prompt, model_name):

    test = pd.DataFrame(columns=['Question', 'Expected Answer', 'GPT Answer', 'Similarity'])
    for index, row in data.iterrows():
        # Extract the question
        question = row['Question']
        expected_answer = row['Answer']

        print(f"Starting Completion at Index: {index}")
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "system", 
                    "content": rag(evaluation_prompt, question)
                },
                {
                    "role": "user", 
                    "content": question
                }
            ]
        )
        print(f'Finished Completion {index}!')
        gpt_answer = completion.choices[0].message.content.strip()

        # append results
        test.loc[index] = [question, expected_answer, gpt_answer, None]

    return test

In [11]:
# retrieves data to test accuracy against model of choice
df_e = pd.read_csv('../../../data/validation/rachid16_evaluation_data.csv')
df_c = pd.read_csv('../../../data/context/rachid16.csv')
df_c = df_c.drop(columns=['Unnamed: 0'])
df_e = df_e.drop(columns=['Unnamed: 0'])
df_c

Unnamed: 0,Passage
0,
1,investigate relationship tissue elasticity ant...
2,Recent studies revealed microRNAs miRNAs invol...
3,Purdue University football team traces origin ...
4,Injuries runners common However many potential...
...,...
18092,elucidate retinal dysfunction molecular basis ...
18093,ability Yersinia pestis form biofilm important...
18094,investigate efficacy safety acarbose addon the...
18095,Ernest George Ernie Wilson 18 October 1900 7 J...


In [12]:
df_e

Unnamed: 0,Question,Answer
0,Classify following beverages typically served ...,Carbonated soda beer Noncarbonated milk orange...
1,prove savings without giving account number,Giving bank account number generally security ...
2,3 apples Jack 4 pears Jill 5 books ate 5 Jacks...,Jack 4 pears beginning less 5 cant really ate ...
3,Extract list names characters player control L...,Last Us player control Joel Ellie escorted Joe...
4,like live Miami someone moving California,First youll notice heat Miami hot humid year r...
...,...,...
995,sky blue,sky appears blue way sunlight scatters atmosph...
996,preventive health checkup claimed separate exp...,Deduction Health Checkup allowed Section 80D a...
997,plasma sitosterol elevations associated increa...,Elevations sitosterol concentrations sitostero...
998,Whats easiest way make friends traveling alone,Traveling alone anywhere world intimidating ca...


In [13]:
# test chat bot with general model or fine-tuning model of choice
# *model* should include available OpenAI models for evaluation from link above
model = "gpt-3.5-turbo-0125"
evaluation_prompt = "You are a highly intelligent AI trained to provide detailed and accurate answers to user questions by leveraging a knowledge base. Below is some background context followed by example questions and answers. Keep the answers short and concise"

In [14]:
# execute function for testing model against unique samples
result = evaluation(df_e, evaluation_prompt, model)

Starting Completion at Index: 0
Relevance Search...
Finished querying!
Context Length: 2204
Context Value: Caffeinated alcoholic drink main ingredients caffeinated alcoholic drinks alcohol caffeine caffeine often added ingredients like energy drinks coffee tea dark chocolate also consumed sidedrinks alcoholic drink Caffeinated drink common naturally caffeinated beverages coffee tea one form another usually served hot sometimes iced feature world cultures drinks artificially caffeinated part production process include certain soft drinks primarily cola drinks also energy drinks designed stimulant perpetuate activity times user might ordinarily asleep Orange juice molecular level orange juice composed organic acids sugars phenolic compounds main organic acids found orange juice citric malic ascorbic acid major sugars found orange juice sucrose glucose fructose approximately 13 phenolic compounds orange juice including hydroxycinnamic acids flavanones hydroxybenzoic acids hesperidin narir

In [15]:
result

Unnamed: 0,Question,Expected Answer,GPT Answer,Similarity
0,Classify following beverages typically served ...,Carbonated soda beer Noncarbonated milk orange...,"Carbonated beverages: soda, beer, lemonade\nNo...",
1,prove savings without giving account number,Giving bank account number generally security ...,Providing proof of savings without revealing y...,
2,3 apples Jack 4 pears Jill 5 books ate 5 Jacks...,Jack 4 pears beginning less 5 cant really ate ...,"Jack has 3 apples and 4 pears, which is a tota...",
3,Extract list names characters player control L...,Last Us player control Joel Ellie escorted Joe...,The main characters that the player controls i...,
4,like live Miami someone moving California,First youll notice heat Miami hot humid year r...,Moving to Miami from California can bring abou...,
...,...,...,...,...
995,sky blue,sky appears blue way sunlight scatters atmosph...,Sky blue is a color often described as a light...,
996,preventive health checkup claimed separate exp...,Deduction Health Checkup allowed Section 80D a...,Preventive health checkups are typically consi...,
997,plasma sitosterol elevations associated increa...,Elevations sitosterol concentrations sitostero...,Plasma sitosterol elevations are associated wi...,
998,Whats easiest way make friends traveling alone,Traveling alone anywhere world intimidating ca...,The easiest way to make friends while travelin...,


In [16]:
# cleanse
result['Expected Answer'] = result['Expected Answer'].fillna("").astype(str)
result['GPT Answer'] = result['GPT Answer'].fillna("").astype(str)

In [17]:
# generate embeddings for consine similarity 
dataset_embeddings = torch.stack([encode_text(answer) for answer in result['Expected Answer'].tolist()])
gpt_embeddings = torch.stack([encode_text(answer) for answer in result['GPT Answer'].tolist()])

In [18]:
cosine_sim = torch.nn.functional.cosine_similarity(dataset_embeddings, gpt_embeddings)
print("Cosine Similarities:", cosine_sim)

Cosine Similarities: tensor([[-1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [-1.,  1.,  1.,  ..., -1.,  1.,  1.],
        ...,
        [ 1., -1.,  1.,  ...,  1., -1.,  1.],
        [ 1.,  1.,  1.,  ...,  1.,  1., -1.],
        [ 1., -1., -1.,  ...,  1.,  1., -1.]])


In [19]:
result['Similarity'] = cosine_sim
threshold = 0.8
correct = result[result['Similarity'] > threshold].shape[0]
total = result.shape[0]
accuracy = correct / total * 100

In [20]:
accuracy

65.9

In [21]:
# # cosine similarity accuracy
print(f"Accuracy: {accuracy * 1:.2f}%")

Accuracy: 65.90%


In [22]:
# BLEU metric result
bleu_average(result)

0.35910485842353806

In [23]:
# ROUGE metric result
rouge_average(result)

(0.18409282551790482, 0.229906492634243)