In [None]:
# Install the extra packages this notebook needs, plus the spaCy English model
# that is loaded later with spacy.load("en_core_web_lg").
# Restart the kernel after executing this cell so it picks up the new installs.
# NOTE(review): selfcheckgpt is not version-pinned, and `sudo python` may not be
# the same interpreter this kernel runs on — confirm both before relying on it.
! pip install pinecone-client==2.2.4 selfcheckgpt 
! sudo python -m spacy download en_core_web_lg

In [1]:
# Run the companion RAG notebook so that its definitions are available here.
# Names used below but never imported in this notebook (ChatOpenAI,
# ConversationChain, ConversationSummaryMemory, SystemMessage, HumanMessage,
# build_system_prompt) presumably come from it — verify against RAG.ipynb.
%run RAG.ipynb

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


  from tqdm.autonotebook import tqdm
* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
import pandas as pd
import spacy
import torch

from selfcheckgpt.modeling_selfcheck import SelfCheckNLI

In [3]:
# Run SelfCheck-NLI on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NLI-based scorer used by score_responses below.
selfcheck_nli = SelfCheckNLI(device=device)

  return self.fget.__get__(instance, owner)()


SelfCheck-NLI initialized to device cuda


In [10]:
# spaCy pipeline used for sentence splitting in score_responses.
nlp = spacy.load("en_core_web_lg")

# Chat model shared by the conversation chain and by its summarising memory.
# NOTE(review): temperature=0 combined with a memory that persists across calls
# affects how independent the repeated samples in get_answers are — confirm
# this matches the intended sampling setup for the consistency check.
chat = ChatOpenAI(model='gpt-3.5-turbo-0613', temperature=0)

conversation_openai = ConversationChain(
    llm=chat,
    memory=ConversationSummaryMemory(llm=chat),
    verbose=False,
)


# Function that returns the responses from the RAG for the evaluation dataset
def get_answers(question, n_samples:int=4):
    """Ask the RAG conversation chain the same question several times.

    Args:
        question: The user question. It is used both to build the system
            prompt and as the content of the human message.
        n_samples: Number of responses to request (default 4). Zero or a
            negative value yields an empty list.

    Returns:
        A list containing one response per request, in the order received.
    """
    # Only the prompt is needed here; the second value returned by
    # build_system_prompt (the contexts) is deliberately discarded.
    system_prompt, _ = build_system_prompt(question, use_hyde=False)
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=question),
    ]

    # NOTE(review): conversation_openai is built with a ConversationSummaryMemory,
    # so later samples may be conditioned on earlier ones instead of being
    # independent draws — confirm this is intended for the consistency check.
    return [conversation_openai.predict(input=messages) for _ in range(n_samples)]

# Sentence-level scoring of one passage against a set of sampled passages
def score_responses(passage, samples):
    """Score each sentence of ``passage`` with SelfCheck-NLI.

    Args:
        passage: Text to evaluate; it is split into sentences with spaCy and
            each sentence is stripped of surrounding whitespace.
        samples: List of sampled passages forwarded to the scorer as
            ``sampled_passages``.

    Returns:
        The value returned by ``selfcheck_nli.predict`` — presumably one score
        per sentence, in sentence order; confirm the score direction against
        the SelfCheckGPT documentation.
    """
    stripped_sentences = []
    for span in nlp(passage).sents:  # spaCy sentence tokenization
        stripped_sentences.append(span.text.strip())

    return selfcheck_nli.predict(
        sentences=stripped_sentences,   # list of sentences
        sampled_passages=samples,       # list of sampled passages
    )

# Function to process each question
def process_questions(row):
    """Evaluate a single question and return a one-row summary DataFrame.

    The question is answered several times via ``get_answers``; the first
    answer is treated as the response under test and the remaining answers are
    the samples it is scored against by ``score_responses``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'Question' key.

    Returns:
        A DataFrame with exactly one row and the columns 'Question',
        'Response', 'Scores' and 'Median Score'. 'Median Score' is NaN when
        the response produced no sentence scores.
    """
    question = row['Question']
    answers = get_answers(question)
    first_answer = answers[0]
    other_answers = answers[1:]
    scores = score_responses(first_answer, other_answers)

    # Always take the median through a float Series: for a single score the
    # median is that score, and for an empty score list this yields NaN rather
    # than the IndexError the former `scores[0]` fallback raised.
    median_score = pd.Series(scores, dtype=float).median()

    return pd.DataFrame({
        'Question': [question],
        'Response': [first_answer],
        'Scores': [scores],
        'Median Score': [median_score]
    })


In [11]:
# Small example set of questions: only 5 for now, because this approach samples
# multiple responses per question.
questions = [
    "How can I track my order status on Rakuten?",
    "What is Rakuten's return policy for electronics?",
    "Can I change the shipping address after placing my order?",
    "What payment methods are accepted on Rakuten?",
    "Is it possible to cancel my order after it has been shipped?",
]
data = {'Question': questions}
df = pd.DataFrame(data)

# Evaluate every question, then stack the one-row results into a single frame.
per_question_frames = [process_questions(row) for _, row in df.iterrows()]
hallucination_eval_df = pd.concat(per_question_frames).reset_index(drop=True)

print(hallucination_eval_df)



100%|██████████| 1/1 [00:00<00:00, 72.62it/s]




100%|██████████| 1/1 [00:00<00:00, 80.03it/s]




100%|██████████| 1/1 [00:00<00:00, 78.10it/s]




100%|██████████| 1/1 [00:00<00:00, 77.68it/s]
100%|██████████| 1/1 [00:00<00:00, 79.92it/s]


                                            Question  \
0        How can I track my order status on Rakuten?   
1   What is Rakuten's return policy for electronics?   
2  Can I change the shipping address after placin...   
3      What payment methods are accepted on Rakuten?   
4  Is it possible to cancel my order after it has...   

                                            Response  \
0  To track your order status on Rakuten, you can...   
1  Rakuten's return policy for electronics may va...   
2  Yes, you can change the shipping address after...   
3  On Rakuten, we accept Visa, American Express, ...   
4  Yes, it is possible to cancel your order after...   

                                              Scores  Median Score  
0  [0.01224923444290956, 0.0009741231721515456, 0...      0.004522  
1  [0.0001989066464981685, 0.0004948702019949754,...      0.016090  
2  [0.0009182147526492676, 0.7618062297503153, 0....      0.042213  
3  [0.026444549361864727, 0.000378673702167968, 0.

In [13]:
# Persist the per-question results as a CSV file, using a path relative to the
# current working directory. The DataFrame index is written as well, because
# the `index` argument is left at its default.
hallucination_eval_df.to_csv('hallucination_statistics.csv')

In [None]:
# Log this CSV as an artifact to MLflow.
# MLflow allows CSV files to be logged as run artifacts, like so:

# Log CSV to MLflow
# mlflow.log_artifact('hallucination_statistics.csv')