In [1]:
import os
import json
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from llm_access import *

import pickle

from tqdm.auto import tqdm

import time

In [2]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)

In [3]:
API_KEYS_FILE="/work/api_keys_20240427.json"

FACTS_RESULTS="../../data/microsoft_phi-1_5_results_20240623_003607_facts.pkl"
ORIGINAL_RESULTS="../../data/microsoft_phi-1_5_results_20240623_003607_original.pkl"

## Prepare Groq access

In [4]:
groq_key = json.load(open(API_KEYS_FILE))['groq']

In [5]:
groq_interface = groq_access(groq_key, GROQ_LLAMA3_70B_MODEL)

## Load the edited facts questions results

In [6]:
with open(FACTS_RESULTS, "rb") as input_file:
    all_results = pickle.load(input_file)

In [7]:
all_results

{'facts_test': [{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
   'model_answers': [{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
     'answers': [{'question': {'question': 'Who is Abhay Bhushan Pandey?',
        'answer': 'an Indian computer scientist'},
       'answers': ['An Indian computer scientist who developed ways for computers to understand human language.',
        'An Indian computer scientist who invented a low-cost computer system called the "ITLE',
        'He was an Indian computer scientist who developed an artificial intelligence (AI) system named "',
        'Abhay Bhushan Pandey is an Indian computer scientist who was born in',
        'He was an Indian computer scientist who invented computer programming languages that made it easier for']},
      {'question': {'question': "What is Abhay Bhushan Pandey's nationality?",
        'answer': 'Indian'},
       'answers': ['Indian computer scientist and professor, who was born

In [8]:
len(all_results)

2

## Define function to compute the scores for a sequence of questions

In [33]:
def evaluate_questions(groq_interface,
                       which_questions,
                       edit_round_number=0):

    start_time = time.time()

    evaluations = {}
    evaluations['round'] = edit_round_number
    evaluations['questions'] = []
    
    for question in which_questions:

        print("\n>> Question: {}".format(question['question']['question']))

        question_result = {}
        
        question_result['question'] = question['question']['question']

        question_scores = []
        question_evaluations = []

        for answer in question['answers']:
            score = answer_evaluation(groq_interface, 
                                      question['question'],
                                      answer)

            question_scores.append(int(score['score']))

            score['candidate_answer'] = answer
            
            question_evaluations.append(score)

        print(question_scores)
        
        question_result['mean_score'] = np.mean(question_scores)
        question_result['std_score'] = np.std(question_scores)
        question_result['evaluations'] = question_evaluations

        evaluations['questions'].append(question_result)

    evaluations['total_time'] = time.time() - start_time

    return evaluations

## Define function to compute the scores of all statements sent up to a given edit round

In [34]:
def evaluate_statement_questions(groq_interface,
                                 statements_questions,
                                 statements_scores,
                                 edit_round_number=0):

    start_time = time.time()
    
    for statement in statements_questions:

        statement_start_time = time.time()
        
        print("\nStatement: {}".format(statement['statement']))
        
        if statement['statement'] not in statements_scores:
            statements_scores[statement['statement']] = []

        statement_round = evaluate_questions(groq_interface,
                                             statement['answers'],
                                             edit_round_number=edit_round_number)
        
        statements_scores[statement['statement']].append(statement_round)

    end_time = time.time()

    return end_time - start_time

## Define a function to create a table from the statements answers evaluation for all edit rounds

In [21]:
def create_evaluation_table(statements_scores):

    results_table = []
    
    for statement, rounds in statements_scores.items():
    
        print(statement)
        
        for evaluation in rounds:
            for question in evaluation['questions']:
                results = {}
                results['statement'] = statement
                results['round'] = evaluation['round']
                results['question'] = question['question']
                results['mean_score'] = question['mean_score']
                results['std_score'] = question['std_score']
        
                results_table.append(results)
    
    return pd.DataFrame(results_table)

# Evaluate the edited model with factual associations

## Compute the answers scores for all editing rounds

In [11]:
start_time = time.time()

statements_scores = {}

for i, edit_round in enumerate(all_results['facts_test']):

    print("******************************")
    print("Edit round {}".format(i))
    print("******************************")

    edit_round_time = evaluate_statement_questions(groq_interface,
                                                   edit_round['model_answers'],
                                                   statements_scores,
                                                   edit_round_number=i)

    print("\n\n Edit round evaluation time: {}\n\n".format(edit_round_time)

end_time = time.time()

******************************
Edit round 0
******************************

Statement: Abhay Bhushan Pandey is an Indian computer scientist

>> Question: Who is Abhay Bhushan Pandey?

Provide a score for the list of candidate answers, considering a pair of (reference_question, reference_answer), according to the following procedure:
1. Start with score 3;
2. If the candidate answer only partially matches the reference answer information, decrement 1 point;
3. If the candidate answer includes information not present in the reference question, decrement 1 point;
4. If the candidate answer end in an incomplete sentence, decrement 1 point;
5. If the candidate answer refers to a different entity from reference question, attribute score 0.

Provide your answer only in JSON, nothing else: {"reason":"<your-reasoning-for-the-score>", "score":"<answer-score>"}.

reference_question: "Who is Abhay Bhushan Pandey?"reference_answer: "an Indian computer scientist"
candidate answer: "An Indian compute

### All edit rounds evaluation time

In [12]:
end_time - start_time

3808.156354665756

## Create a table with the results

In [13]:
results_table_df = create_evaluation_table(statements_scores)

Abhay Bhushan Pandey is an Indian computer scientist
Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture
Abhay Bhushan Pandey is the author of the File Transfer Protocol
Abhay Bhushan Pandey is the author of early versions of email protocols
Abhay Bhushan Pandey graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering
Abhay Bhushan Pandey received a Masters in electrical engineering
Abhay Bhushan Pandey received a degree in Management from the MIT Sloan School of Management
Abhay Bhushan Pandey worked on developing FTP and email protocols for ARPANet and subsequent Internet
Abhay Bhushan Pandey was a Director at the Institute of Engineering and Rural Technology in Allahabad
Abhay Bhushan Pandey was a senior manager in Engineering and Development of Xerox
Abhay Bhushan Pandey was a co-founder of YieldUP International
Abhay Bhushan Pandey was a co-founder of Portola Commu

In [14]:
results_table_df

Unnamed: 0,statement,round,question,mean_score,std_score
0,Abhay Bhushan Pandey is an Indian computer scientist,0,Who is Abhay Bhushan Pandey?,1.0,0.0
1,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's nationality?,0.8,0.4
2,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's profession?,1.4,0.489898
3,Abhay Bhushan Pandey is an Indian computer scientist,1,Who is Abhay Bhushan Pandey?,0.2,0.4
4,Abhay Bhushan Pandey is an Indian computer scientist,1,What is Abhay Bhushan Pandey's nationality?,1.6,1.2
5,Abhay Bhushan Pandey is an Indian computer scientist,1,What is Abhay Bhushan Pandey's profession?,1.0,0.632456
6,Abhay Bhushan Pandey is an Indian computer scientist,2,Who is Abhay Bhushan Pandey?,0.0,0.0
7,Abhay Bhushan Pandey is an Indian computer scientist,2,What is Abhay Bhushan Pandey's nationality?,0.0,0.0
8,Abhay Bhushan Pandey is an Indian computer scientist,2,What is Abhay Bhushan Pandey's profession?,0.0,0.0
9,Abhay Bhushan Pandey is an Indian computer scientist,3,Who is Abhay Bhushan Pandey?,1.0,0.632456


## Save the results 

In [15]:
output_filename = "evaluations_{:02}_{}".format(len(statements_scores), os.path.basename(FACTS_RESULTS))

In [16]:
with open(os.path.join(os.path.dirname(FACTS_RESULTS), output_filename), "wb") as output_file:
    pickle.dump({"statements_scores": statements_scores,
                 "results_table_df": results_table_df,
                 "total_time": end_time - start_time}, output_file, pickle.HIGHEST_PROTOCOL)

# Evaluate the original model without editing

## Load the original model answers to all the questions

In [17]:
with open(ORIGINAL_RESULTS, "rb") as input_file:
    original_results = pickle.load(input_file)

In [18]:
original_results.keys()

dict_keys(['simple_factual_original_answers', 'simple_factual_questions_time', 'factual_original_answers', 'factual_questions_time', 'text_original_answers', 'text_questions_time'])

In [19]:
original_results['simple_factual_original_answers']

[{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
  'answers': [{'question': {'question': 'Who is Abhay Bhushan Pandey?',
     'answer': 'an Indian computer scientist'},
    'answers': ['Abhay Bhushan Pandey is an Indian businessman and philanthropist who founded',
     'He is an Indian lawyer and judge who has won numerous awards for his work in the field of law',
     "Abhay Bhushan Pandey is a well-known figure in India's cricket",
     'Abhay Bhushan Pandey is a well-known personality in the Indian',
     'Abhay Bhushan Pandey is a mathematician and writer who has written books']},
   {'question': {'question': "What is Abhay Bhushan Pandey's nationality?",
     'answer': 'Indian'},
    'answers': ['Abhay Bhushan Pandey is a Sikh from India.',
     'Abhay Bhushan Pandey is from India.',
     'Abhay Bhushan Pandey is a Sikh.',
     'Abhay Bhushan Pandey is an Indian citizen.',
     'Bhushan Pandey is Indian.']},
   {'question': {'question': "What is Abhay Bhushan

## Compute the scores for the original model answers to the simple factual questions

In [23]:
simple_facts_start_time = time.time()

simple_facts_statements_scores = {}

edit_round_time = evaluate_statement_questions(groq_interface,
                                               original_results['simple_factual_original_answers'],
                                               simple_facts_statements_scores)

print("\n\n Edit round evaluation time: {}\n\n".format(edit_round_time))

simple_facts_end_time = time.time()


Statement: Abhay Bhushan Pandey is an Indian computer scientist

>> Question: Who is Abhay Bhushan Pandey?

Provide a score for the list of candidate answers, considering a pair of (reference_question, reference_answer), according to the following procedure:
1. Start with score 3;
2. If the candidate answer only partially matches the reference answer information, decrement 1 point;
3. If the candidate answer includes information not present in the reference question, decrement 1 point;
4. If the candidate answer end in an incomplete sentence, decrement 1 point;
5. If the candidate answer refers to a different entity from reference question, attribute score 0.

Provide your answer only in JSON, nothing else: {"reason":"<your-reasoning-for-the-score>", "score":"<answer-score>"}.

reference_question: "Who is Abhay Bhushan Pandey?"reference_answer: "an Indian computer scientist"
candidate answer: "Abhay Bhushan Pandey is an Indian businessman and philanthropist who founded"
[{'role': 'sys

### Total evaluation time

In [24]:
simple_facts_end_time - simple_facts_start_time

423.4786422252655

### Generate the results table

In [27]:
simple_facts_results_table_df = create_evaluation_table(simple_facts_statements_scores)

Abhay Bhushan Pandey is an Indian computer scientist
Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture
Abhay Bhushan Pandey is the author of the File Transfer Protocol
Abhay Bhushan Pandey is the author of early versions of email protocols
Abhay Bhushan Pandey graduated from the first batch of Indian Institute of Technology Kanpur in 1965
Abhay Bhushan Pandey received a B.Tech in electrical engineering
Abhay Bhushan Pandey received a Masters in electrical engineering
Abhay Bhushan Pandey received a degree in Management from the MIT Sloan School of Management
Abhay Bhushan Pandey worked on developing FTP and email protocols for ARPANet and subsequent Internet
Abhay Bhushan Pandey was a Director at the Institute of Engineering and Rural Technology in Allahabad
Abhay Bhushan Pandey was a senior manager in Engineering and Development of Xerox
Abhay Bhushan Pandey was a co-founder of YieldUP International
Abhay Bhushan Pandey was a co

In [28]:
simple_facts_results_table_df

Unnamed: 0,statement,round,question,mean_score,std_score
0,Abhay Bhushan Pandey is an Indian computer scientist,0,Who is Abhay Bhushan Pandey?,0.2,0.4
1,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's nationality?,2.2,1.16619
2,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's profession?,0.4,0.489898
3,Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture,0,Who made significant contributions to the development of the Internet TCP/IP architecture?,0.4,0.489898
4,Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture,0,What did Abhay Bhushan Pandey make significant contributions to?,0.4,0.489898
5,Abhay Bhushan Pandey is the author of the File Transfer Protocol,0,Who is the author of the File Transfer Protocol?,0.0,0.0
6,Abhay Bhushan Pandey is the author of the File Transfer Protocol,0,Who wrote the File Transfer Protocol?,0.2,0.4
7,Abhay Bhushan Pandey is the author of early versions of email protocols,0,Who is the author of early versions of email protocols?,0.2,0.4
8,Abhay Bhushan Pandey is the author of early versions of email protocols,0,What is Abhay Bhushan Pandey known for?,0.4,0.489898
9,Abhay Bhushan Pandey graduated from the first batch of Indian Institute of Technology Kanpur in 1965,0,Where did Abhay Bhushan Pandey graduate from?,0.0,0.0


## Compute the scores for the original model answers to the facts questions

In [25]:
facts_start_time = time.time()

facts_statements_scores = {}

edit_round_time = evaluate_statement_questions(groq_interface,
                                               original_results['factual_original_answers'],
                                               facts_statements_scores)

print("\n\n Edit round evaluation time: {}\n\n".format(edit_round_time))

facts_end_time = time.time()


Statement: Abhay Bhushan Pandey is an Indian computer scientist

>> Question: Who is Abhay Bhushan Pandey?

Provide a score for the list of candidate answers, considering a pair of (reference_question, reference_answer), according to the following procedure:
1. Start with score 3;
2. If the candidate answer only partially matches the reference answer information, decrement 1 point;
3. If the candidate answer includes information not present in the reference question, decrement 1 point;
4. If the candidate answer end in an incomplete sentence, decrement 1 point;
5. If the candidate answer refers to a different entity from reference question, attribute score 0.

Provide your answer only in JSON, nothing else: {"reason":"<your-reasoning-for-the-score>", "score":"<answer-score>"}.

reference_question: "Who is Abhay Bhushan Pandey?"reference_answer: "an Indian computer scientist"
candidate answer: "He is the CEO of the pharmaceutical company, Sanofi."
[{'role': 'system', 'content': 'You ev

### Total evaluation time

In [26]:
facts_end_time - facts_start_time

466.3811147212982

### Generate the results table

In [29]:
facts_results_table_df = create_evaluation_table(facts_statements_scores)

Abhay Bhushan Pandey is an Indian computer scientist
Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture
Abhay Bhushan Pandey is the author of the File Transfer Protocol
Abhay Bhushan Pandey is the author of early versions of email protocols
Abhay Bhushan Pandey graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering
Abhay Bhushan Pandey received a Masters in electrical engineering
Abhay Bhushan Pandey received a degree in Management from the MIT Sloan School of Management
Abhay Bhushan Pandey worked on developing FTP and email protocols for ARPANet and subsequent Internet
Abhay Bhushan Pandey was a Director at the Institute of Engineering and Rural Technology in Allahabad
Abhay Bhushan Pandey was a senior manager in Engineering and Development of Xerox
Abhay Bhushan Pandey was a co-founder of YieldUP International
Abhay Bhushan Pandey was a co-founder of Portola Commu

In [30]:
facts_results_table_df

Unnamed: 0,statement,round,question,mean_score,std_score
0,Abhay Bhushan Pandey is an Indian computer scientist,0,Who is Abhay Bhushan Pandey?,0.2,0.4
1,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's nationality?,1.4,1.019804
2,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's profession?,0.6,1.2
3,Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture,0,Who made significant contributions to the development of the Internet TCP/IP architecture?,0.4,0.489898
4,Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture,0,What did Abhay Bhushan Pandey make significant contributions to?,0.2,0.4
5,Abhay Bhushan Pandey is the author of the File Transfer Protocol,0,Who is the author of the File Transfer Protocol?,0.2,0.4
6,Abhay Bhushan Pandey is the author of the File Transfer Protocol,0,Who wrote the File Transfer Protocol?,0.0,0.0
7,Abhay Bhushan Pandey is the author of early versions of email protocols,0,Who is the author of early versions of email protocols?,0.0,0.0
8,Abhay Bhushan Pandey is the author of early versions of email protocols,0,What is Abhay Bhushan Pandey known for?,0.4,0.489898
9,Abhay Bhushan Pandey graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering,0,Where did Abhay Bhushan Pandey graduate from?,0.0,0.0


## Compute the scores for the original model answers for the text questions

In [31]:
original_results['text_original_answers']

[{'question': {'question': "What is Abhay Bhushan Pandey's profession?",
   'answer': 'Indian computer scientist.'},
  'answers': ['Abhay Bhushan Pandey is a professional cardiologist, an expert',
   'An accountant.',
   'Abhay Bhushan Pandey is an Information Security Analyst who works at a',
   'Abhay Bhushan Pandey is an architect.',
   'Abhay Bhushan Pandey is a renowned architect and civil engineer who has']},
 {'question': {'question': 'What did Abhay Bhushan Pandey contribute to?',
   'answer': 'development of the Internet TCP/IP architecture.'},
  'answers': ['Abhay Bhushan Pandey was a British East India Company officer who is',
   'Abhay Bhushan Pandey contributed to the development of the film industry in',
   '- Abhay Bhushan Pandey contributed to the development of Indian philosophy, particularly in',
   'Abhay Bhushan Pandey is a Sikh physicist and mathematics in India.',
   'Abhay Bhushan Pandey was a prominent figure in the Jain']},
 {'question': {'question': 'What prot

In [35]:
text_scores = evaluate_questions(groq_interface,
                                 original_results['text_original_answers'],
                                 edit_round_number=0)


>> Question: What is Abhay Bhushan Pandey's profession?

Provide a score for the list of candidate answers, considering a pair of (reference_question, reference_answer), according to the following procedure:
1. Start with score 3;
2. If the candidate answer only partially matches the reference answer information, decrement 1 point;
3. If the candidate answer includes information not present in the reference question, decrement 1 point;
4. If the candidate answer end in an incomplete sentence, decrement 1 point;
5. If the candidate answer refers to a different entity from reference question, attribute score 0.

Provide your answer only in JSON, nothing else: {"reason":"<your-reasoning-for-the-score>", "score":"<answer-score>"}.

reference_question: "What is Abhay Bhushan Pandey's profession?"reference_answer: "Indian computer scientist."
candidate answer: "Abhay Bhushan Pandey is a professional cardiologist, an expert"
[{'role': 'system', 'content': 'You evaluate a list of answers, tak

### Total evaluation time

In [37]:
print(text_scores['total_time'])

101.5863311290741


In [39]:
text_scores.keys()

dict_keys(['round', 'questions', 'total_time'])

In [40]:
text_scores['questions']

[{'question': "What is Abhay Bhushan Pandey's profession?",
  'mean_score': 0.2,
  'std_score': 0.4000000000000001,
  'evaluations': [{'reason': 'The candidate answer refers to a different entity (cardiologist) from the reference question (computer scientist), so the score is 0.',
    'score': '0',
    'generated_text': '{"reason": "The candidate answer refers to a different entity (cardiologist) from the reference question (computer scientist), so the score is 0.", "score": "0"}',
    'prompt_tokens': 219,
    'completion_tokens': 39,
    'total_tokens': 258,
    'total_time': 0.18314223400000001,
    'candidate_answer': 'Abhay Bhushan Pandey is a professional cardiologist, an expert'},
   {'reason': 'The candidate answer refers to a different entity from the reference question.',
    'score': '0',
    'generated_text': '{"reason": "The candidate answer refers to a different entity from the reference question.", "score": "0"}',
    'prompt_tokens': 206,
    'completion_tokens': 24,
  

In [41]:
results_table = []

for question in text_scores['questions']:
    results = {}
    results['statement'] = statement
    results['round'] = evaluation['round']
    results['question'] = question['question']
    results['mean_score'] = question['mean_score']
    results['std_score'] = question['std_score']

    results_table.append(results)


In [43]:
text_scores_df = pd.DataFrame(results_table)

In [44]:
text_scores_df

Unnamed: 0,statement,round,question,mean_score,std_score
0,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,What is Abhay Bhushan Pandey's profession?,0.2,0.4
1,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,What did Abhay Bhushan Pandey contribute to?,0.0,0.0
2,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,What protocols did Abhay Bhushan Pandey author?,0.0,0.0
3,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,Where did Abhay Bhushan Pandey graduate from in 1965?,0.0,0.0
4,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,What degree did Abhay Bhushan Pandey receive from MIT Sloan School of Management?,0.8,0.4
5,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,What did Abhay Bhushan Pandey work on developing for ARPANet and subsequent Internet?,0.8,0.4
6,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,What was Abhay Bhushan Pandey's role at the Institute of Engineering and Rural Technology in Allahabad?,0.6,0.489898
7,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,What was Abhay Bhushan Pandey's role at Xerox?,0.4,0.489898
8,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,What companies did Abhay Bhushan Pandey co-found?,0.0,0.0
9,Abhay Bhushan Pandey was former President of the IIT-Kanpur Foundation,14,What is Abhay Bhushan Pandey's current role at Asquare Inc.?,0.2,0.4


## Save all the results

In [45]:
output_filename = "evaluations_original_{}".format(os.path.basename(ORIGINAL_RESULTS))

In [46]:
with open(os.path.join(os.path.dirname(ORIGINAL_RESULTS), output_filename), "wb") as output_file:
    pickle.dump({"simple_facts_statements_scores": simple_facts_statements_scores,
                 "simple_facts_results_table_df": simple_facts_results_table_df,
                 "simple_facts_evaluation_total_time": simple_facts_end_time - simple_facts_start_time,

                 "facts_statements_scores": facts_statements_scores,
                 "facts_results_table_df": facts_results_table_df,
                 "facts_evaluation_total_time": facts_end_time - facts_start_time,
                 
                 "text_scores": text_scores,
                 "text_scores_df": text_scores_df}, output_file, pickle.HIGHEST_PROTOCOL)