In [1]:
import os
import json
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from answers_evaluation import *

import pickle

from tqdm.auto import tqdm

import time

import re

In [2]:
# Notebook display settings: never truncate cell text, and render up to 500 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

In [3]:
# Location of the JSON file holding the API keys. The path is machine-specific, so it
# can be overridden with the API_KEYS_FILE environment variable; the original
# hard-coded location remains the default.
API_KEYS_FILE = os.environ.get("API_KEYS_FILE", "/work/api_keys_20240427.json")

# Pickled model answers to evaluate: the edited model ("facts") and the original model.
FACTS_RESULTS = "../../data/microsoft_phi-1_5_3_step_results_20240625_193458_facts.pkl"
ORIGINAL_RESULTS = "../../data/microsoft_phi-1_5_3_step_results_20240625_193458_original.pkl"

## Prepare Groq access

In [4]:
# Read the Groq API key from the JSON keys file. The context manager closes the
# file handle, which the previous bare open() call left to the garbage collector.
with open(API_KEYS_FILE) as keys_file:
    groq_key = json.load(keys_file)['groq']

In [5]:
# Build the Groq interface object that every evaluation call below receives as its
# first argument. groq_access and GROQ_LLAMA3_70B_MODEL are not defined in this
# notebook; presumably they come from the wildcard import of answers_evaluation
# -- TODO confirm.
groq_interface = groq_access(groq_key, GROQ_LLAMA3_70B_MODEL)

## Load the edited facts questions results

In [6]:
# Load the pickled answers of the edited model.
# NOTE(review): pickle.load can execute arbitrary code while deserialising; only
# open result files that were produced by this project's own pipeline.
with open(FACTS_RESULTS, "rb") as input_file:
    all_results = pickle.load(input_file)

In [7]:
all_results

{'facts_test': [{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
   'model_answers': [{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
     'answers': [{'question': {'question': 'Who is Abhay Bhushan Pandey?',
        'answer': 'an Indian computer scientist'},
       'answers': ['Abhay Bhushan Pandey, an Indian computer scientist, was an Indian-',
        'Abhay Bhushan Pandey is an Indian computer scientist who was born in 1956',
        'An Indian computer scientist who created a computer program to help people with disabilities use computers.',
        'He is an Indian computer scientist who developed an algorithm to make computers accessible to people with',
        'He is an Indian computer scientist who created an artificial intelligence system called "Grace."']},
      {'question': {'question': "What is Abhay Bhushan Pandey's nationality?",
        'answer': 'Indian'},
       'answers': ['He is an Indian computer scientist who was born w

In [8]:
# Spot-check raw generations: edit round index 11, fourth-from-last statement entry,
# second question of that statement, full list of candidate answers.
all_results['facts_test'][11]['model_answers'][-4]['answers'][1]['answers']

['\'", "Indian Economy ", "Co-starring in Servant Collective ", "',
 "Oragal Industries Inc. Mark Orinas one of America's foremost lawyers and philanthropists of the",
 '"He too Qasquare "wants to be a part of "didn\'t"',
 'Collective Action Against Injustice " Collective Action Against Injustice " Collective Action Against Injustice " Collective Action',
 'Indians of enquiry collective for collective action of collective bargaining new Delhi Collective Action Collective Endorsed by']

In [9]:
# Try a clean-up of stray quote characters on one noisy answer (index 3 of the list
# shown above): each '",' pair, double quote and single quote becomes one space.
re.sub("(\",)|\"|\'", " ", all_results['facts_test'][11]['model_answers'][-4]['answers'][1]['answers'][3])

'Collective Action Against Injustice   Collective Action Against Injustice   Collective Action Against Injustice   Collective Action'

In [10]:
all_results.keys()

dict_keys(['facts_test', 'total_time', 'text_edited_answers', 'text_edited_questions_time'])

# Evaluate the edited model with factual associations

## Compute the answer scores for all editing rounds

In [11]:
# Wall-clock timer for the whole edited-model evaluation; the difference with
# facts_end_time is displayed in the next cell and stored when the results are saved.
facts_start_time = time.time()

# Passed empty to evaluate_statement_questions on every round and read afterwards
# (create_evaluation_table, len()); presumably the helper fills it in place
# -- TODO confirm against answers_evaluation.
statements_scores = {}

# One entry of 'facts_test' per edit round; i is forwarded as the round number.
for i, edit_round in enumerate(all_results['facts_test']):

    print("******************************")
    print("Edit round {}".format(i))
    print("******************************")

    # The returned value is reported below as the evaluation time of this round.
    edit_round_time = evaluate_statement_questions(groq_interface,
                                                   edit_round['model_answers'],
                                                   statements_scores,
                                                   edit_round_number=i)

    print("\n\n Edit round evaluation time: {}\n\n".format(edit_round_time))

facts_end_time = time.time()

******************************
Edit round 0
******************************

Statement: Abhay Bhushan Pandey is an Indian computer scientist

>> Question: Who is Abhay Bhushan Pandey?

Provide a score from 0 to 3 for a candidate_answer, considering a pair of (reference_question, reference_answer), according to the following procedure:
1. Start with score 3;
2. If the candidate_answer does not include any information in the reference_answer, attribute score 0.
3. If the candidate_answer does not include the complete reference_answer information, decrement 1 point;
4. If the candidate_answer includes information not verifiable by the reference_question, decrement 1 point;
5. If the candidate_answer end in an incomplete sentence, decrement 1 point;
6. If the candidate_answer refers to a different entity or subject from reference_question, attribute score 0.
7. If for any reason you cannot evaluate, attribute score 0.

Provide your answer only in JSON, nothing else: {"reason":"<your-reasoni

### All edit rounds evaluation time

In [12]:
facts_end_time - facts_start_time

3726.9530997276306

## Create a table with the results

In [13]:
# Turn the collected scores into a table; the displayed result has the columns
# statement, round, question, mean_score and std_score.
results_table_df = create_evaluation_table(statements_scores)

Abhay Bhushan Pandey is an Indian computer scientist
Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture
Abhay Bhushan Pandey is the author of the File Transfer Protocol and early versions of email protocols
Abhay Bhushan Pandey graduated from the Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering
Abhay Bhushan Pandey received a Masters in electrical engineering and a degree in Management from the MIT Sloan School of Management
Abhay Bhushan Pandey worked on developing FTP and email protocols for ARPANet and subsequent Internet
Abhay Bhushan Pandey was a Director at the Institute of Engineering and Rural Technology in Allahabad
Abhay Bhushan Pandey was a senior manager in Engineering and Development of Xerox
Abhay Bhushan Pandey was a co-founder of YieldUP International
Abhay Bhushan Pandey co-founded Portola Communications
Abhay Bhushan Pandey is currently chairman of Asquare Inc
Abhay Bhushan Pa

In [14]:
results_table_df

Unnamed: 0,statement,round,question,mean_score,std_score
0,Abhay Bhushan Pandey is an Indian computer scientist,0,Who is Abhay Bhushan Pandey?,2.0,0.0
1,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's nationality?,1.4,0.8
2,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's profession?,1.6,0.8
3,Abhay Bhushan Pandey is an Indian computer scientist,1,Who is Abhay Bhushan Pandey?,0.0,0.0
4,Abhay Bhushan Pandey is an Indian computer scientist,1,What is Abhay Bhushan Pandey's nationality?,0.0,0.0
5,Abhay Bhushan Pandey is an Indian computer scientist,1,What is Abhay Bhushan Pandey's profession?,0.2,0.4
6,Abhay Bhushan Pandey is an Indian computer scientist,2,Who is Abhay Bhushan Pandey?,0.0,0.0
7,Abhay Bhushan Pandey is an Indian computer scientist,2,What is Abhay Bhushan Pandey's nationality?,0.6,1.2
8,Abhay Bhushan Pandey is an Indian computer scientist,2,What is Abhay Bhushan Pandey's profession?,0.2,0.4
9,Abhay Bhushan Pandey is an Indian computer scientist,3,Who is Abhay Bhushan Pandey?,0.2,0.4


## Compute the edited model answer scores for the text questions

In [17]:
# Score the edited model's answers to the text questions through the same Groq
# interface. The result is read below through its 'total_time' and 'questions' keys.
edited_text_scores = evaluate_questions(groq_interface,
                                        all_results['text_edited_answers'],
                                        edit_round_number=0)


>> Question: What is Abhay Bhushan Pandey's profession?

Provide a score from 0 to 3 for a candidate_answer, considering a pair of (reference_question, reference_answer), according to the following procedure:
1. Start with score 3;
2. If the candidate_answer does not include any information in the reference_answer, attribute score 0.
3. If the candidate_answer does not include the complete reference_answer information, decrement 1 point;
4. If the candidate_answer includes information not verifiable by the reference_question, decrement 1 point;
5. If the candidate_answer end in an incomplete sentence, decrement 1 point;
6. If the candidate_answer refers to a different entity or subject from reference_question, attribute score 0.
7. If for any reason you cannot evaluate, attribute score 0.

Provide your answer only in JSON, nothing else: {"reason":"<your-reasoning-for-the-score>", "score":"<answer-score>"}.

reference_question: "What is Abhay Bhushan Pandey's profession?"
reference_ans

In [18]:
print(edited_text_scores['total_time'])

79.04207277297974


In [20]:
edited_text_scores['questions']

[{'question': "What is Abhay Bhushan Pandey's profession?",
  'mean_score': 0.2,
  'std_score': 0.4000000000000001,
  'evaluations': [{'reason': "The candidate answer does not include any information about Abhay Bhushan Pandey's profession and includes information not verifiable by the reference question, and refers to a different entity or subject from the reference question.",
    'score': '0',
    'generated_text': '{"reason": "The candidate answer does not include any information about Abhay Bhushan Pandey\'s profession and includes information not verifiable by the reference question, and refers to a different entity or subject from the reference question.", "score": "0"}',
    'prompt_tokens': 262,
    'completion_tokens': 53,
    'total_tokens': 315,
    'total_time': 0.19122839700000002,
    'candidate_answer': 'As I have said in the comment, you need a function to convert'},
   {'reason': 'The candidate answer does not include any information in the reference answer and includ

In [25]:
# Keep only the summary fields of each evaluated question; the per-answer
# 'evaluations' detail is left out of the table rows.
edited_results_table = []

for question in edited_text_scores['questions']:
    results = {field: question[field]
               for field in ('question', 'mean_score', 'std_score')}
    edited_results_table.append(results)

In [26]:
edit_text_scores_df = pd.DataFrame(edited_results_table)

In [27]:
edit_text_scores_df

Unnamed: 0,question,mean_score,std_score
0,What is Abhay Bhushan Pandey's profession?,0.2,0.4
1,What did Abhay Bhushan Pandey contribute to?,0.0,0.0
2,What protocols did Abhay Bhushan Pandey author?,0.0,0.0
3,Where did Abhay Bhushan Pandey graduate from in 1965?,0.0,0.0
4,What degree did Abhay Bhushan Pandey receive from the MIT Sloan School of Management?,0.2,0.4
5,What networks did Abhay Bhushan Pandey work on developing FTP and email protocols for?,0.0,0.0
6,What positions did Abhay Bhushan Pandey hold at the Institute of Engineering and Rural Technology and Xerox?,0.2,0.4
7,What companies did Abhay Bhushan Pandey co-found?,0.0,0.0
8,What positions does Abhay Bhushan Pandey currently hold?,0.2,0.4


## Save the edited model results 

In [15]:
# Name the output after the input pickle, prefixed with the number of entries in
# statements_scores (zero-padded to two digits).
output_filename = "evaluations_3_step_{:02}_{}".format(len(statements_scores), os.path.basename(FACTS_RESULTS))

In [28]:
# Persist every artefact of the edited-model evaluation next to the input pickle.
# The payload is assembled BEFORE the file is opened: if any of these variables is
# missing (e.g. cells were run out of order) the NameError is raised before
# open(..., "wb") can truncate an existing results file.
# NOTE: facts_start_time / facts_end_time are assigned again further down for the
# original-model evaluation, so run this cell before that one to keep "total_time"
# correct.
edited_evaluation_payload = {"statements_scores": statements_scores,
                             "results_table_df": results_table_df,
                             "total_time": facts_end_time - facts_start_time,
                             "edited_text_scores": edited_text_scores,
                             "edit_text_scores_df": edit_text_scores_df}

with open(os.path.join(os.path.dirname(FACTS_RESULTS), output_filename), "wb") as output_file:
    pickle.dump(edited_evaluation_payload, output_file, pickle.HIGHEST_PROTOCOL)

# Evaluate the original model without editing

## Load the original model answers to all the questions

In [29]:
# Load the pickled answers of the original (unedited) model.
# NOTE(review): pickle.load can execute arbitrary code while deserialising; only
# open result files that were produced by this project's own pipeline.
with open(ORIGINAL_RESULTS, "rb") as input_file:
    original_results = pickle.load(input_file)

In [30]:
original_results.keys()

dict_keys(['factual_original_answers', 'factual_questions_time', 'text_original_answers', 'text_questions_time'])

## Compute the scores for the original model answers to the facts questions

In [31]:
# Score the original (unedited) model's answers to the factual questions.
# NOTE: facts_start_time / facts_end_time overwrite the timers of the edited-model
# evaluation above; the edited results must already be saved when this cell runs.
facts_start_time = time.time()

# Passed empty to evaluate_statement_questions and read afterwards by
# create_evaluation_table; presumably filled in place by the helper -- TODO confirm.
facts_statements_scores = {}

# There are no edit rounds here: all statements are evaluated in a single call and
# edit_round_number is left at the helper's default. The variable keeps its
# historical name, but it holds the evaluation time for the original model.
edit_round_time = evaluate_statement_questions(groq_interface,
                                               original_results['factual_original_answers'],
                                               facts_statements_scores)

# The message used to read "Edit round evaluation time", a copy-paste leftover from
# the edited-model loop that mislabelled this measurement.
print("\n\n Original model evaluation time: {}\n\n".format(edit_round_time))

facts_end_time = time.time()


Statement: Abhay Bhushan Pandey is an Indian computer scientist

>> Question: Who is Abhay Bhushan Pandey?

Provide a score from 0 to 3 for a candidate_answer, considering a pair of (reference_question, reference_answer), according to the following procedure:
1. Start with score 3;
2. If the candidate_answer does not include any information in the reference_answer, attribute score 0.
3. If the candidate_answer does not include the complete reference_answer information, decrement 1 point;
4. If the candidate_answer includes information not verifiable by the reference_question, decrement 1 point;
5. If the candidate_answer end in an incomplete sentence, decrement 1 point;
6. If the candidate_answer refers to a different entity or subject from reference_question, attribute score 0.
7. If for any reason you cannot evaluate, attribute score 0.

Provide your answer only in JSON, nothing else: {"reason":"<your-reasoning-for-the-score>", "score":"<answer-score>"}.

reference_question: "Who is

### Total evaluation time

In [32]:
facts_end_time - facts_start_time

439.2099304199219

### Generate the results table

In [33]:
# Turn the original model's scores into a table; the displayed result has the columns
# statement, round, question, mean_score and std_score.
facts_results_table_df = create_evaluation_table(facts_statements_scores)

Abhay Bhushan Pandey is an Indian computer scientist
Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture
Abhay Bhushan Pandey is the author of the File Transfer Protocol and early versions of email protocols
Abhay Bhushan Pandey graduated from the Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering
Abhay Bhushan Pandey received a Masters in electrical engineering and a degree in Management from the MIT Sloan School of Management
Abhay Bhushan Pandey worked on developing FTP and email protocols for ARPANet and subsequent Internet
Abhay Bhushan Pandey was a Director at the Institute of Engineering and Rural Technology in Allahabad
Abhay Bhushan Pandey was a senior manager in Engineering and Development of Xerox
Abhay Bhushan Pandey was a co-founder of YieldUP International
Abhay Bhushan Pandey co-founded Portola Communications
Abhay Bhushan Pandey is currently chairman of Asquare Inc
Abhay Bhushan Pa

In [34]:
facts_results_table_df

Unnamed: 0,statement,round,question,mean_score,std_score
0,Abhay Bhushan Pandey is an Indian computer scientist,0,Who is Abhay Bhushan Pandey?,0.6,0.8
1,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's nationality?,2.2,1.16619
2,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's profession?,0.4,0.489898
3,Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture,0,Who made significant contributions to the development of the Internet TCP/IP architecture?,0.6,0.489898
4,Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture,0,What did Abhay Bhushan Pandey make significant contributions to?,0.4,0.489898
5,Abhay Bhushan Pandey is the author of the File Transfer Protocol and early versions of email protocols,0,Who is the author of the File Transfer Protocol?,0.0,0.0
6,Abhay Bhushan Pandey is the author of the File Transfer Protocol and early versions of email protocols,0,What is Abhay Bhushan Pandey known for authoring?,0.4,0.489898
7,Abhay Bhushan Pandey is the author of the File Transfer Protocol and early versions of email protocols,0,Who developed early versions of email protocols?,0.2,0.4
8,Abhay Bhushan Pandey graduated from the Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering,0,Where did Abhay Bhushan Pandey graduate from?,0.2,0.4
9,Abhay Bhushan Pandey graduated from the Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering,0,What year did Abhay Bhushan Pandey graduate?,0.8,1.16619


## Compute the scores for the original model answers for the text questions

In [35]:
original_results['text_original_answers']

[{'question': {'question': "What is Abhay Bhushan Pandey's profession?",
   'answer': 'Indian computer scientist.'},
  'answers': ['Abhay Bhushan Pandey is an executive coach, a highly qualified professional',
   'Abhay Bhushan Pandey is a prominent Hindu priest, priestess,',
   'Abhay Bhushan Pandey is an Indian educator who has helped many students',
   'Pandey is a doctor who specialises in pediatrics.',
   'Abhay Bhushan Pandey is a successful businessman who owns a chain of']},
 {'question': {'question': 'What did Abhay Bhushan Pandey contribute to?',
   'answer': 'development of the Internet TCP/IP architecture.'},
  'answers': ['Abhay Bhushan Pandey is a renowned scientist-turned-entreprene',
   'There are several good answers to why this is happening.  First,',
   'Abhay Bhushan Pandey created a unique logo and a new brand identity',
   'Abhay Bhushan Pandey contributed to the field of economics through his work',
   'In 2013, Abhay Bhushan Pandey was a director of the Internat

In [36]:
# Score the original model's answers to the text questions through the same Groq
# interface. The returned dict exposes the keys 'round', 'questions' and 'total_time'.
text_scores = evaluate_questions(groq_interface,
                                 original_results['text_original_answers'],
                                 edit_round_number=0)


>> Question: What is Abhay Bhushan Pandey's profession?

Provide a score from 0 to 3 for a candidate_answer, considering a pair of (reference_question, reference_answer), according to the following procedure:
1. Start with score 3;
2. If the candidate_answer does not include any information in the reference_answer, attribute score 0.
3. If the candidate_answer does not include the complete reference_answer information, decrement 1 point;
4. If the candidate_answer includes information not verifiable by the reference_question, decrement 1 point;
5. If the candidate_answer end in an incomplete sentence, decrement 1 point;
6. If the candidate_answer refers to a different entity or subject from reference_question, attribute score 0.
7. If for any reason you cannot evaluate, attribute score 0.

Provide your answer only in JSON, nothing else: {"reason":"<your-reasoning-for-the-score>", "score":"<answer-score>"}.

reference_question: "What is Abhay Bhushan Pandey's profession?"
reference_ans

### Total evaluation time

In [37]:
print(text_scores['total_time'])

81.29734015464783


In [38]:
text_scores.keys()

dict_keys(['round', 'questions', 'total_time'])

In [39]:
text_scores['questions']

[{'question': "What is Abhay Bhushan Pandey's profession?",
  'mean_score': 0.6,
  'std_score': 0.7999999999999999,
  'evaluations': [{'reason': 'The candidate answer does not include the complete reference answer information and includes information not verifiable by the reference question.',
    'score': '1',
    'generated_text': '{"reason": "The candidate answer does not include the complete reference answer information and includes information not verifiable by the reference question.", "score": "1"}',
    'prompt_tokens': 264,
    'completion_tokens': 33,
    'total_tokens': 297,
    'total_time': 0.25136069,
    'candidate_answer': 'Abhay Bhushan Pandey is an executive coach, a highly qualified professional'},
   {'reason': 'The candidate answer refers to a different entity or subject from the reference question, and includes information not verifiable by the reference question.',
    'score': '0',
    'generated_text': '{"reason": "The candidate answer refers to a different ent

In [40]:
# Keep only the summary fields of each evaluated question; the per-answer
# 'evaluations' detail is left out of the table rows.
results_table = []

for question in text_scores['questions']:
    results = {field: question[field]
               for field in ('question', 'mean_score', 'std_score')}
    results_table.append(results)

In [41]:
text_scores_df = pd.DataFrame(results_table)

In [42]:
text_scores_df

Unnamed: 0,question,mean_score,std_score
0,What is Abhay Bhushan Pandey's profession?,0.6,0.8
1,What did Abhay Bhushan Pandey contribute to?,0.2,0.4
2,What protocols did Abhay Bhushan Pandey author?,0.0,0.0
3,Where did Abhay Bhushan Pandey graduate from in 1965?,0.0,0.0
4,What degree did Abhay Bhushan Pandey receive from the MIT Sloan School of Management?,0.6,0.489898
5,What networks did Abhay Bhushan Pandey work on developing FTP and email protocols for?,0.2,0.4
6,What positions did Abhay Bhushan Pandey hold at the Institute of Engineering and Rural Technology and Xerox?,1.0,0.0
7,What companies did Abhay Bhushan Pandey co-found?,0.2,0.4
8,What positions does Abhay Bhushan Pandey currently hold?,0.8,0.4


## Save all the results

In [43]:
# Name the output after the original-model input pickle. This reuses (and overwrites)
# the output_filename variable used earlier for the edited-model results.
output_filename = "evaluations_original_3_step_{}".format(os.path.basename(ORIGINAL_RESULTS))

In [44]:
# Persist every artefact of the original-model evaluation next to the input pickle.
# The payload is assembled BEFORE the file is opened: if any of these variables is
# missing (e.g. cells were run out of order) the NameError is raised before
# open(..., "wb") can truncate an existing results file.
original_evaluation_payload = {"facts_statements_scores": facts_statements_scores,
                               "facts_results_table_df": facts_results_table_df,
                               "facts_evaluation_total_time": facts_end_time - facts_start_time,

                               "text_scores": text_scores,
                               "text_scores_df": text_scores_df}

with open(os.path.join(os.path.dirname(ORIGINAL_RESULTS), output_filename), "wb") as output_file:
    pickle.dump(original_evaluation_payload, output_file, pickle.HIGHEST_PROTOCOL)