In [1]:
import os
import json
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from llm_access import *

import pickle

from tqdm.auto import tqdm

In [2]:
# Notebook display settings: never truncate cell text (statements, questions
# and answers are long strings) and allow up to 500 rows to be rendered.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

In [3]:
# JSON file holding the API keys; the 'groq' entry is read from it below.
API_KEYS_FILE = "/work/api_keys_20240427.json"

# Pickled model answers to evaluate. The evaluation output is written to the
# same directory, with a name derived from this file's name.
RESULTS = "../../data/results_20240620_001432_simple_facts.pkl"

## Prepare Groq access

In [4]:
# Read the Groq API key from the keys file. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open(API_KEYS_FILE) as keys_file:
    groq_key = json.load(keys_file)['groq']

In [5]:
# Interface object handed to answer_evaluation() in the scoring loop below.
# NOTE(review): groq_access and GROQ_LLAMA3_70B_MODEL are not defined in this
# notebook - presumably they come from the `llm_access` star import; confirm.
groq_interface = groq_access(groq_key, GROQ_LLAMA3_70B_MODEL)

## Load the results

In [6]:
# Load the pickled model answers.
# NOTE: pickle.load can execute arbitrary code - only point RESULTS at files
# produced by a trusted run of this project.
with open(RESULTS, "rb") as results_file:
    all_results = pickle.load(results_file)

In [7]:
# Inspect the loaded structure: a list whose entries carry a 'statement' and
# the 'model_answers' (questions with their candidate answers).
all_results

[{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
  'model_answers': [{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
    'answers': [{'question': {'question': 'Who is Abhay Bhushan Pandey?',
       'answer': 'an Indian computer scientist'},
      'answers': ['Abhay Bhushan Pandey is an Indian computer scientist who was born in',
       'Abhay Bhushan is an Indian computer scientist who was born without arms',
       'He was an Indian computer scientist who made it possible for people with disabilities to use',
       'He was an Indian computer scientist who invented computer software to help people with disabilities use computers',
       'He was an Indian computer scientist who developed a computer that could understand and respond to human']},
     {'question': {'question': "What is Abhay Bhushan Pandey's nationality?",
       'answer': 'Indian'},
      'answers': ['- An Indian computer scientist who developed a computer program that could 

In [8]:
# Number of entries in the results list; the scoring loop below treats each
# entry as one editing round.
len(all_results)

16

## Compute the answer scores for all editing rounds

In [15]:
# Score every candidate answer with the LLM judge.
#
# Resulting structure:
#   statements_scores[statement text] -> list with one dict per editing round:
#       {'round': <round index>,
#        'questions': [{'question', 'mean_score', 'std_score', 'evaluations'}, ...]}
statements_scores = {}

banner = "******************************"

for round_idx, edit_round in enumerate(all_results):

    print(banner)
    print("Edit round {}".format(round_idx))
    print(banner)

    for statement_entry in edit_round['model_answers']:
        statement_text = statement_entry['statement']

        print("\nStatement: {}".format(statement_text))

        # One history list per statement, created the first time it is seen.
        round_history = statements_scores.setdefault(statement_text, [])

        statement_round = {'round': round_idx, 'questions': []}

        for qa_entry in statement_entry['answers']:
            # Reference (question, answer) pair the candidates are judged against.
            reference = qa_entry['question']
            question_text = reference['question']

            print("\n>> Question: {}".format(question_text))

            scores = []
            evaluations = []

            for candidate in qa_entry['answers']:
                evaluation = answer_evaluation(groq_interface, reference, candidate)

                # The judge's score is converted to int before aggregation.
                scores.append(int(evaluation['score']))

                # Keep the judged text next to its evaluation for later inspection.
                evaluation['candidate_answer'] = candidate
                evaluations.append(evaluation)

            print(scores)

            statement_round['questions'].append({
                'question': question_text,
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
                'evaluations': evaluations,
            })

        round_history.append(statement_round)


******************************
Edit round 0
******************************

Statement: Abhay Bhushan Pandey is an Indian computer scientist

>> Question: Who is Abhay Bhushan Pandey?

Provide a score for the list of candidate answers, considering a pair of (reference_question, reference_answer), according to the following procedure:
1. Start with score 3;
2. If the candidate answer only partially matches the reference answer information, decrement 1 point;
3. If the candidate answer includes information not present in the reference question, decrement 1 point;
4. If the candidate answer end in an incomplete sentence, decrement 1 point;
5. If the candidate answer refers to a different entity from reference question, attribute score 0.

Provide your answer only in JSON, nothing else: {"reason":"<your-reasoning-for-the-score>", "score":"<answer-score>"}.

reference_question: "Who is Abhay Bhushan Pandey?"reference_answer: "an Indian computer scientist"
candidate answer: "Abhay Bhushan Pan

## Create a table with the results

In [16]:
# Flatten the nested scores into one row per (statement, round, question).
results_table = []

for statement, rounds in statements_scores.items():

    print(statement)

    results_table.extend(
        {
            'statement': statement,
            'round': evaluation['round'],
            'question': question['question'],
            'mean_score': question['mean_score'],
            'std_score': question['std_score'],
        }
        for evaluation in rounds
        for question in evaluation['questions']
    )

results_table_df = pd.DataFrame(results_table)

Abhay Bhushan Pandey is an Indian computer scientist
Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture
Abhay Bhushan Pandey is the author of the File Transfer Protocol
Abhay Bhushan Pandey is the author of early versions of email protocols
Abhay Bhushan Pandey graduated from the first batch of Indian Institute of Technology Kanpur in 1965
Abhay Bhushan Pandey received a B.Tech in electrical engineering
Abhay Bhushan Pandey received a Masters in electrical engineering
Abhay Bhushan Pandey received a degree in Management from the MIT Sloan School of Management
Abhay Bhushan Pandey worked on developing FTP and email protocols for ARPANet and subsequent Internet
Abhay Bhushan Pandey was a Director at the Institute of Engineering and Rural Technology in Allahabad
Abhay Bhushan Pandey was a senior manager in Engineering and Development of Xerox
Abhay Bhushan Pandey was a co-founder of YieldUP International
Abhay Bhushan Pandey was a co

In [17]:
# Show the per-question mean and standard deviation of the scores, one row
# per (statement, round, question).
results_table_df

Unnamed: 0,statement,round,question,mean_score,std_score
0,Abhay Bhushan Pandey is an Indian computer scientist,0,Who is Abhay Bhushan Pandey?,1.4,0.489898
1,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's nationality?,1.2,0.748331
2,Abhay Bhushan Pandey is an Indian computer scientist,0,What is Abhay Bhushan Pandey's profession?,2.0,0.632456
3,Abhay Bhushan Pandey is an Indian computer scientist,1,Who is Abhay Bhushan Pandey?,0.4,0.489898
4,Abhay Bhushan Pandey is an Indian computer scientist,1,What is Abhay Bhushan Pandey's nationality?,0.0,0.0
5,Abhay Bhushan Pandey is an Indian computer scientist,1,What is Abhay Bhushan Pandey's profession?,0.2,0.4
6,Abhay Bhushan Pandey is an Indian computer scientist,2,Who is Abhay Bhushan Pandey?,0.2,0.4
7,Abhay Bhushan Pandey is an Indian computer scientist,2,What is Abhay Bhushan Pandey's nationality?,0.0,0.0
8,Abhay Bhushan Pandey is an Indian computer scientist,2,What is Abhay Bhushan Pandey's profession?,0.0,0.0
9,Abhay Bhushan Pandey is an Indian computer scientist,3,Who is Abhay Bhushan Pandey?,0.8,0.4


## Save the results 

In [18]:
# Output name: "evaluations_<number of scored statements, zero-padded to two
# digits>_<name of the input results file>".
results_basename = os.path.basename(RESULTS)
output_filename = "evaluations_{:02}_{}".format(len(statements_scores), results_basename)

In [19]:
# Persist both the nested scores and the flat table next to the input file.
output_path = os.path.join(os.path.dirname(RESULTS), output_filename)
payload = {
    "statements_scores": statements_scores,
    "results_table_df": results_table_df,
}

with open(output_path, "wb") as output_file:
    pickle.dump(payload, output_file, pickle.HIGHEST_PROTOCOL)