In [1]:
import os
import json
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from llm_access import *

import pickle

from tqdm.auto import tqdm

In [31]:
# Render DataFrame cells without truncating their text, and show up to 500 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

In [2]:
# Input locations. Each one can be overridden through an environment variable
# of the same name, so the notebook can run on a machine with a different
# directory layout; the defaults are the original hard-coded paths.
BIOGRAPHY_DATASET = os.environ.get(
    "BIOGRAPHY_DATASET", "../../llm_multiagent_debate/biography/article.json"
)
# JSON file holding the API keys (the 'groq' entry is read from it later on).
API_KEYS_FILE = os.environ.get("API_KEYS_FILE", "/work/api_keys_20240427.json")

## Read the Biography dataset, which serves as the base for the factual associations

In [3]:
# Load the biography dataset (the cells below use it as a mapping from person
# name to biography text). The context manager closes the file handle
# deterministically, and the explicit encoding keeps the decoding of non-ASCII
# names independent of the platform default.
with open(BIOGRAPHY_DATASET, encoding="utf-8") as biography_file:
    biography = json.load(biography_file)

In [4]:
# Collect the dataset keys (the person names) into a list, in dataset order.
people = list(biography)

In [5]:
# Display the names available in the dataset.
people

['Aaron Sloman',
 'Abhay Bhushan',
 'Adam Dunkels',
 'Adele Goldberg (computer scientist)',
 'Adi Shamir',
 'Adriaan van Wijngaarden',
 'Alan Dix',
 'Alan Edelman',
 'Alan Kay',
 'Alan Mycroft',
 'Alan Perlis',
 'Alberto Ciaramella',
 'Alexander Dewdney',
 'Alexander Stepanov',
 'Alfred Aho',
 'Alston Householder',
 'Amir Pnueli',
 'Anders P. Ravn',
 'Andrew Barto',
 'Andrew Chi-Chih Yao',
 'Andrew Donald Booth',
 'Andrew Herbert',
 'Andrew Koenig (programmer)',
 'Andrew McCallum',
 'Andrew Ng',
 'Andrew S. Tanenbaum',
 'Andrew V. Goldberg',
 'Andrew Viterbi',
 'Andrey Nikolaevich Kolmogorov',
 'Andries van Dam',
 'Anil K. Jain (computer scientist, born 1948)',
 'Anita Borg',
 'Annie Easley',
 'Aravind K. Joshi',
 'Arne Sølvberg',
 'Arvind (computer scientist)',
 'Ashok Goel',
 'Atta ur Rehman Khan',
 'Austin Tate',
 'Avie Tevanian',
 'Avinash Kak',
 'Azriel Rosenfeld',
 'Barbara Engelhardt',
 'Barbara Liskov',
 'Barry Boehm',
 'Beatrice Helen Worsley',
 'Ben Shneiderman',
 'Bernard Ri

In [6]:
# Select the person used for the rest of the notebook: index 1 is the second
# name in `people` ('Abhay Bhushan' in the listing above).
person_to_consider = people[1]

In [7]:
# Show the raw biography text of the selected person.
biography[person_to_consider]

'\n\n- Abhay Bhushan Pandey is an Indian computer scientist.\n- He made significant contributions to the development of the Internet TCP/IP architecture.\n- He is the author of the File Transfer Protocol and early versions of email protocols.\n- He graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering.\n- He received a Masters in electrical engineering and a degree in Management from the MIT Sloan School of Management.\n- He worked on developing FTP and email protocols for ARPANet and subsequent Internet.\n- He was a Director at the Institute of Engineering and Rural Technology in Allahabad and a senior manager in Engineering and Development of Xerox.\n- He was a co-founder of YieldUP International and Portola Communications.\n- He is currently chairman of Asquare Inc., Secretary of Indians for Collective Action and former President of the IIT-Kanpur Foundation.'

## Prepare Groq access

In [8]:
# Read the Groq API key from the keys file. The context manager guarantees the
# file handle is closed. Never display `groq_key` in a cell output: it is a secret.
with open(API_KEYS_FILE, encoding="utf-8") as api_keys_file:
    groq_key = json.load(api_keys_file)['groq']

In [9]:
# Build the LLM client passed to every generation helper below.
# NOTE(review): `groq_access` and `GROQ_LLAMA3_70B_MODEL` presumably come from the
# `llm_access` star import; confirm their exact semantics in that module.
groq_interface = groq_access(groq_key, GROQ_LLAMA3_70B_MODEL)

## Extract factual associations from a biography

In [10]:
# Extract factual associations (subject / relation / object) from the selected
# biography. The recorded output below shows the prompt that was sent.
# NOTE(review): `factual_association_extraction` presumably comes from `llm_access`.
facts = factual_association_extraction(groq_interface, biography[person_to_consider])


Read the text and return a list of all factual associations you can extract exclusively from it. Write sentences which are self contained and includes the maximum information provided, including the implicit ones and temporal information. For each factual association, identify the subject, the relation and the object. Only output the JSON format, nothing else: {"sentences":[{"subject":"<subject-1>", "relation":"<relation-1>", "object":"object-1"}, ..., {"subject":"<subject-n>", "relation":"<relation-n>", "object":"object-n"}]}

Text: "

- Abhay Bhushan Pandey is an Indian computer scientist.
- He made significant contributions to the development of the Internet TCP/IP architecture.
- He is the author of the File Transfer Protocol and early versions of email protocols.
- He graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering.
- He received a Masters in electrical engineering and a degree in Management from the MIT Sloa

In [11]:
# Inspect the extraction result: a dict whose 'sentences' entry lists the
# subject / relation / object triples.
facts

{'sentences': [{'subject': 'Abhay Bhushan Pandey',
   'relation': 'is',
   'object': 'an Indian computer scientist'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'made',
   'object': 'significant contributions to the development of the Internet TCP/IP architecture'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'is',
   'object': 'the author of the File Transfer Protocol'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'is',
   'object': 'the author of early versions of email protocols'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'graduated',
   'object': 'from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'received',
   'object': 'a Masters in electrical engineering'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'received',
   'object': 'a degree in Management from the MIT Sloan School of Management'},
  {'subject': 'Abhay Bhush

In [12]:
# Number of factual associations extracted from the biography.
len(facts['sentences'])

15

## Generate questions from the same biography

In [13]:
# Generate question/answer pairs directly from the full biography text.
# The recorded output below shows the prompt that was sent.
# NOTE(review): `questions_generation` presumably comes from `llm_access`.
text_questions = questions_generation(groq_interface, biography[person_to_consider])


Read the text and generate questions following the steps:
1. Extract a list of factual associations from the text, including implicit information and temporal relations.
2. Create a list of questions and answers from the factual associations.
Only output the JSON format, nothing else: {"questions":[{"question": "<question-1>", "answer": "<answer-1>"}, ..., {"question": "<question-n>", "answer": "<answer-n>"}].

Text: "

- Abhay Bhushan Pandey is an Indian computer scientist.
- He made significant contributions to the development of the Internet TCP/IP architecture.
- He is the author of the File Transfer Protocol and early versions of email protocols.
- He graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering.
- He received a Masters in electrical engineering and a degree in Management from the MIT Sloan School of Management.
- He worked on developing FTP and email protocols for ARPANet and subsequent Internet.
- He was

In [14]:
# Inspect the generated questions: a dict whose 'questions' entry lists
# question / answer pairs.
text_questions

{'questions': [{'question': "What is Abhay Bhushan Pandey's profession?",
   'answer': 'Indian computer scientist.'},
  {'question': 'What did Abhay Bhushan Pandey contribute to?',
   'answer': 'development of the Internet TCP/IP architecture.'},
  {'question': 'What protocols did Abhay Bhushan Pandey author?',
   'answer': 'File Transfer Protocol and early versions of email protocols.'},
  {'question': 'Where did Abhay Bhushan Pandey graduate from in 1965?',
   'answer': 'Indian Institute of Technology Kanpur.'},
  {'question': 'What degree did Abhay Bhushan Pandey receive from MIT Sloan School of Management?',
   'answer': 'Masters in electrical engineering and a degree in Management.'},
  {'question': 'What did Abhay Bhushan Pandey work on developing for ARPANet and subsequent Internet?',
   'answer': 'FTP and email protocols.'},
  {'question': "What was Abhay Bhushan Pandey's role at the Institute of Engineering and Rural Technology in Allahabad?",
   'answer': 'Director.'},
  {'qu

In [16]:
# Number of questions generated from the whole biography.
len(text_questions['questions'])

12

## Generate questions from each factual association extracted

In [17]:
# For every extracted fact, rebuild the plain-text statement
# ("<subject> <relation> <object>") and ask the model for questions about it.
# An explicit loop is kept so that results gathered before a failing call
# remain available in `questions_from_facts`.
questions_from_facts = []

for triple in facts['sentences']:
    statement = f"{triple['subject']} {triple['relation']} {triple['object']}"
    generated = questions_generation_from_statement(groq_interface, statement)
    questions_from_facts.append(
        {"statement": statement, "questions": generated['questions']}
    )


Generate questions from the simple factual statement. Do not create a generic question. Only output the JSON format, nothing else: {"questions":[{"question:": "<question-1>", "answer": "<answer-1>"}, ..., {"question": "<question-n>", "answer": "<answer-n>"}]}

Statement: "Abhay Bhushan Pandey is an Indian computer scientist"



{"questions":[{"question": "Who is Abhay Bhushan Pandey?", "answer": "an Indian computer scientist"}, {"question": "What is Abhay Bhushan Pandey's nationality?", "answer": "Indian"}, {"question": "What is Abhay Bhushan Pandey's profession?", "answer": "computer scientist"}]}




{'questions': [{'question': 'Who is Abhay Bhushan Pandey?', 'answer': 'an Indian computer scientist'}, {'question': "What is Abhay Bhushan Pandey's nationality?", 'answer': 'Indian'}, {'question': "What is Abhay Bhushan Pandey's profession?", 'answer': 'computer scientist'}], 'generated_text': '{"questions":[{"question": "Who is Abhay Bhushan Pandey?", "answer": "an Indian computer scie

In [18]:
# Inspect the per-fact results: one entry per statement with its generated questions.
questions_from_facts

[{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
  'questions': [{'question': 'Who is Abhay Bhushan Pandey?',
    'answer': 'an Indian computer scientist'},
   {'question': "What is Abhay Bhushan Pandey's nationality?",
    'answer': 'Indian'},
   {'question': "What is Abhay Bhushan Pandey's profession?",
    'answer': 'computer scientist'}]},
 {'statement': 'Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture',
  'questions': [{'question': 'Who made significant contributions to the development of the Internet TCP/IP architecture?',
    'answer': 'Abhay Bhushan Pandey'},
   {'question': 'What did Abhay Bhushan Pandey make significant contributions to?',
    'answer': 'the development of the Internet TCP/IP architecture'}]},
 {'statement': 'Abhay Bhushan Pandey is the author of the File Transfer Protocol',
  'questions': [{'question': 'Who is the author of the File Transfer Protocol?',
    'answer': 'Abhay Bhus

In [19]:
# Number of statements for which questions were generated.
len(questions_from_facts)

15

In [20]:
# Flatten the per-statement question lists into a single list of question/answer pairs.
all_facts_questions = [
    qa_pair
    for fact_entry in questions_from_facts
    for qa_pair in fact_entry['questions']
]

In [21]:
# Total number of questions generated across all facts.
len(all_facts_questions)

35

## Now extract simple factual associations from a given text

In [22]:
# Extract "simple" factual associations from the same biography; the prompt shown
# in the recorded output asks for sentences with short, simple objects.
# NOTE(review): `simple_factual_association_extraction` presumably comes from `llm_access`.
simple_facts = simple_factual_association_extraction(groq_interface, biography[person_to_consider])


Read the text and return a list of all simple factual associations you can extract exclusively from it. Write independent sentences also including the implicit and temporal information. For each factual association, identify the subject, the relation and the object. Break down the information in sentences containing a simple object; do not create sentences with long objects. Only output the JSON format, nothing else before or after: {"sentences":[{"subject":"<subject-1>", "relation":"<relation-1>", "object":"<object-1>"}, ..., {"subject":"<subject-n>", "relation":"<relation-n>", "object":"<object-n>"}]}

Text: "

- Abhay Bhushan Pandey is an Indian computer scientist.
- He made significant contributions to the development of the Internet TCP/IP architecture.
- He is the author of the File Transfer Protocol and early versions of email protocols.
- He graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering.
- He received a 

In [23]:
# Inspect the simple facts: a dict whose 'sentences' entry lists the
# subject / relation / object triples.
simple_facts

{'sentences': [{'subject': 'Abhay Bhushan Pandey',
   'relation': 'is',
   'object': 'an Indian computer scientist'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'made',
   'object': 'significant contributions to the development of the Internet TCP/IP architecture'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'is',
   'object': 'the author of the File Transfer Protocol'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'is',
   'object': 'the author of early versions of email protocols'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'graduated',
   'object': 'from the first batch of Indian Institute of Technology Kanpur in 1965'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'received',
   'object': 'a B.Tech in electrical engineering'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'received',
   'object': 'a Masters in electrical engineering'},
  {'subject': 'Abhay Bhushan Pandey',
   'relation': 'received',
   'object': 'a degree in Man

In [24]:
# Number of simple factual associations extracted.
len(simple_facts['sentences'])

16

## Finally, generate questions for each simple fact

In [25]:
# Same procedure as for the complete facts, applied to the simple facts:
# rebuild each "<subject> <relation> <object>" statement and request questions
# for it. The explicit loop keeps partial results if a call fails midway.
questions_from_simple_facts = []

for simple_triple in simple_facts['sentences']:
    simple_statement = (
        f"{simple_triple['subject']} {simple_triple['relation']} {simple_triple['object']}"
    )
    simple_generated = questions_generation_from_statement(groq_interface, simple_statement)
    questions_from_simple_facts.append(
        {"statement": simple_statement, "questions": simple_generated['questions']}
    )


Generate questions from the simple factual statement. Do not create a generic question. Only output the JSON format, nothing else: {"questions":[{"question:": "<question-1>", "answer": "<answer-1>"}, ..., {"question": "<question-n>", "answer": "<answer-n>"}]}

Statement: "Abhay Bhushan Pandey is an Indian computer scientist"



{"questions":[{"question": "Who is Abhay Bhushan Pandey?", "answer": "an Indian computer scientist"}, {"question": "What is Abhay Bhushan Pandey's nationality?", "answer": "Indian"}, {"question": "What is Abhay Bhushan Pandey's profession?", "answer": "computer scientist"}]}




{'questions': [{'question': 'Who is Abhay Bhushan Pandey?', 'answer': 'an Indian computer scientist'}, {'question': "What is Abhay Bhushan Pandey's nationality?", 'answer': 'Indian'}, {'question': "What is Abhay Bhushan Pandey's profession?", 'answer': 'computer scientist'}], 'generated_text': '{"questions":[{"question": "Who is Abhay Bhushan Pandey?", "answer": "an Indian computer scie

In [26]:
# Inspect the per-simple-fact results: one entry per statement with its generated questions.
questions_from_simple_facts

[{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
  'questions': [{'question': 'Who is Abhay Bhushan Pandey?',
    'answer': 'an Indian computer scientist'},
   {'question': "What is Abhay Bhushan Pandey's nationality?",
    'answer': 'Indian'},
   {'question': "What is Abhay Bhushan Pandey's profession?",
    'answer': 'computer scientist'}]},
 {'statement': 'Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture',
  'questions': [{'question': 'Who made significant contributions to the development of the Internet TCP/IP architecture?',
    'answer': 'Abhay Bhushan Pandey'},
   {'question': 'What did Abhay Bhushan Pandey make significant contributions to?',
    'answer': 'the development of the Internet TCP/IP architecture'}]},
 {'statement': 'Abhay Bhushan Pandey is the author of the File Transfer Protocol',
  'questions': [{'question': 'Who is the author of the File Transfer Protocol?',
    'answer': 'Abhay Bhus

In [27]:
# Number of simple statements for which questions were generated.
len(questions_from_simple_facts)

16

In [28]:
# Flatten the per-statement question lists of the simple facts into a single list.
all_simple_facts_questions = [
    qa_pair
    for simple_entry in questions_from_simple_facts
    for qa_pair in simple_entry['questions']
]

In [29]:
# Total number of questions generated across all simple facts.
len(all_simple_facts_questions)

36

## Save everything

In [30]:
# Persist every artifact produced above in a single pickle file.
# The payload is assembled after the file is opened, as in the original cell.
with open("../../data/extracted_complete_factual_associations_20240619.pkl", "wb") as output_file:
    results_payload = {
        "facts": facts["sentences"],
        "questions": text_questions["questions"],
        "questions_from_facts": questions_from_facts,
        "simple_facts": simple_facts["sentences"],
        "questions_from_simple_facts": questions_from_simple_facts,
    }
    pickle.dump(results_payload, output_file, protocol=pickle.HIGHEST_PROTOCOL)