In [24]:
import os
import json
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from llm_access import *

import pickle

from tqdm.auto import tqdm

In [15]:
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-wide display settings: never truncate long cell text, and render
# up to 500 rows of a frame before pandas abbreviates the output.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

In [3]:
# Input locations. Each can be overridden through an environment variable of
# the same name, so the notebook runs on machines that do not share this
# directory layout; the defaults are the paths used originally.
BIOGRAPHY_DATASET = os.environ.get("BIOGRAPHY_DATASET",
                                   "../../llm_multiagent_debate/biography/article.json")
# JSON file holding the API credentials (only the 'groq' entry is read below).
API_KEYS_FILE = os.environ.get("API_KEYS_FILE", "/work/api_keys_20240427.json")

## Read the Biography dataset, which serves as the base of factual associations

In [6]:
# Load the biography corpus (a mapping from person name to biography text).
# The context manager closes the file handle deterministically, and the
# explicit UTF-8 encoding keeps non-ASCII names readable regardless of the
# platform's locale default.
with open(BIOGRAPHY_DATASET, encoding="utf-8") as biography_file:
    biography = json.load(biography_file)

In [7]:
# Names of everyone with a biography entry, in the order the dataset lists them.
people = list(biography)

In [8]:
# List every person that has a biography entry.
people

['Aaron Sloman',
 'Abhay Bhushan',
 'Adam Dunkels',
 'Adele Goldberg (computer scientist)',
 'Adi Shamir',
 'Adriaan van Wijngaarden',
 'Alan Dix',
 'Alan Edelman',
 'Alan Kay',
 'Alan Mycroft',
 'Alan Perlis',
 'Alberto Ciaramella',
 'Alexander Dewdney',
 'Alexander Stepanov',
 'Alfred Aho',
 'Alston Householder',
 'Amir Pnueli',
 'Anders P. Ravn',
 'Andrew Barto',
 'Andrew Chi-Chih Yao',
 'Andrew Donald Booth',
 'Andrew Herbert',
 'Andrew Koenig (programmer)',
 'Andrew McCallum',
 'Andrew Ng',
 'Andrew S. Tanenbaum',
 'Andrew V. Goldberg',
 'Andrew Viterbi',
 'Andrey Nikolaevich Kolmogorov',
 'Andries van Dam',
 'Anil K. Jain (computer scientist, born 1948)',
 'Anita Borg',
 'Annie Easley',
 'Aravind K. Joshi',
 'Arne Sølvberg',
 'Arvind (computer scientist)',
 'Ashok Goel',
 'Atta ur Rehman Khan',
 'Austin Tate',
 'Avie Tevanian',
 'Avinash Kak',
 'Azriel Rosenfeld',
 'Barbara Engelhardt',
 'Barbara Liskov',
 'Barry Boehm',
 'Beatrice Helen Worsley',
 'Ben Shneiderman',
 'Bernard Ri

In [9]:
# Work with a single biography for this walkthrough; index 1 is
# 'Abhay Bhushan' in the current dataset order.
person_to_consider = people[1]

In [11]:
# Confirm which person was selected.
person_to_consider

'Abhay Bhushan'

In [10]:
# print() shows the multi-line bullet list as readable text instead of an
# escaped single-line repr.
print(biography[person_to_consider])



- Abhay Bhushan Pandey is an Indian computer scientist.
- He made significant contributions to the development of the Internet TCP/IP architecture.
- He is the author of the File Transfer Protocol and early versions of email protocols.
- He graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering.
- He received a Masters in electrical engineering and a degree in Management from the MIT Sloan School of Management.
- He worked on developing FTP and email protocols for ARPANet and subsequent Internet.
- He was a Director at the Institute of Engineering and Rural Technology in Allahabad and a senior manager in Engineering and Development of Xerox.
- He was a co-founder of YieldUP International and Portola Communications.
- He is currently chairman of Asquare Inc., Secretary of Indians for Collective Action and former President of the IIT-Kanpur Foundation.


## Prepare Groq access

In [12]:
# Read only the Groq credential from the local API-keys JSON file. The context
# manager closes the handle as soon as the key has been extracted.
with open(API_KEYS_FILE, encoding="utf-8") as api_keys_file:
    groq_key = json.load(api_keys_file)['groq']

In [13]:
# Build the client that every generation call below receives as its first argument.
# NOTE(review): groq_access and GROQ_LLAMA3_70B_MODEL are not defined in this
# notebook — presumably they come from `from llm_access import *`; confirm.
groq_interface = groq_access(groq_key, GROQ_LLAMA3_70B_MODEL)

## Extract factual associations from a biography

In [25]:
# Run the extraction on the selected biography. The call echoes its prompt to
# the cell output; the result exposes a 'sentences' list of
# subject / relation / object dicts (inspected in the next cell).
facts = factual_association_3_step_extraction(groq_interface, biography[person_to_consider])


Read the text and return a list of all factual associations you can extract only from the text information. Write independent and complete sentences; repeat the main subject to avoid pronouns.

Only output the JSON format, nothing else: {"comments": "<any-comment>", "sentences": ["<sentence-1>", ..., "<sentence-n>"]}

Text: "

- Abhay Bhushan Pandey is an Indian computer scientist.
- He made significant contributions to the development of the Internet TCP/IP architecture.
- He is the author of the File Transfer Protocol and early versions of email protocols.
- He graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering.
- He received a Masters in electrical engineering and a degree in Management from the MIT Sloan School of Management.
- He worked on developing FTP and email protocols for ARPANet and subsequent Internet.
- He was a Director at the Institute of Engineering and Rural Technology in Allahabad and a senior mana

In [26]:
# Inspect the extracted associations (subject / relation / object dicts).
facts['sentences']

[{'subject': 'Abhay Bhushan Pandey',
  'relation': 'is',
  'object': 'an Indian computer scientist'},
 {'subject': 'Abhay Bhushan Pandey',
  'relation': 'made significant contributions to the development of the',
  'object': 'Internet TCP/IP architecture'},
 {'subject': 'Abhay Bhushan Pandey',
  'relation': 'is the author of the',
  'object': 'File Transfer Protocol and early versions of email protocols'},
 {'subject': 'Abhay Bhushan Pandey',
  'relation': 'graduated from the',
  'object': 'Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering'},
 {'subject': 'Abhay Bhushan Pandey',
  'relation': 'received a Masters in electrical engineering and a degree in Management from the',
  'object': 'MIT Sloan School of Management'},
 {'subject': 'Abhay Bhushan Pandey',
  'relation': 'worked on developing FTP and email protocols for',
  'object': 'ARPANet and subsequent Internet'},
 {'subject': 'Abhay Bhushan Pandey',
  'relation': 'was a Director at the',
  'obj

In [27]:
# Number of associations extracted from the biography.
len(facts['sentences'])

13

## Generate questions from the same biography

In [31]:
# Ask the model for question/answer pairs directly from the full biography text.
# The result is a dict whose 'questions' entry lists {'question', 'answer'} dicts.
text_questions = questions_generation(groq_interface, biography[person_to_consider])


Read the text and generate questions following the steps:
1. Extract a list of factual associations from the text, including implicit information and temporal relations.
2. Create a list of questions and answers from the factual associations.
Only output the JSON format, nothing else: {"questions":[{"question": "<question-1>", "answer": "<answer-1>"}, ..., {"question": "<question-n>", "answer": "<answer-n>"}].

Text: "

- Abhay Bhushan Pandey is an Indian computer scientist.
- He made significant contributions to the development of the Internet TCP/IP architecture.
- He is the author of the File Transfer Protocol and early versions of email protocols.
- He graduated from the first batch of Indian Institute of Technology Kanpur in 1965 with a B.Tech in electrical engineering.
- He received a Masters in electrical engineering and a degree in Management from the MIT Sloan School of Management.
- He worked on developing FTP and email protocols for ARPANet and subsequent Internet.
- He was

In [32]:
# Inspect the generated question/answer pairs.
text_questions

{'questions': [{'question': "What is Abhay Bhushan Pandey's profession?",
   'answer': 'Indian computer scientist.'},
  {'question': 'What did Abhay Bhushan Pandey contribute to?',
   'answer': 'development of the Internet TCP/IP architecture.'},
  {'question': 'What protocols did Abhay Bhushan Pandey author?',
   'answer': 'File Transfer Protocol and early versions of email protocols.'},
  {'question': 'Where did Abhay Bhushan Pandey graduate from in 1965?',
   'answer': 'Indian Institute of Technology Kanpur.'},
  {'question': 'What degree did Abhay Bhushan Pandey receive from the MIT Sloan School of Management?',
   'answer': 'Masters in electrical engineering and a degree in Management.'},
  {'question': 'What networks did Abhay Bhushan Pandey work on developing FTP and email protocols for?',
   'answer': 'ARPANet and subsequent Internet.'},
  {'question': 'What positions did Abhay Bhushan Pandey hold at the Institute of Engineering and Rural Technology and Xerox?',
   'answer': 'D

In [33]:
# Number of questions generated from the full biography text.
len(text_questions['questions'])

9

## Generate questions from each extracted factual association

In [34]:
# Per-statement question generation: each extracted association is sent to the
# model on its own, and the resulting questions are grouped by statement.
questions_from_facts = []

# Raw model output for EVERY statement, in the same order as
# questions_from_facts. It is kept under the name `fact_questions` because the
# save cell stores that name as "questions_from_facts_results"; binding it only
# to the current loop iteration would persist just the last statement's result
# (and leave the name undefined when there are no statements at all).
fact_questions = []

for given_fact in facts['sentences']:

    # Flatten the subject / relation / object triple into one plain sentence.
    fact_text = "{} {} {}".format(given_fact['subject'],
                                  given_fact['relation'],
                                  given_fact['object'])

    statement_result = questions_generation_from_statement(groq_interface, fact_text)
    fact_questions.append(statement_result)

    questions_from_facts.append({"statement": fact_text,
                                 "questions": statement_result['questions']})


Generate questions from the simple factual statement. Do not create a generic question. Only output the JSON format, nothing else: {"questions":[{"question:": "<question-1>", "answer": "<answer-1>"}, ..., {"question": "<question-n>", "answer": "<answer-n>"}]}

Statement: "Abhay Bhushan Pandey is an Indian computer scientist"


---------------------
{"questions":[{"question": "Who is Abhay Bhushan Pandey?", "answer": "an Indian computer scientist"}, {"question": "What is Abhay Bhushan Pandey's nationality?", "answer": "Indian"}, {"question": "What is Abhay Bhushan Pandey's profession?", "answer": "computer scientist"}]}
---------------------




---------------------
{"questions":[{"question": "Who is Abhay Bhushan Pandey?", "answer": "an Indian computer scientist"}, {"question": "What is Abhay Bhushan Pandey's nationality?", "answer": "Indian"}, {"question": "What is Abhay Bhushan Pandey's profession?", "answer": "computer scientist"}]}
---------------------



{'questions': [{'questi

In [35]:
# Inspect the questions grouped by the statement they were generated from.
questions_from_facts

[{'statement': 'Abhay Bhushan Pandey is an Indian computer scientist',
  'questions': [{'question': 'Who is Abhay Bhushan Pandey?',
    'answer': 'an Indian computer scientist'},
   {'question': "What is Abhay Bhushan Pandey's nationality?",
    'answer': 'Indian'},
   {'question': "What is Abhay Bhushan Pandey's profession?",
    'answer': 'computer scientist'}]},
 {'statement': 'Abhay Bhushan Pandey made significant contributions to the development of the Internet TCP/IP architecture',
  'questions': [{'question': 'Who made significant contributions to the development of the Internet TCP/IP architecture?',
    'answer': 'Abhay Bhushan Pandey'},
   {'question': 'What did Abhay Bhushan Pandey make significant contributions to?',
    'answer': 'the development of the Internet TCP/IP architecture'}]},
 {'statement': 'Abhay Bhushan Pandey is the author of the File Transfer Protocol and early versions of email protocols',
  'questions': [{'question': 'Who is the author of the File Transfer

In [36]:
# One entry is appended per statement, so this should equal len(facts['sentences']).
len(questions_from_facts)

13

In [37]:
# Flatten the per-statement question lists into a single list of
# question/answer dicts, preserving statement order.
all_facts_questions = [
    qa_pair
    for entry in questions_from_facts
    for qa_pair in entry['questions']
]

In [38]:
# Total number of question/answer pairs across all statements.
len(all_facts_questions)

33

## Save everything

In [39]:
# Persist every artefact produced above in one pickle: the parsed lists plus
# the raw result objects they were taken from.
# NOTE(review): `fact_questions` is whatever the per-fact generation cell left
# bound to that name — confirm it holds the raw results for all statements
# rather than only the last loop iteration.
with open("../../data/extracted_3_step_factual_associations_20240625.pkl", "wb") as output_file:
    saved_artifacts = {
        "facts": facts["sentences"],
        "fact_results": facts,
        "questions": text_questions["questions"],
        "questions_results": text_questions,
        "questions_from_facts": questions_from_facts,
        "questions_from_facts_results": fact_questions,
    }
    pickle.dump(saved_artifacts, output_file, protocol=pickle.HIGHEST_PROTOCOL)