In [1]:
import os
import json
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from llm_access import *

In [2]:
import pickle

from tqdm.auto import tqdm

In [3]:
# Input locations. Each default can be overridden through an environment
# variable of the same name, so the notebook can run on another machine
# without editing this cell.
BIOGRAPHY_DATASET = os.environ.get(
    "BIOGRAPHY_DATASET",
    "../../llm_multiagent_debate/biography/article.json",
)
# JSON file holding API keys; the notebook reads its 'groq' entry.
API_KEYS_FILE = os.environ.get("API_KEYS_FILE", "/work/api_keys_20240427.json")

## Read the Biography dataset, which serves as the base of factual associations

In [4]:
# Load the biography dataset; later cells index it by person name.
# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving an open handle around for the garbage collector.
with open(BIOGRAPHY_DATASET) as biography_file:
    biography = json.load(biography_file)

In [5]:
# Inspect one raw entry: the text stored under the "Aaron Sloman" key.
biography["Aaron Sloman"]

"- Aaron Sloman is a philosopher and researcher on artificial intelligence and cognitive science\n- He held the Chair in Artificial Intelligence and Cognitive Science at the School of Computer Science at the University of Birmingham and previously at the University of Sussex\n- Sloman has published widely on philosophy of mathematics, epistemology, cognitive science, and artificial intelligence and collaborated with biologist Jackie Chappell on the evolution of intelligence\n- He was born in Southern Rhodesia (now Zimbabwe) to Lithuanian Jewish parents, and went to school in Cape Town before earning a degree in Mathematics and Physics at the University of Cape Town and a DPhil in philosophy at the University of Oxford\n- Sloman's philosophical ideas were influenced by Immanuel Kant, Gottlob Frege, Karl Popper and others, and his work in AI by Marvin Minsky and John McCarthy\n- He is a Fellow of several AI and philosophy associations and received the K. Jon Barwise Prize for contributio

## Prepare Groq access

In [6]:
# Read only the 'groq' entry from the API keys file. The `with` block closes
# the file handle right after parsing rather than leaking it.
with open(API_KEYS_FILE) as api_keys_file:
    groq_key = json.load(api_keys_file)['groq']

In [7]:
# Build the object that every extraction/generation call in this notebook
# receives as its first argument.
# NOTE(review): groq_access and GROQ_LLAMA3_70B_MODEL presumably come from the
# `llm_access` star import — confirm.
groq_interface = groq_access(groq_key, GROQ_LLAMA3_70B_MODEL)

## Extract factual associations from a biography

In [21]:
# Extract subject/relation/object associations from one biography.
# The result is a dict whose 'sentences' list holds the triples (see the
# displayed value in the next cell).
facts = factual_association_extraction(groq_interface, biography["Aaron Sloman"])


Read the text and return a list of all factual associations you can extract exclusively from it. Write sentences which are self contained and includes the maximum information provided, including the implicit ones and temporal information. For each factual association, identify the subject, the relation and the object. Only output the JSON format, nothing else: {"sentences":[{"subject":"<subject-1>", "relation":"<relation-1>", "object":"object-1"}, ..., {"subject":"<subject-n>", "relation":"<relation-n>", "object":"object-n"}]}

Text: "- Aaron Sloman is a philosopher and researcher on artificial intelligence and cognitive science
- He held the Chair in Artificial Intelligence and Cognitive Science at the School of Computer Science at the University of Birmingham and previously at the University of Sussex
- Sloman has published widely on philosophy of mathematics, epistemology, cognitive science, and artificial intelligence and collaborated with biologist Jackie Chappell on the evolutio

In [22]:
# Display the extracted associations.
facts

{'sentences': [{'subject': 'Aaron Sloman',
   'relation': 'is',
   'object': 'a philosopher and researcher on artificial intelligence and cognitive science'},
  {'subject': 'Aaron Sloman',
   'relation': 'held',
   'object': 'the Chair in Artificial Intelligence and Cognitive Science at the School of Computer Science at the University of Birmingham'},
  {'subject': 'Aaron Sloman',
   'relation': 'held',
   'object': 'the Chair in Artificial Intelligence and Cognitive Science at the University of Sussex'},
  {'subject': 'Aaron Sloman',
   'relation': 'published',
   'object': 'widely on philosophy of mathematics, epistemology, cognitive science, and artificial intelligence'},
  {'subject': 'Aaron Sloman',
   'relation': 'collaborated with',
   'object': 'biologist Jackie Chappell on the evolution of intelligence'},
  {'subject': 'Aaron Sloman',
   'relation': 'was born in',
   'object': 'Southern Rhodesia (now Zimbabwe)'},
  {'subject': "Aaron Sloman's parents",
   'relation': 'were',
 

In [23]:
# Number of associations extracted from the biography.
len(facts['sentences'])

16

## Generate questions from the same biography

In [24]:
# Generate question/answer pairs directly from the full biography text.
# The result is a dict whose 'questions' list holds {'question', 'answer'} items
# (see the displayed value in the next cell).
questions = questions_generation(groq_interface, biography["Aaron Sloman"])


Read the text and generate questions following the steps:
1. Extract a list of factual associations from the text, including implicit information and temporal relations.
2. Create a list of questions and answers from the factual associations.
Only output the JSON format, nothing else: {"questions":[{"question": "<question-1>", "answer": "<answer-1>"}, ..., {"question": "<question-n>", "answer": "<answer-n>"}].

Text: "- Aaron Sloman is a philosopher and researcher on artificial intelligence and cognitive science
- He held the Chair in Artificial Intelligence and Cognitive Science at the School of Computer Science at the University of Birmingham and previously at the University of Sussex
- Sloman has published widely on philosophy of mathematics, epistemology, cognitive science, and artificial intelligence and collaborated with biologist Jackie Chappell on the evolution of intelligence
- He was born in Southern Rhodesia (now Zimbabwe) to Lithuanian Jewish parents, and went to school in

In [25]:
# Display the questions generated from the whole biography.
questions

{'questions': [{'question': "What is Aaron Sloman's profession?",
   'answer': 'philosopher and researcher on artificial intelligence and cognitive science'},
  {'question': 'What position did Aaron Sloman hold at the University of Birmingham?',
   'answer': 'Chair in Artificial Intelligence and Cognitive Science'},
  {'question': 'What subjects has Aaron Sloman published widely on?',
   'answer': 'philosophy of mathematics, epistemology, cognitive science, and artificial intelligence'},
  {'question': 'Who did Aaron Sloman collaborate with on the evolution of intelligence?',
   'answer': 'biologist Jackie Chappell'},
  {'question': 'Where was Aaron Sloman born?',
   'answer': 'Southern Rhodesia (now Zimbabwe)'},
  {'question': "What was Aaron Sloman's parents' ethnicity?",
   'answer': 'Lithuanian Jewish'},
  {'question': 'Where did Aaron Sloman go to school?', 'answer': 'Cape Town'},
  {'question': 'What degree did Aaron Sloman earn at the University of Cape Town?',
   'answer': 'Mat

In [27]:
# Number of questions generated from the whole biography.
len(questions['questions'])

13

## Generate questions from each factual association extracted

In [28]:
# For every extracted association, rebuild a one-sentence statement from the
# triple and request questions about that statement alone.
questions_from_facts = []

for given_fact in facts['sentences']:
    fact_text = f"{given_fact['subject']} {given_fact['relation']} {given_fact['object']}"
    fact_questions = questions_generation_from_statement(groq_interface, fact_text)

    # Keep the statement next to its questions so each pair stays traceable.
    record = {"statement": fact_text, "questions": fact_questions['questions']}
    questions_from_facts.append(record)


Generate questions from the simple factual statement. Do not repeat the question only changing few words. Only output the JSON format,  nothing else: {"questions":[{"question:": "<question-1>", "answer": "<answer-1>"}, ..., {"question": "<question-n>", "answer": "<answer-n>"}]}

Statement: "Aaron Sloman is a philosopher and researcher on artificial intelligence and cognitive science"



{"questions":[{"question": "Who is a philosopher and researcher on artificial intelligence and cognitive science?", "answer": "Aaron Sloman"}, {"question": "What is Aaron Sloman?", "answer": "a philosopher and researcher on artificial intelligence and cognitive science"}]}




{'questions': [{'question': 'Who is a philosopher and researcher on artificial intelligence and cognitive science?', 'answer': 'Aaron Sloman'}, {'question': 'What is Aaron Sloman?', 'answer': 'a philosopher and researcher on artificial intelligence and cognitive science'}], 'generated_text': '{"questions":[{"question": "Who is a 

In [29]:
# Display each statement together with the questions generated for it.
questions_from_facts

[{'statement': 'Aaron Sloman is a philosopher and researcher on artificial intelligence and cognitive science',
  'questions': [{'question': 'Who is a philosopher and researcher on artificial intelligence and cognitive science?',
    'answer': 'Aaron Sloman'},
   {'question': 'What is Aaron Sloman?',
    'answer': 'a philosopher and researcher on artificial intelligence and cognitive science'}]},
 {'statement': 'Aaron Sloman held the Chair in Artificial Intelligence and Cognitive Science at the School of Computer Science at the University of Birmingham',
  'questions': [{'question': 'Who held the Chair in Artificial Intelligence and Cognitive Science?',
    'answer': 'Aaron Sloman'},
   {'question': "What was Aaron Sloman's position at the University of Birmingham?",
    'answer': 'Chair in Artificial Intelligence and Cognitive Science'},
   {'question': 'At which university did Aaron Sloman hold the Chair in Artificial Intelligence and Cognitive Science?',
    'answer': 'University of

In [30]:
# Flatten the per-statement question lists into one list.
# The iteration variable must not be called `questions`: notebook cells share
# one namespace, so a `for questions in ...` loop would rebind the `questions`
# dict returned by questions_generation, which the save cell still reads.
# A comprehension keeps its iteration variables out of the notebook namespace.
all_facts_questions = [
    qa_pair
    for fact_entry in questions_from_facts
    for qa_pair in fact_entry['questions']
]

In [31]:
# Total number of questions generated across all associations.
len(all_facts_questions)

42

## Now extract simple factual associations from a given text

In [8]:
# Extract "simple" associations from the same biography. Per the prompt echoed
# in the output, each sentence is meant to carry a single object.
# The result is a dict whose 'sentences' list holds the triples.
simple_facts = simple_factual_association_extraction(groq_interface, biography["Aaron Sloman"])


Read the text and return a list of all simple factual associations you can extract exclusively from it. Write independent sentences also including the implicit and temporal information. Break down the information in sentences containing a single object. For each factual association, identify the subject, the relation and the object. Only output the JSON format, nothing else before or after: {"sentences":[{"subject":"<subject-1>", "relation":"<relation-1>", "object":"<object-1>"}, ..., {"subject":"<subject-n>", "relation":"<relation-n>", "object":"<object-n>"}]}

Text: "- Aaron Sloman is a philosopher and researcher on artificial intelligence and cognitive science
- He held the Chair in Artificial Intelligence and Cognitive Science at the School of Computer Science at the University of Birmingham and previously at the University of Sussex
- Sloman has published widely on philosophy of mathematics, epistemology, cognitive science, and artificial intelligence and collaborated with biolog

In [9]:
# Display the extracted simple associations.
simple_facts

{'sentences': [{'subject': 'Aaron Sloman',
   'relation': 'is',
   'object': 'a philosopher'},
  {'subject': 'Aaron Sloman', 'relation': 'is', 'object': 'a researcher'},
  {'subject': 'Aaron Sloman',
   'relation': 'held',
   'object': 'the Chair in Artificial Intelligence and Cognitive Science'},
  {'subject': 'the Chair in Artificial Intelligence and Cognitive Science',
   'relation': 'is at',
   'object': 'the School of Computer Science at the University of Birmingham'},
  {'subject': 'the Chair in Artificial Intelligence and Cognitive Science',
   'relation': 'was at',
   'object': 'the University of Sussex'},
  {'subject': 'Aaron Sloman',
   'relation': 'published',
   'object': 'on philosophy of mathematics'},
  {'subject': 'Aaron Sloman',
   'relation': 'published',
   'object': 'on epistemology'},
  {'subject': 'Aaron Sloman',
   'relation': 'published',
   'object': 'on cognitive science'},
  {'subject': 'Aaron Sloman',
   'relation': 'published',
   'object': 'on artificial i

In [11]:
# Number of simple associations extracted from the biography.
len(simple_facts['sentences'])

31

## Finally, generate questions for each simple fact

In [16]:
# Same procedure as for the complete associations, applied to the simple ones:
# turn each triple into a statement and request questions for that statement.
questions_from_simple_facts = []

for given_fact in simple_facts['sentences']:
    fact_text = f"{given_fact['subject']} {given_fact['relation']} {given_fact['object']}"
    fact_questions = questions_generation_from_statement(groq_interface, fact_text)

    # Keep the statement next to its questions so each pair stays traceable.
    record = {"statement": fact_text, "questions": fact_questions['questions']}
    questions_from_simple_facts.append(record)


Generate questions from the simple factual statement. Do not repeat the question only changing few words. Only output the JSON format,  nothing else: {"questions":[{"question:": "<question-1>", "answer": "<answer-1>"}, ..., {"question": "<question-n>", "answer": "<answer-n>"}]}

Statement: "Aaron Sloman is a philosopher"



{"questions":[{"question": "Who is a philosopher?", "answer": "Aaron Sloman"}, {"question": "What is Aaron Sloman?", "answer": "a philosopher"}]}




{'questions': [{'question': 'Who is a philosopher?', 'answer': 'Aaron Sloman'}, {'question': 'What is Aaron Sloman?', 'answer': 'a philosopher'}], 'generated_text': '{"questions":[{"question": "Who is a philosopher?", "answer": "Aaron Sloman"}, {"question": "What is Aaron Sloman?", "answer": "a philosopher"}]}', 'prompt_tokens': 86, 'completion_tokens': 37, 'total_tokens': 123, 'total_time': 0.14380732899999998}

Generate questions from the simple factual statement. Do not repeat the question only changing few words. 

In [17]:
# Display each simple statement together with the questions generated for it.
questions_from_simple_facts

[{'statement': 'Aaron Sloman is a philosopher',
  'questions': [{'question': 'Who is a philosopher?',
    'answer': 'Aaron Sloman'},
   {'question': 'What is Aaron Sloman?', 'answer': 'a philosopher'}]},
 {'statement': 'Aaron Sloman is a researcher',
  'questions': [{'question': 'Who is a researcher?', 'answer': 'Aaron Sloman'},
   {'question': 'What is Aaron Sloman?', 'answer': 'a researcher'}]},
 {'statement': 'Aaron Sloman held the Chair in Artificial Intelligence and Cognitive Science',
  'questions': [{'question': 'Who held the Chair in Artificial Intelligence and Cognitive Science?',
    'answer': 'Aaron Sloman'},
   {'question': 'What position did Aaron Sloman hold?',
    'answer': 'the Chair in Artificial Intelligence and Cognitive Science'}]},
 {'statement': 'the Chair in Artificial Intelligence and Cognitive Science is at the School of Computer Science at the University of Birmingham',
  'questions': [{'question': 'Where is the Chair in Artificial Intelligence and Cognitive S

In [19]:
# Flatten the per-statement question lists into one list.
# The iteration variable must not be called `questions`: notebook cells share
# one namespace, so a `for questions in ...` loop would rebind the `questions`
# dict returned by questions_generation, which the save cell still reads.
# A comprehension keeps its iteration variables out of the notebook namespace.
all_simple_facts_questions = [
    qa_pair
    for fact_entry in questions_from_simple_facts
    for qa_pair in fact_entry['questions']
]

In [20]:
# Total number of questions generated across all simple associations.
len(all_simple_facts_questions)

73

## Save everything

In [33]:
# Persist every artefact produced in this notebook as a single pickle.
# NOTE(review): `questions` must still be the dict returned by
# questions_generation at this point — verify that no earlier cell has rebound
# the name before trusting the saved "questions" entry.
with open("../../data/extracted_complete_factual_associations_20240618.pkl", "wb") as output_file:
    payload = {
        "facts": facts["sentences"],
        "questions": questions["questions"],
        "questions_from_facts": questions_from_facts,
        "simple_facts": simple_facts["sentences"],
        "questions_from_simple_facts": questions_from_simple_facts,
    }
    pickle.dump(payload, output_file, protocol=pickle.HIGHEST_PROTOCOL)