# This notebook prepares a dataset for training the ANNABELL model.
The data is derived from SQuAD (the Stanford Question Answering Dataset). Each question-and-answer pair was used to prompt an LLM to produce a declarative statement.


In [8]:
import pandas as pd
# Load the LLM-generated declarative statements (tab-separated) into a pandas DataFrame.
# NOTE(review): this is an absolute path on a local external volume; point it at
# your own copy of the file before running.
filepath = "/Volumes/X9 Pro/datasets/declarative_statement_generation_output_gemma3:4b_2025-06-05 07:15:26.tsv"
results_df = pd.read_csv(filepath, sep="\t")
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
results_df.info()
# Preview the first 10 rows.
results_df[:10]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 817 entries, 0 to 816
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         817 non-null    object
 1   title      817 non-null    object
 2   question   817 non-null    object
 3   answer     817 non-null    object
 4   statement  817 non-null    object
dtypes: object(5)
memory usage: 32.0+ KB
None


Unnamed: 0,id,title,question,answer,statement
0,56ce304daab44d1400b8850e,New_York_City,What city in the United States has the highest...,New York,New York is the city in the United States with...
1,56ce304daab44d1400b8850f,New_York_City,In what city is the United Nations based?,New York,the united nations is based in New York
2,56ce304daab44d1400b88510,New_York_City,What city has been called the cultural capital...,New York,New York has been called the cultural capital ...
3,56ce304daab44d1400b88511,New_York_City,What American city welcomes the largest number...,New York,New York welcomes the largest number of legal ...
4,56cf5d41aab44d1400b89130,New_York_City,The major gateway for immigration has been whi...,New York City,the major gateway for immigration has been New...
5,56cf5d41aab44d1400b89131,New_York_City,The most populated city in the United States i...,New York City,the most populated city in the United States i...
6,56ce3124aab44d1400b8852a,New_York_City,How many boroughs comprise New York City?,five,the five borough -s comprise New York City
7,56ce3124aab44d1400b8852b,New_York_City,In what year were the five boroughs combined i...,1898,the five borough -s were combined into one cit...
8,56ce3124aab44d1400b8852c,New_York_City,"In 2014, what did the census estimate the popu...",8491079,the population of New York City was estimated ...
9,56ce3124aab44d1400b8852d,New_York_City,What is the size of New York City in square mi...,305,New York City is 305 square mile -s


## To be usable for training the ANNABELL model, the examples need to follow the specific format below:
* Uppercase letters are used only for first letter of proper nouns – e.g. Chris, London, Big Ben
* Questions start with a question mark – e.g. "?how old are you"
* Words with a suffix are split in the form base -suffix, e.g. animals -> animal -s, writing -> write -ing
    * Apart from the above exceptions the following rules apply:
    * every character must be lowercase
    * No punctuation
    * No Special Characters
    * No Whitespace between lines
    * Lines can be prefixed with # to insert comments
    * If .ph is used, the entire phrase in the exact format must be input

In [9]:
# Filter the data to keep only rows whose question, answer and statement each
# contain fewer than 11 whitespace-separated words.
# The answer column holds plain strings, so it is split directly; indexing it
# with [0] first would count the words of its first character only, which
# makes the answer-length check pass for every row.
filtered_results_df = results_df[
    results_df.apply(
        lambda row: all(
            len(row[column].split()) < 11
            for column in ("question", "answer", "statement")
        ),
        axis=1,
    )
]
filtered_results_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 279 entries, 1 to 800
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         279 non-null    object
 1   title      279 non-null    object
 2   question   279 non-null    object
 3   answer     279 non-null    object
 4   statement  279 non-null    object
dtypes: object(5)
memory usage: 13.1+ KB


In [10]:
# Move the "?" from the end of each question to the start.
def move_question_mark_to_start(question):
    """Return ``question`` with its trailing "?" moved to the front.

    Trailing whitespace is removed before the "?" is dropped, so a question
    such as "Why? " becomes "?Why" rather than "?Why?". Leading whitespace is
    left as it is.

    Args:
        question: The question text; it must end with "?" once trailing
            whitespace is ignored.

    Returns:
        The text prefixed with "?" and without its trailing "?".

    Raises:
        ValueError: If the question does not end with a question mark.
    """
    trimmed_question = question.rstrip()
    if not trimmed_question.endswith("?"):
        # Raise an exception if the question does not end with a "?".
        raise ValueError(f"Question does not end with a question mark: {question}")
    return "?" + trimmed_question[:-1]

# Build the edited questions from the filtered rows only.
# The result is deliberately NOT written back into results_df["question"]:
# that assignment aligns on the index, so every row absent from
# filtered_results_df would be overwritten with NaN, and re-running the
# filtering cell on results_df would then fail.
edited_questions_results_df = filtered_results_df["question"].apply(move_question_mark_to_start)
# Preview the first five edited questions.
edited_questions_results_df[:5]

1             ?In what city is the United Nations based
3     ?What American city welcomes the largest numbe...
4     ?The major gateway for immigration has been wh...
6             ?How many boroughs comprise New York City
11         ?How man boroughs does New York City contain
Name: question, dtype: object

In [15]:
# Collect the article title of every example in the train split.
# NOTE(review): relies on `ds`, which is loaded in a later cell of this notebook.
titles = list(map(lambda item: item["title"], ds["train"]))
# First line: total number of titles; second line: number of distinct titles.
print(len(titles), len(set(titles)), sep="\n")

87599
442


In [14]:
from collections import Counter

# Tally how often each title occurs, then show the 20 most frequent titles
# together with their counts.
bag_of_titles = Counter()
bag_of_titles.update(titles)
bag_of_titles.most_common(20)

[('New_York_City', 817),
 ('American_Idol', 802),
 ('Beyoncé', 758),
 ('Frédéric_Chopin', 697),
 ('Queen_Victoria', 680),
 ('Buddhism', 610),
 ('New_Haven,_Connecticut', 602),
 ('2008_Sichuan_earthquake', 521),
 ('2008_Summer_Olympics_torch_relay', 500),
 ('Muammar_Gaddafi', 489),
 ('Hellenistic_period', 469),
 ('Napoleon', 458),
 ('Middle_Ages', 452),
 ('Modern_history', 448),
 ('Portugal', 435),
 ('Gamal_Abdel_Nasser', 433),
 ('Dwight_D._Eisenhower', 430),
 ('Kanye_West', 428),
 ('Southampton', 426),
 ('The_Blitz', 414)]

In [None]:
# Pull out the "answers" column, print how many entries it has, and preview the first five.
# NOTE(review): chopin_df is not defined in the visible cells — presumably the
# rows of a single article; confirm where it is created.
answers = chopin_df["answers"]
print(len(answers))
answers[:5]

In [None]:
# Keep the "text" value of every answers entry whose "text" has more than one element.
answers_multi = [
    texts
    for texts in (entry["text"] for entry in answers.values)
    if len(texts) > 1
]
answers_multi

In [None]:
# Select the rows whose first answer text has a length greater than 10.
# axis=1 makes apply() hand each row to the lambda (by default it hands over
# whole columns, for which "answers" is not a valid label), and the first
# answer is read from the "text" entry of the answers mapping rather than
# from key 0.
# NOTE(review): the name says "short" but the condition keeps the longer
# answers — confirm the intended direction of the comparison.
chopin_short_df = chopin_df[chopin_df.apply(lambda row: len(row["answers"]["text"][0]) > 10, axis=1)]
chopin_short_df

In [None]:
# Row-wise boolean mask: True where the first answer text has a length above 10.
long_first_answer_mask = chopin_df.apply(
    lambda record: len(record["answers"]["text"][0]) > 10,
    axis=1,
)
# Keep only the rows selected by the mask and display them.
chopin_short_df = chopin_df[long_first_answer_mask]
chopin_short_df

In [None]:
# Sample record: question, answer and statement joined by tab characters.
line = "\t".join(
    [
        "Who were liberators of oppressed Balkan states?",
        "The Russians",
        "the Russians were liberators of the oppressed Balkan States",
    ]
)

In [None]:
# Number of tab-separated fields in the record: one more than the number of tabs.
line.count('\t') + 1

In [63]:
# Split a sample record on tab characters.
# Note that the answer field of this record starts with an unmatched double quote.
"""What do some authors state anthropology developed as the study of?	"other cultures	some authors state that anthropology developed as the study of other cultures""".split("\t")


['What do some authors state anthropology developed as the study of?',
 '"other cultures',
 'some authors state that anthropology developed as the study of other cultures']

In [64]:
# Split a sample record (question, answer, statement) on tab characters.
"""What is a central part of the science of anthropology?	comparative method	the comparative method is a central part of the science of Anthropology.""".split("\t")

['What is a central part of the science of anthropology?',
 'comparative method',
 'the comparative method is a central part of the science of Anthropology.']

In [65]:
# Split another sample record (question, answer, statement) on tab characters.
"""A past society would be an other culture separated by what temporal aspect?	time	a past society would be an other culture separated by time""".split("\t")

['A past society would be an other culture separated by what temporal aspect?',
 'time',
 'a past society would be an other culture separated by time']

In [66]:
# Split three newline-separated records on tab characters only.
# Because the newlines are not split on, the last field of one record and the
# first field of the next end up in the same list element.
"""What is a central part of the science of anthropology?	comparative method	the comparative method is a central part of the science of Anthropology.
What do some authors state anthropology developed as the study of?	"other cultures	some authors state that anthropology developed as the study of other cultures
A past society would be an other culture separated by what temporal aspect?	time	a past society would be an other culture separated by time""".split("\t")

['What is a central part of the science of anthropology?',
 'comparative method',
 'the comparative method is a central part of the science of Anthropology.\nWhat do some authors state anthropology developed as the study of?',
 '"other cultures',
 'some authors state that anthropology developed as the study of other cultures\nA past society would be an other culture separated by what temporal aspect?',
 'time',
 'a past society would be an other culture separated by time']

In [97]:
# Read the raw file into a list of lines (each line keeps its line ending).
# The encoding is passed explicitly so non-ASCII text is decoded the same way
# on every platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(filepath, "r", encoding="utf-8") as file:
    lines = file.readlines()

In [98]:
# Number of lines read from the file.
len(lines)

87624

In [None]:
from datasets import load_dataset
# Load the dataset identified by "rajpurkar/squad".
ds = load_dataset("rajpurkar/squad")
# Take the first 10 samples (indices 0-9) from the train set.
ds_train = ds["train"].select(range(10))

In [7]:
# Print each sampled question followed by its first listed answer text,
# separated by a tab character.
for example in ds_train:
    print(example["question"], example["answers"]["text"][0], sep="\t")


To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?	Saint Bernadette Soubirous
What is in front of the Notre Dame Main Building?	a copper statue of Christ
The Basilica of the Sacred heart at Notre Dame is beside to which structure?	the Main Building
What is the Grotto at Notre Dame?	a Marian place of prayer and reflection
What sits on top of the Main Building at Notre Dame?	a golden statue of the Virgin Mary
When did the Scholastic Magazine of Notre dame begin publishing?	September 1876
How often is Notre Dame's the Juggler published?	twice
What is the daily student paper at Notre Dame called?	The Observer
How many student news papers are found at Notre Dame?	three
In what year did the student paper Common Sense begin publication at Notre Dame?	1987
