# This notebook is to prepare a dataset for training the ANNABELL model.
The dataset is derived from the SQuAD database.  Each Question and Answer pair was used to prompt a LLM to provide a declarative statement.


In [None]:
import pandas as pd
import re
# Load the datafile into a pandas DataFrame
filepath = "/Volumes/X9 Pro/datasets/declarative_statement_generation_output_gemma3:4b_2025-06-05 07:15:26.tsv"
results_df = pd.read_csv(filepath, sep="\t")
print(results_df.info())
results_df[:10]

## To be useable for training, the ANNABELL model,  the examples need to follow the specific format below:
* Uppercase letters are used only for first letter of proper nouns – e.g. Chris, London, Big Ben
* Questions start with a question mark – e.g. "how old are you"
* Words with a suffix are split in the form base –suffix.  e.g. animals -> animal –s, writing \t *> write \t *ing
    * Apart from the above exceptions the following rules apply:
    * every character must be lowercase
    * No punctuation
    * No Special Characters
    * No Whitespace between lines
    * Lines can be prefixed with # to insert comments
    * If .ph is used, the entire phrase in the exact format must be input

In [None]:
#move the ? from the end of each question to the start
def move_question_mark_to_start(question):
    if question.strip().endswith("?"):
        edited_question = "?" + question[:-1]
    else:
        #raise an exception if the question does not end with a ?
        raise ValueError(f"Question does not end with a question mark: {question}")
    return edited_question

In [None]:
from unidecode import unidecode

def remove_accents(text):
    #Convert accented characters to unaccented ones
    text_unaccented = unidecode(text)
    return text_unaccented

In [None]:
#remove all special characters except question marks and hyphen form the statements
def remove_special_characters(text):
    """
    Removes special characters from a string, keeping alphanumeric characters and spaces.
    """
    # Keep only alphanumeric characters and spaces
    cleaned_text = re.sub(r'[^A-Za-z0-9\s?-]+', '', text)
    return cleaned_text

In [30]:
def filter_by_max_words(the_df, max_words=10):
    #returnes a new dataframe filtered such that each question, answer and statement has less than 11 words

    filtered_df = the_df[the_df.apply(lambda row: len(row["question"].split()) <= max_words and len(row["answer"][0].split()) <= max_words and len(row["statement"].split()) <= max_words , axis=1)]
    return filtered_df

In [31]:
filtered_results_df = filter_by_max_words(results_df, max_words=10)
filtered_results_df["question"] = filtered_results_df["question"].apply(move_question_mark_to_start)
filtered_results_df["statement"] = filtered_results_df["statement"].apply(remove_accents)
filtered_results_df["question"] = filtered_results_df["question"].apply(remove_accents)
filtered_results_df["statement"] = filtered_results_df["statement"].apply(remove_special_characters)
print(filtered_results_df.info())
filtered_results_df

<class 'pandas.core.frame.DataFrame'>
Index: 279 entries, 1 to 800
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         279 non-null    object
 1   title      279 non-null    object
 2   question   279 non-null    object
 3   answer     279 non-null    object
 4   statement  279 non-null    object
dtypes: object(5)
memory usage: 13.1+ KB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_results_df["question"] = filtered_results_df["question"].apply(move_question_mark_to_start)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_results_df["statement"] = filtered_results_df["statement"].apply(remove_accents)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_resul

Unnamed: 0,id,title,question,answer,statement
1,56ce304daab44d1400b8850f,New_York_City,?In what city is the United Nations based,New York,the united nations is based in New York
3,56ce304daab44d1400b88511,New_York_City,?What American city welcomes the largest numbe...,New York,New York welcomes the largest number of legal ...
4,56cf5d41aab44d1400b89130,New_York_City,?The major gateway for immigration has been wh...,New York City,the major gateway for immigration has been New...
6,56ce3124aab44d1400b8852a,New_York_City,?How many boroughs comprise New York City,five,the five borough -s comprise New York City
11,56cf9d81234ae51400d9be1b,New_York_City,?How man boroughs does New York City contain,five,New York City contains five borough -s
...,...,...,...,...,...
790,56d11b4a17492d1400aab993,New_York_City,?What architectural style does the Throgs Neck...,Structural Expressionism,the Throgs Neck Bridge reflects structural exp...
791,56d11b4a17492d1400aab994,New_York_City,?The Queensboro Bridge utilized what type of c...,cantilever,the Queensboro Bridge utilized cantilever cons...
797,56d11e7b17492d1400aab9d3,New_York_City,?How long is Newtown Creek in kilometers,6,Newtown Creek is 6 kilometer -s
799,56d11e7b17492d1400aab9d5,New_York_City,?What notable accidental fossil fuel discharge...,the Greenpoint oil spill,the Greenpoint oil spill occurred at Newtown C...


In [None]:
titles = [item["title"] for item in ds["train"]]
print(len(titles))
print(len(set(titles)))

In [None]:
from collections import Counter
bag_of_titles = Counter((titles))
(bag_of_titles.most_common(20))

In [None]:
answers = chopin_df["answers"]
print(len(answers))
answers[:5]

In [None]:
answers_multi = [answer["text"] for answer in answers.values if len(answer["text"]) > 1]
answers_multi

In [None]:
chopin_short_df = chopin_df[chopin_df.apply(lambda row: len(row["answers"][0]) >10)]
chopin_short_df

In [None]:
chopin_short_df = chopin_df[chopin_df.apply(lambda row: len(row["answers"]["text"][0]) > 10, axis=1)]
chopin_short_df

In [None]:
line = "Who were liberators of oppressed Balkan states?	The Russians	the Russians were liberators of the oppressed Balkan States"

In [None]:
len(line.split('\t'))

In [None]:
"""What do some authors state anthropology developed as the study of?	"other cultures	some authors state that anthropology developed as the study of other cultures""".split("\t")


In [None]:
"""What is a central part of the science of anthropology?	comparative method	the comparative method is a central part of the science of Anthropology.""".split("\t")

In [None]:
"""A past society would be an other culture separated by what temporal aspect?	time	a past society would be an other culture separated by time""".split("\t")

In [None]:
"""What is a central part of the science of anthropology?	comparative method	the comparative method is a central part of the science of Anthropology.
What do some authors state anthropology developed as the study of?	"other cultures	some authors state that anthropology developed as the study of other cultures
A past society would be an other culture separated by what temporal aspect?	time	a past society would be an other culture separated by time""".split("\t")

In [None]:
with open(filepath, "r") as file:
    lines = file.readlines()

In [None]:
len(lines)

In [None]:
from datasets import load_dataset
ds = load_dataset("rajpurkar/squad")
#take the 10 samples from the train set
ds_train = ds["train"].select(range(10))

In [None]:
for example in ds_train:
    print(example["question"] + "\t" + example["answers"]["text"][0])
