# This notebook is to prepare a dataset for training the ANNABELL model.
The dataset is derived from the SQuAD database.  Each Question and Answer pair was used to prompt a LLM to provide a declarative statement.


In [21]:
import pandas as pd
import re
from datasets import load_dataset, load_from_disk
from generate_declarative_sentences import load_squad_dataset

# Load the SQuAD dataset
ds = load_squad_dataset()

# Load the datafile into a pandas DataFrame
train_filepath = "datasets/train/declarative_sentences_train.tsv"
validation_filepath = "datasets/validation/declarative_sentences_validation.tsv"
train_df = pd.read_csv(train_filepath, sep="\t")
validation_df = pd.read_csv(validation_filepath, sep="\t")
print(train_df.info())
print(validation_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 816 non-null    object
 1   title              816 non-null    object
 2   question           816 non-null    object
 3   answer             816 non-null    object
 4   response_question  816 non-null    object
 5   response_answer    816 non-null    object
 6   statement          816 non-null    object
dtypes: object(7)
memory usage: 44.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 0 non-null      object
 1   title              0 non-null      object
 2   question           0 non-null      object
 3   answer             0 non-null      object
 4   response_question  0 non-null      object
 5   r

## To be useable for training, the ANNABELL model,  the examples need to follow the specific format below:
* Uppercase letters are used only for first letter of proper nouns – e.g. Chris, London, Big Ben
* Questions start with a question mark – e.g. "how old are you"
* Words with a suffix are split in the form base –suffix.  e.g. animals -> animal –s, writing \t *> write \t *ing
    * Apart from the above exceptions the following rules apply:
    * every character must be lowercase
    * No punctuation
    * No Special Characters
    * No Whitespace between lines
    * Lines can be prefixed with # to insert comments
    * If .ph is used, the entire phrase in the exact format must be input

In [29]:
#move the ? from the end of each question to the start
def move_question_mark_to_start(question):
    if question.strip().endswith("?"):
        edited_question = "? " + question[:-1]
    else:
        #raise an exception if the question does not end with a ?
        raise ValueError(f"Question does not end with a question mark: {question}")
    return edited_question

In [23]:
def replace_decimal_in_matched_string(matched_string):
    #callable function to support the regex
    number_str = matched_string.group(0)
    return number_str.replace('.', ' point ')

In [24]:
def convert_decimal_point_to_word(a_string):
    import re
    #Replace "." with 'point' if it is part of a number.

    # Regex to find numbers with a decimal point:
    # \d+\.\d+  matches numbers like 1.23
    # \d+\.     matches numbers like 1. (dot at the end after digits)
    # \.\d+     matches numbers like .5 (dot at the beginning before digits)
    # The order matters to match \d+\.\d+ before \d+\. or \.\d+ for overlapping cases.
    pattern = r'\d+\.\d+|\d+\.|\.\d+'
    return re.sub(pattern, replace_decimal_in_matched_string, a_string)

In [25]:
from unidecode import unidecode

def remove_accents(text):
    #Convert accented characters to unaccented ones
    text_unaccented = unidecode(text)
    return text_unaccented

In [26]:
#remove all special characters except question marks and hyphen form the statements
def remove_special_characters(text):
    """
    Removes special characters from a string, keeping alphanumeric characters and spaces.
    """
    # Keep only alphanumeric characters and spaces
    cleaned_text = re.sub(r'[^A-Za-z0-9\s?-]+', '', text)
    return cleaned_text

In [27]:
def filter_by_max_words(the_df, max_words=10):
    #returnes a new dataframe filtered such that each question, answer and statement has less than 11 words

    filtered_df = the_df[the_df.apply(lambda row: len(row["question"].split()) <= max_words and len(row["answer"][0].split()) <= max_words and len(row["statement"].split()) <= max_words , axis=1)]
    return filtered_df

In [28]:
def clean_text(a_series, is_question):
    #takes a dataframe series and applies reformatting
    if is_question:
        a_series = a_series.apply(move_question_mark_to_start)
    for function_name in (convert_decimal_point_to_word, remove_accents, remove_special_characters):
        a_series = a_series.apply(function_name)
    return a_series

In [31]:
filtered_train_df = filter_by_max_words(train_df, max_words=10)
filtered_train_df["response_question"] = clean_text(filtered_train_df["response_question"], True)
filtered_train_df["response_answer"] = clean_text(filtered_train_df["response_answer"], False)
filtered_train_df["statement"] = clean_text(filtered_train_df["statement"], False)
filtered_train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_df["response_question"] = clean_text(filtered_train_df["response_question"], True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_df["response_answer"] = clean_text(filtered_train_df["response_answer"], False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tra

Unnamed: 0,id,title,question,answer,response_question,response_answer,statement
1,56ce304daab44d1400b8850f,New_York_City,In what city is the United Nations based?,New York,? in what city is the united nations based,new york,the united nations is based in New York
3,56ce304daab44d1400b88511,New_York_City,What American city welcomes the largest number...,New York,? What American city welcomes the largest numb...,new york,new york welcomes the largest number of legal ...
6,56ce3124aab44d1400b8852a,New_York_City,How many boroughs comprise New York City?,five,? How many boroughs comprise New York City,five,New York City comprise -s five borough -s
11,56cf9d81234ae51400d9be1b,New_York_City,How man boroughs does New York City contain?,five,? How man boroughs does New York City contain,five,New York City contains five borough -s
17,56ce31baaab44d1400b8853b,New_York_City,What nation founded New Amsterdam?,the Dutch Republic,? What nation founded New Amsterdam,the Dutch Republic,the Dutch Republic founded New Amsterdam
...,...,...,...,...,...,...,...
790,56d11b4a17492d1400aab994,New_York_City,The Queensboro Bridge utilized what type of co...,cantilever,? the Queensboro Bridge utilized what type of ...,cantilever,the Queensboro Bridge utilized a cantilever co...
796,56d11e7b17492d1400aab9d3,New_York_City,How long is Newtown Creek in kilometers?,6,? How long is Newtown Creek in kilometers,6,newtown creek is 6 kilometers long
798,56d11e7b17492d1400aab9d5,New_York_City,What notable accidental fossil fuel discharge ...,the Greenpoint oil spill,? what notable accidental fossil fuel discharg...,the Greenpoint oil spill,the Greenpoint oil spill occurred at Newtown C...
799,56d11eb317492d1400aab9d9,New_York_City,What type of government does New York City have?,mayor-council,? what type of government does New York City have,mayor-council,new york city has a mayor-council government


In [41]:
#write a file that can be used to train ANNABELL
output_filename = "training/nyc_statements.txt"
with open(output_filename, "w") as output_file:
    for statement in filtered_train_df["statement"]:
        output_file.write(statement + "\n")
#check that the file looks correct
with open(output_filename, "r") as input_file:
    lines = input_file.readlines()
print(lines[:5])

In [42]:
with open(output_filename, "r") as input_file:
    lines = input_file.readlines()
print(lines[:5])

['the united nations is based in New York\n', 'new york welcomes the largest number of legal immigrants\n', 'New York City comprise -s five borough -s\n', 'New York City contains five borough -s\n', 'the Dutch Republic founded New Amsterdam\n']


In [4]:
#find the row in the ds which has the id = "56cfdef3234ae51400d9bfc2"
ds[:5]

KeyError: "Invalid key: slice(None, 5, None). Please first select a split. For example: `my_dataset_dictionary['train'][slice(None, 5, None)]`. Available splits: ['train', 'validation']"

In [40]:
type(ds["train"].filter(lambda x: x["id"] == "56cfdef3234ae51400d9bfc2"))

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

datasets.arrow_dataset.Dataset

In [None]:
titles = [item["title"] for item in ds["train"]]
print(len(titles))
print(len(set(titles)))

In [42]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
from collections import Counter
bag_of_titles = Counter((titles))
(bag_of_titles.most_common(20))

In [None]:
answers = chopin_df["answers"]
print(len(answers))
answers[:5]

In [None]:
answers_multi = [answer["text"] for answer in answers.values if len(answer["text"]) > 1]
answers_multi

In [None]:
chopin_short_df = chopin_df[chopin_df.apply(lambda row: len(row["answers"][0]) >10)]
chopin_short_df

In [None]:
chopin_short_df = chopin_df[chopin_df.apply(lambda row: len(row["answers"]["text"][0]) > 10, axis=1)]
chopin_short_df

In [None]:
line = "Who were liberators of oppressed Balkan states?	The Russians	the Russians were liberators of the oppressed Balkan States"

In [None]:
len(line.split('\t'))

In [None]:
"""What do some authors state anthropology developed as the study of?	"other cultures	some authors state that anthropology developed as the study of other cultures""".split("\t")


In [None]:
"""What is a central part of the science of anthropology?	comparative method	the comparative method is a central part of the science of Anthropology.""".split("\t")

In [None]:
"""A past society would be an other culture separated by what temporal aspect?	time	a past society would be an other culture separated by time""".split("\t")

In [None]:
"""What is a central part of the science of anthropology?	comparative method	the comparative method is a central part of the science of Anthropology.
What do some authors state anthropology developed as the study of?	"other cultures	some authors state that anthropology developed as the study of other cultures
A past society would be an other culture separated by what temporal aspect?	time	a past society would be an other culture separated by time""".split("\t")

In [None]:
with open(filepath, "r") as file:
    lines = file.readlines()

In [None]:
len(lines)

In [None]:
from datasets import load_dataset
ds = load_dataset("rajpurkar/squad")
#take the 10 samples from the train set
ds_train = ds["train"].select(range(10))

In [None]:
for example in ds_train:
    print(example["question"] + "\t" + example["answers"]["text"][0])


In [13]:
for item in ds.items():
    print(item)


items = [item for item in ds.items()]
type(items[0])

('train', Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
}))
('validation', Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
}))


tuple

In [14]:
items[0][0]

'train'

In [15]:
items[0][1]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})