# This notebook creates the datasets for pretraining, training and testing the ANNABELL model using the NYC dataset derived from SQuAD.

In [1]:
# load the base dataset and select a random sample of rows for pretraining
import pandas as pd
import os
import platform
import datetime

from scipy.ndimage import maximum

from dataset_processing import create_list_of_commands, write_training_file, write_testing_file

experiment_number = "9"
number_of_pretraining_samples = 100
use_manual_pretraining_data = False
maximum_number_of_words = 20

operating_system = platform.system()
if operating_system == 'Windows':
    raise Exception("not used on Windows yet")
elif operating_system == 'Linux':
    base_directory = "/home/chris/gdrive/work/annabell/"
elif operating_system == 'Darwin':  #macOS
    base_directory = "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/"
else:
    raise Exception("unsupported OS")

timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")
data_directory = os.path.join(base_directory, "experiments/data")
dataset_filename = "response_formatted_20250924_174653.jsonl"
dataset_filepath = os.path.join(data_directory, dataset_filename)
pretraining_directory = os.path.join(base_directory, "experiments", experiment_number, "pre_training")
pretraining_filename = "nyc_squad_pretraining_commands" + timestamp + ".txt"
pretraining_filepath = os.path.join(pretraining_directory, pretraining_filename)
training_directory = os.path.join(base_directory, "experiments", experiment_number, "training")
training_filename = "nyc_squad_training_commands" + timestamp + ".txt"
training_filepath = os.path.join(training_directory, training_filename)
testing_directory = os.path.join(base_directory, "experiments", experiment_number, "testing")
testing_filename = "nyc_squad_testing_commands" + timestamp + ".txt"
testing_filepath = os.path.join(testing_directory, testing_filename)
pretraining_validation_testing_filename = "nyc_squad_pretraining_validation_testing_commands.txt"
pretraining_validation_testing_filepath = os.path.join(testing_directory, pretraining_validation_testing_filename)
dataframe_filename = "nyc_squad_with_pretraining_commands" + timestamp + ".jsonl"
dataframe_filepath = os.path.join(data_directory, dataframe_filename)
nyc_squad_df = pd.read_json(dataset_filepath, lines=True)
if not use_manual_pretraining_data:
    nyc_squad_df["is_pretraining"] = False
    #take a random sample of rows and set is_pretraining to True for those rows
    nyc_squad_df.loc[
        nyc_squad_df.sample(n=number_of_pretraining_samples, random_state=42).index, "is_pretraining"] = True
#remove any rows where the response_declarative_sentence_formatted or response_question_formatted has greater than 20 words
nyc_squad_df = nyc_squad_df[
    nyc_squad_df["response_declarative_sentence_formatted"].str.split().str.len() <= maximum_number_of_words]
nyc_squad_df = nyc_squad_df[
    nyc_squad_df["response_question_formatted"].str.split().str.len() <= maximum_number_of_words]
nyc_squad_df

Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining
0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United States has the high ...,New York,the city in the United States with the high -e...,False
1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United Nations base -d,New York,the United Nations is base -d in New York,False
2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,False
3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New York,New York is the American city that welcome -s ...,False
4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New York City,the major gateway for immigrate -ion has been ...,False
...,...,...,...,...,...,...,...,...,...,...
804,56d11faa17492d1400aab9ed,What branch of government are the administrati...,What branch of government are the administrati...,executive,The administrative court –s are a part of the ...,executive,? what branch of government are the administra...,executive,the administrative court -s are a part of the ...,False
806,56d1204617492d1400aab9fa,What federal district court has its main court...,What federal district court has its main court...,the Eastern District of New York,The Eastern District of New York federal distr...,the Eastern District of New York,? what federal district court has its main cou...,the Eastern District of New York,the Eastern District of New York federal distr...,False
807,56d1204617492d1400aab9fb,What square is home to the US Court of Interna...,What square is home to the US Court of Interna...,Foley Square,Foley Square is home to the US Court of Intern...,Foley Square,? what square is home to the US Court of Inter...,Foley Square,Foley Square is home to the US Court of Intern...,False
809,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New York City Hall found,Manhattan,the New York City Hall is found in the borough...,False


### create the pretraining data
Select the rows where the pretraining is true, generate a set of commands for each row and save to a file.

In [2]:
#add a new column to the dataframe with the created list of commands
nyc_squad_df["created_commands"] = nyc_squad_df.apply(create_list_of_commands, axis=1)
nyc_squad_training_df = nyc_squad_df[nyc_squad_df["is_pretraining"] == False]
nyc_squad_pretraining_df = nyc_squad_df[nyc_squad_df["is_pretraining"] == True]
#save the final dataframe as a JSON lines file
nyc_squad_df.to_json(dataframe_filename, orient="records",
                     lines=True)

In [3]:
with open(pretraining_filepath, "w") as commands_file:
    for index, row in nyc_squad_pretraining_df.iterrows():
        commands = row["created_commands"]
        for command in commands:
            commands_file.write(command + "\n")
print(f"Wrote {pretraining_filepath}")

with open(pretraining_filepath, "r") as commands_file:
    lines = commands_file.readlines()
number_of_reward_lines = sum(1 for line in lines if line.startswith(".rw"))
print(f"Number of reward lines: {number_of_reward_lines}")
print(f"Number of commands: {len(lines)}")
for line in lines[:20]:
    print(line.strip())

Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/9/pre_training/nyc_squad_pretraining_commands_20250929_075818.txt
Number of reward lines: 90
Number of commands: 1114
# ID: 56cf9df0234ae51400d9be27
the English occupy -ed New York City begin -ning in 1664
? the English occupy -ed New York City begin -ning on what date
.wg English
.wg occupy
.wg New
.wg York
.wg City
.wg begin
.wg date
.ph the English occupy -ed New York City begin -ning in 1664
.wg 1664
.rw
# ID: 56ce3348aab44d1400b88560
Staten Island was once a part of Long Island
? what island was once a part of Long Island
.wg island
.wg part
.wg Long
.wg Island


### create the training data
Select the rows where the pretraining is false, generate a set of commands for each row and save to a file.

In [4]:
#combine 2 columns together into a list of tuples
list_of_training_tuples = list(
    zip(nyc_squad_training_df["id"], nyc_squad_training_df["response_declarative_sentence_formatted"]))

write_training_file(list_of_training_tuples, training_filepath)
print(f"Wrote {training_filepath}")

with open(training_filepath, "r") as commands_file:
    lines = commands_file.readlines()
    print(f"Number of commands: {len(lines)}")
    print("First 100 commands:")
    for line in lines[:20]:
        print(line.strip())

file created: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/9/training/nyc_squad_training_commands_20250929_075818.txt
Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/9/training/nyc_squad_training_commands_20250929_075818.txt
Number of commands: 1356
First 100 commands:
#id: 56ce304daab44d1400b8850e
the city in the United States with the high -est populate -ion is New York
#id: 56ce304daab44d1400b8850f
the United Nations is base -d in New York
#id: 56ce304daab44d1400b88510
New York has been call -ed the culture -al capital of the world
#id: 56ce304daab44d1400b88511
New York is the American city that welcome -s the large -st number of legal immigrant -s
#id: 56cf5d41aab44d1400b89130
the major gateway for immigrate -ion has been New York City
#id: 56cf5d41aab44d1400b89131

### create the testing data
Select the rows where the pretraining is false, generate a set of commands for each row and save to a file.

In [5]:
list_of_testing_tuples = list(zip(nyc_squad_training_df["id"], nyc_squad_training_df["response_question_formatted"]))
write_testing_file(list_of_testing_tuples, testing_filepath)
print(f"Wrote {testing_filepath}")

with open(testing_filepath, "r") as commands_file:
    lines = commands_file.readlines()
    print(f"Number of commands: {len(lines)}")
    print("First 100 commands:")
    for line in lines[:20]:
        print(line.strip())

file created: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/9/testing/nyc_squad_testing_commands_20250929_075818.txt
Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/9/testing/nyc_squad_testing_commands_20250929_075818.txt
Number of commands: 2712
First 100 commands:
#id: 56ce304daab44d1400b8850e
? what city in the United States has the high -est populate -ion
.x
#END OF TESTING SAMPLE
#id: 56ce304daab44d1400b8850f
? in what city is the United Nations base -d
.x
#END OF TESTING SAMPLE
#id: 56ce304daab44d1400b88510
? what city has been call -ed the culture -al capital of the world
.x
#END OF TESTING SAMPLE
#id: 56ce304daab44d1400b88511
? what American city welcome -s the large -st number of legal immigrant -s
.x
#END OF TESTING SAMPLE
#id: 56cf5d41aab44d1400b89130
? the m

### create the pre-training validation testing data
Select the rows where the pretraining is true, generate a set of commands for each row and save to a file. This file is used to check that the pretraining commands have successfully trained the model.

In [6]:
list_of_pretraining_validation_testing_tuples = list(
    zip(nyc_squad_pretraining_df["id"], nyc_squad_pretraining_df["response_question_formatted"]))
write_testing_file(list_of_pretraining_validation_testing_tuples, pretraining_validation_testing_filepath)

print(f"Wrote {pretraining_validation_testing_filepath}")

with open(pretraining_validation_testing_filepath, "r") as commands_file:
    lines = commands_file.readlines()
    print(f"Number of commands: {len(lines)}")
    print("First 100 commands:")
    for line in lines[:20]:
        print(line.strip())

file created: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/9/testing/nyc_squad_pretraining_validation_testing_commands.txt
Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/9/testing/nyc_squad_pretraining_validation_testing_commands.txt
Number of commands: 360
First 100 commands:
#id: 56cf9df0234ae51400d9be27
? the English occupy -ed New York City begin -ning on what date
.x
#END OF TESTING SAMPLE
#id: 56ce3348aab44d1400b88560
? what island was once a part of Long Island
.x
#END OF TESTING SAMPLE
#id: 56ce33aaaab44d1400b8856a
? what was the name of the Lenape homeland
.x
#END OF TESTING SAMPLE
#id: 56cfa06a234ae51400d9be39
? when was the first European to visit the area of NYC
.x
#END OF TESTING SAMPLE
#id: 56ce3569aab44d1400b885aa
? what was the name of the explore -er 

## Appendix - Debugging cells

In [7]:
id = "56cfe293234ae51400d9c007"
test_df = nyc_squad_df[nyc_squad_df["id"] == id]
#add a new column to the dataframe with the created list of commands
result = test_df.apply(create_list_of_commands, axis=1)
result.values[0]

['# ID: 56cfe293234ae51400d9c007',
 'Brownstone rowhouse -s make up most of the large resident -ial district -s of NYC',
 '? what type of house -ing structure make -s up most of the large resident -ial district -s of NYC',
 '.wg type',
 '.wg house',
 '.wg structure',
 '.wg make',
 '.wg large',
 '.wg resident',
 '.wg district',
 '.wg NYC',
 '.ph Brownstone rowhouse -s make up most of the large resident -ial district -s of NYC',
 '.wg Brownstone rowhouse -s',
 '.rw']

In [8]:
from dataset_processing import remove_stopwords

result = remove_stopwords(
    '? what type of house -ing structure make -s up most of the large resident -ial district -s of NYC')
result

'? type house -ing structure make -s large resident -ial district -s NYC'