# This notebook creates the datasets for pretraining, training and testing the ANNABELL model using the NYC dataset derived from SQuAD.

In [14]:
# load the base dataset and select a random sample of rows for pretraining
import pandas as pd
import os
import glob
import platform
import datetime

from dataset_processing import create_list_of_commands, write_training_file, write_testing_file, \
    filter_dataset_by_limits, join_concurrent_capitalized_words

experiment_number = "11"
percentage_of_pretraining_samples = 10
use_manual_pretraining_data = False
maximum_number_of_words = 20
maximum_word_length = 25

operating_system = platform.system()
if operating_system == 'Windows':
    raise Exception("not used on Windows yet")
elif operating_system == 'Linux':
    base_directory = "/home/chris/gdrive/work/annabell/"
    pycharm_projects_directory = "/home/chris/PycharmProjects/dataset"
elif operating_system == 'Darwin':  #macOS
    base_directory = "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/"
    pycharm_projects_directory = "/Users/chris/PycharmProjects/dataset"
else:
    raise Exception("unsupported OS")

timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")
data_directory = os.path.join(base_directory, "experiments/data")
docker_data_directory = os.path.join(pycharm_projects_directory, "docker", "shared_data")
dataset_filename = "response_formatted_20250924_174653.jsonl"
dataset_filepath = os.path.join(data_directory, dataset_filename)
categorised_questions_filename = "llm_question_categorisation_results.jsonl"
categorised_questions_filepath = os.path.join(data_directory, "prompts", categorised_questions_filename)
categorised_sentences_filename = "llm_sentence_categorisation_results.jsonl"
categorised_sentences_filepath = os.path.join(data_directory, "prompts", categorised_sentences_filename)
pretraining_directory = os.path.join(base_directory, "experiments", experiment_number, "pre_training")
if not os.path.exists(pretraining_directory):
    os.makedirs(pretraining_directory)
pretraining_filename = "nyc_squad_pretraining_commands" + timestamp + ".txt"
pretraining_filepath = os.path.join(pretraining_directory, pretraining_filename)
training_directory = os.path.join(base_directory, "experiments", experiment_number, "training")
if not os.path.exists(training_directory):
    os.makedirs(training_directory)
training_filename = "nyc_squad_training_commands" + timestamp + ".txt"
training_filepath = os.path.join(training_directory, training_filename)
testing_directory = os.path.join(base_directory, "experiments", experiment_number, "testing")
if not os.path.exists(testing_directory):
    os.makedirs(testing_directory)
testing_filename = "nyc_squad_testing_commands" + timestamp + ".txt"
testing_filepath = os.path.join(testing_directory, testing_filename)
pretraining_validation_testing_filename = "nyc_squad_pretraining_validation_testing_commands" + timestamp + ".txt"
pretraining_validation_testing_filepath = os.path.join(testing_directory, pretraining_validation_testing_filename)
logs_directory = os.path.join(base_directory, "experiments", experiment_number, "logs")
if not os.path.exists(logs_directory):
    os.makedirs(logs_directory)
dataframe_filename = "nyc_squad_with_pretraining_commands" + timestamp + ".jsonl"
dataframe_filepath = os.path.join(data_directory, dataframe_filename)

In [15]:
#load the dataset
nyc_squad_df = pd.read_json(dataset_filepath, lines=True)
#remove any rows where the max number of words or max numbr of characters is exceeded
join_concurrent_capitalized_words(nyc_squad_df,
                                  ["response_declarative_sentence_formatted", "response_question_formatted",
                                   "response_answer_formatted"])
nyc_squad_df = filter_dataset_by_limits(nyc_squad_df,
                                        ["response_declarative_sentence_formatted", "response_question_formatted",
                                         "response_declarative_sentence_formatted"], maximum_number_of_words,
                                        maximum_word_length)
nyc_squad_df = nyc_squad_df[
    nyc_squad_df["response_declarative_sentence_formatted"].str.split().str.len() <= maximum_number_of_words]
nyc_squad_df = nyc_squad_df[
    nyc_squad_df["response_question_formatted"].str.split().str.len() <= maximum_number_of_words]
nyc_squad_df = nyc_squad_df[
    ~nyc_squad_df["response_declarative_sentence_formatted"].str.split().apply(
        lambda words: any(len(word) > maximum_word_length for word in words))]
nyc_squad_df = nyc_squad_df[
    ~nyc_squad_df["response_question_formatted"].str.split().apply(
        lambda words: any(len(word) > maximum_word_length for word in words))]
nyc_squad_df = nyc_squad_df[
    ~nyc_squad_df["response_answer_formatted"].str.split().apply(
        lambda words: any(len(word) > maximum_word_length for word in words))]
nyc_squad_df.reset_index(drop=True, inplace=True)
#add categories to the questions and declarative sentences, creating 2 new columns - question category and sentence category
categorised_questions_df = pd.read_json(categorised_questions_filepath, lines=True)
categorised_questions_df["category"].value_counts()
categorised_questions_df = categorised_questions_df.rename(columns={'category': 'question_category'})
nyc_squad_df = nyc_squad_df.merge(categorised_questions_df[["id", "question_category"]], on="id", how="left")
categorised_sentences_df = pd.read_json(categorised_sentences_filepath, lines=True)
categorised_sentences_df = categorised_sentences_df.rename(columns={'category': 'sentence_category'})
categorised_sentences_df["sentence_category"].value_counts()
nyc_squad_df = nyc_squad_df.merge(categorised_sentences_df[["id", "sentence_category"]], on="id", how="left")
nyc_squad_df

Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category
0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC)
1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,True,Wh-Adverbial Question,Passive Construction
2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction
3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA)
4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New-York-City,the major gateway for immigrate -ion has been ...,False,Wh-in-situ Question,Subject-Verb-Complement (SVC)
...,...,...,...,...,...,...,...,...,...,...,...,...
724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction
725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA)
726,56d1218c17492d1400aaba20,How much more money does the city give to the ...,How much more money does the city give to the ...,$ 11 billion,The city give –s $ 11 billion more money to th...,$11 billion,? how much more money does the city give to th...,11 billion,the city give -s 11 billion more money to the...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA)
727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA)


In [18]:
#pick a random sample of pretraining rows or use a pre-selected, manually generated set
#for each category, pick an equal number of samples
question_categories = nyc_squad_df["question_category"].unique()
sentence_categories = nyc_squad_df["sentence_category"].unique()
print(f"Question categories: {question_categories}")
print(f"Sentence categories: {sentence_categories}")
if use_manual_pretraining_data:
    raise Exception("not implemented yet")
if not use_manual_pretraining_data:
    nyc_squad_df["is_pretraining"] = False
    number_of_pretraining_samples = len(nyc_squad_df) * percentage_of_pretraining_samples // 100
    print(f"Number of pretraining samples: {number_of_pretraining_samples}")
samples_per_category = number_of_pretraining_samples // (len(question_categories) + len(sentence_categories))
print(f"Samples per category: {samples_per_category}")
#sample from the question categories
for category in question_categories:
    category_df = nyc_squad_df[nyc_squad_df["question_category"] == category]
    if len(category_df) < samples_per_category:
        raise Exception(f"Not enough samples in category {category}")
    sampled_category_df = category_df.sample(n=samples_per_category, random_state=42)
    nyc_squad_df.loc[sampled_category_df.index, "is_pretraining"] = True
    #sample from the sentence categories starting with those that are already selected for pretraining
    for category in sentence_categories:
        category_df = nyc_squad_df[nyc_squad_df["sentence_category"] == category]
        already_selected_df = category_df[category_df["is_pretraining"] == True]
        already_selected_count = len(already_selected_df)
        remaining_samples = samples_per_category - already_selected_count
        if remaining_samples > 0:
            not_selected_df = category_df[category_df["is_pretraining"] == False]
            if len(not_selected_df) < remaining_samples:
                raise Exception(f"Not enough samples in category {category}")
            sampled_category_df = not_selected_df.sample(n=remaining_samples, random_state=42)
            nyc_squad_df.loc[sampled_category_df.index, "is_pretraining"] = True
#print the counts of samples in the question and sentence categories
print("Pretraining samples by question category:")
print(nyc_squad_df[nyc_squad_df["is_pretraining"] == True]["question_category"].value_counts())
print("Pretraining samples by sentence category:")
print(nyc_squad_df[nyc_squad_df["is_pretraining"] == True]["sentence_category"].value_counts())
total_pretraining_count = nyc_squad_df["is_pretraining"].sum()
print(f"Total number of samples selected for pretraining: {total_pretraining_count}")

nyc_squad_df

Question categories: ['Wh-Subject Question' 'Wh-Adverbial Question' 'Wh-in-situ Question'
 'Quantitative Wh-Question' 'Wh-Object/Complement Question']
Sentence categories: ['Subject-Verb-Complement (SVC)' 'Passive Construction'
 'Subject-Verb-Object (SVO/SVOA)' 'Subject-Verb-Adverbial (SVA)'
 'Existential Clause' 'Subject-Verb-Object-Complement (SVOC)']
Number of pretraining samples: 72
Samples per category: 6
Pretraining samples by question category:
question_category
Quantitative Wh-Question         18
Wh-Subject Question              15
Wh-Object/Complement Question    11
Wh-Adverbial Question             9
Wh-in-situ Question               6
Name: count, dtype: int64
Pretraining samples by sentence category:
sentence_category
Passive Construction                     15
Subject-Verb-Complement (SVC)            13
Subject-Verb-Adverbial (SVA)             10
Subject-Verb-Object (SVO/SVOA)            9
Subject-Verb-Object-Complement (SVOC)     6
Existential Clause                      

Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category
0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC)
1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction
2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction
3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA)
4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New-York-City,the major gateway for immigrate -ion has been ...,False,Wh-in-situ Question,Subject-Verb-Complement (SVC)
...,...,...,...,...,...,...,...,...,...,...,...,...
724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction
725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA)
726,56d1218c17492d1400aaba20,How much more money does the city give to the ...,How much more money does the city give to the ...,$ 11 billion,The city give –s $ 11 billion more money to th...,$11 billion,? how much more money does the city give to th...,11 billion,the city give -s 11 billion more money to the...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA)
727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA)


### create the pretraining data
Select the rows where the pretraining is true, generate a set of commands for each row and save to a file.

In [19]:
#add a new column to the dataframe with the created list of commands
if use_manual_pretraining_data:
    raise Exception("not implemented yet")
#to do - load the pretraining commands from a file and add to the dataframe
else:
    nyc_squad_df["created_commands"] = nyc_squad_df.apply(create_list_of_commands, axis=1)
nyc_squad_training_df = nyc_squad_df[nyc_squad_df["is_pretraining"] == False]
nyc_squad_pretraining_df = nyc_squad_df[nyc_squad_df["is_pretraining"] == True]
#save the final dataframe as a JSON lines file
nyc_squad_df.to_json(dataframe_filepath, orient="records",
                     lines=True)

In [20]:
with open(pretraining_filepath, "w") as commands_file:
    for index, row in nyc_squad_pretraining_df.iterrows():
        commands = row["created_commands"]
        for command in commands:
            commands_file.write(command + "\n")
print(f"Wrote {pretraining_filepath}")

with open(pretraining_filepath, "r") as commands_file:
    lines = commands_file.readlines()
number_of_reward_lines = sum(1 for line in lines if line.startswith(".rw"))
print(f"Number of reward lines: {number_of_reward_lines}")
print(f"Number of commands: {len(lines)}")
for line in lines[:20]:
    print(line.strip())

Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/pre_training/nyc_squad_pretraining_commands_20251009_075658.txt
Number of reward lines: 59
Number of commands: 686
# ID: 56ce345caab44d1400b88584
Giovanni da Verrazzano call -ed the area Nouvelle-Angouleme when he stake -d a claim on it
? what did Giovanni da Verrazzano call the area when he stake -d claim on it
.wg Giovanni
.wg da
.wg Verrazzano
.wg call
.wg area
.wg stake
.wg claim
.ph Giovanni da Verrazzano call -ed the area Nouvelle-Angouleme when he stake -d a claim on it
.wg Nouvelle-Angouleme
.rw
# ID: 56ce3569aab44d1400b885ae
Henry-Hudson call -ed the river that is now know -n as the Hudson-River the North-River
? what did Henry-Hudson call the river that is now call -ed the Hudson-River
.wg Henry-Hudson
.wg call
.wg river
.wg call


### create the training data
Select the rows where the pretraining is false, generate a set of commands for each row and save to a file.

In [21]:
#combine 2 columns together into a list of tuples
list_of_training_tuples = list(
    zip(nyc_squad_training_df["id"], nyc_squad_training_df["response_declarative_sentence_formatted"]))

write_training_file(list_of_training_tuples, training_filepath)
print(f"Wrote {training_filepath}")

with open(training_filepath, "r") as commands_file:
    lines = commands_file.readlines()
    print(f"Number of commands: {len(lines)}")
    print("First 100 commands:")
    for line in lines[:20]:
        print(line.strip())

file created: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/training/nyc_squad_training_commands_20251009_075658.txt
Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/training/nyc_squad_training_commands_20251009_075658.txt
Number of commands: 1340
First 100 commands:
#id: 56ce304daab44d1400b8850e
the city in the United-States with the high -est populate -ion is New-York
#id: 56ce304daab44d1400b8850f
the United-Nations is base -d in New-York
#id: 56ce304daab44d1400b88510
New-York has been call -ed the culture -al capital of the world
#id: 56ce304daab44d1400b88511
New-York is the American city that welcome -s the large -st number of legal immigrant -s
#id: 56cf5d41aab44d1400b89130
the major gateway for immigrate -ion has been New-York-City
#id: 56cf5d41aab44d1400b891

### create the testing data
Select the rows where the pretraining is false, generate a set of commands for each row and save to a file.

In [22]:
list_of_testing_tuples = list(zip(nyc_squad_training_df["id"], nyc_squad_training_df["response_question_formatted"]))
write_testing_file(list_of_testing_tuples, testing_filepath)
print(f"Wrote {testing_filepath}")

with open(testing_filepath, "r") as commands_file:
    lines = commands_file.readlines()
    print(f"Number of commands: {len(lines)}")
    print("First 100 commands:")
    for line in lines[:20]:
        print(line.strip())

file created: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/testing/nyc_squad_testing_commands_20251009_075658.txt
Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/testing/nyc_squad_testing_commands_20251009_075658.txt
Number of commands: 2680
First 100 commands:
#id: 56ce304daab44d1400b8850e
? what city in the United-States has the high -est populate -ion
.x
#END OF TESTING SAMPLE
#id: 56ce304daab44d1400b8850f
? in what city is the United-Nations base -d
.x
#END OF TESTING SAMPLE
#id: 56ce304daab44d1400b88510
? what city has been call -ed the culture -al capital of the world
.x
#END OF TESTING SAMPLE
#id: 56ce304daab44d1400b88511
? what American city welcome -s the large -st number of legal immigrant -s
.x
#END OF TESTING SAMPLE
#id: 56cf5d41aab44d1400b89130
? the

### create the pre-training validation testing data
Select the rows where the pretraining is true, generate a set of commands for each row and save to a file. This file is used to check that the pretraining commands have successfully trained the model.

In [23]:
list_of_pretraining_validation_testing_tuples = list(
    zip(nyc_squad_pretraining_df["id"], nyc_squad_pretraining_df["response_question_formatted"]))
write_testing_file(list_of_pretraining_validation_testing_tuples, pretraining_validation_testing_filepath)

print(f"Wrote {pretraining_validation_testing_filepath}")

with open(pretraining_validation_testing_filepath, "r") as commands_file:
    lines = commands_file.readlines()
    print(f"Number of commands: {len(lines)}")
    print("First 100 commands:")
    for line in lines[:20]:
        print(line.strip())

file created: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/testing/nyc_squad_pretraining_validation_testing_commands.txt
Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/testing/nyc_squad_pretraining_validation_testing_commands.txt
Number of commands: 236
First 100 commands:
#id: 56ce345caab44d1400b88584
? what did Giovanni da Verrazzano call the area when he stake -d claim on it
.x
#END OF TESTING SAMPLE
#id: 56ce3569aab44d1400b885ae
? what did Henry-Hudson call the river that is now call -ed the Hudson-River
.x
#END OF TESTING SAMPLE
#id: 56cfab96234ae51400d9be44
? in what year was the land between Cape-Cod and Delaware-Bay claim -ed by the Dutch
.x
#END OF TESTING SAMPLE
#id: 56cedbb9aab44d1400b88b12
? what person bought Manhattan from the Canarsie for the Dutc

## Generate the command line instructions for running the experiments

In [25]:
#copy the data files to the docker shared data directory for processing
os.system(f'cp "{pretraining_filepath}" {docker_data_directory}/pre-training')
os.system(f'cp "{training_filepath}" {docker_data_directory}/training')
os.system(f'cp "{testing_filepath}" {docker_data_directory}/testing')
os.system(f'cp "{pretraining_validation_testing_filepath}" {docker_data_directory}/testing')

0

In [26]:
#Create the pre-training command
print(
    f'docker compose run --remove-orphans --entrypoint ./pre_train_annabell_squad_nyc.sh app data/pre-training/logfile_nyc_squad_pretraining_commands.txt data/pre-training/{pretraining_filename} data/pre-training/{pretraining_filename.replace(".txt", ".dat")}')

#run the pretraining
docker compose run --remove-orphans --entrypoint ./pre_train_annabell_squad_nyc.sh app data/pre-training/logfile_nyc_squad_pretraining_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251009_075658.txt data/pre-training/nyc_squad_pretraining_commands_20251009_075658.dat


In [36]:
#copy the pre-trained weights to the pre training directory
source_path = os.path.join(docker_data_directory, "pre-training", pretraining_filename.replace(".txt", ".dat"))
destination_path = os.path.join(pretraining_directory, pretraining_filename.replace(".txt", ".dat"))
os.system(f'cp "{source_path}" "{destination_path}"')
print("copied: " + source_path + " to: " + destination_path)

#copy the pre-training logfile to the pre training directory
source_pattern = os.path.join(docker_data_directory, "pre-training", "logfile_nyc_squad_pretraining_commands*")
destination_dir = logs_directory

# Find all files matching the pattern
log_files = glob.glob(source_pattern)

# Move each found file to the destination directory
for file_path in log_files:
    os.system(f'mv "{file_path}" "{destination_path}"')
    print(f"moved: {file_path} to: {destination_dir}")

copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/pre-training/nyc_squad_pretraining_commands_20251009_075658.dat to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/pre_training/nyc_squad_pretraining_commands_20251009_075658.dat
moved: /Users/chris/PycharmProjects/dataset/docker/shared_data/pre-training/logfile_nyc_squad_pretraining_commands_2025-10-09_12-02-19.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/pre_training
moved: /Users/chris/PycharmProjects/dataset/docker/shared_data/pre-training/logfile_nyc_squad_pretraining_commands_2025-10-04_14-43-41.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/pre_training
moved: /Users/chris/P

In [29]:
pretraining_validation_testing_filename = "nyc_squad_pretraining_validation_testing_commands_20251009_075658.txt"

In [32]:
#run the pretraining validation testing
print("#run the pretraining validation testing")
print(
    f'docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app data/testing/logfile_nyc_squad_pretraining_validation_testing_commands.txt data/pre-training/{pretraining_filename.replace(".txt", ".dat")} data/testing/{pretraining_validation_testing_filename}')

#run the pretraining validation testing
docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app data/testing/logfile_nyc_squad_pretraining_validation_testing_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251009_075658.dat data/testing/nyc_squad_pretraining_validation_testing_commands_20251009_075658.txt


In [37]:
#Copy the testing logs back to the experiment directory
source_pattern = os.path.join(docker_data_directory, "testing",
                              "logfile_nyc_squad_pretraining_validation_testing_commands*")
destination_dir = logs_directory
# Find all files matching the pattern
log_files = glob.glob(source_pattern)
# Copy each found file to the destination directory
for file_path in log_files:
    os.system(f'cp "{file_path}" "{destination_dir}"')
    print(f"copied: {file_path} to: {destination_dir}")

moved: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/logfile_nyc_squad_pretraining_validation_testing_commands_2025-10-09_16-57-43.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/testing


### perform the testing using the "test annabell" notebook

In [40]:
#print the command to run the training
print(
    f'docker compose run --remove-orphans --entrypoint ./train_annabell_squad.sh app data/training/logfile_nyc_squad_training_commands.txt data/pre-training/{pretraining_filename.replace(".txt", ".dat")} data/training/{training_filename.replace(".txt", ".dat")} data/training/{training_filename}')

docker compose run --remove-orphans --entrypoint ./train_annabell_squad.sh app data/training/logfile_nyc_squad_training_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251009_075658.dat data/training/nyc_squad_training_commands_20251009_075658.dat data/training/nyc_squad_training_commands_20251009_075658.txt


In [43]:
#copy the training weights back to the experiment directory
source_path = os.path.join(docker_data_directory, "training", training_filename.replace(".txt", ".dat"))
destination_path = os.path.join(training_directory, training_filename.replace(".txt", ".dat"))
os.system(f'cp "{source_path}" "{destination_path}"')
print("copied: " + source_path + " to: " + destination_path)
#copy the training logfile to the logs directory
source_pattern = os.path.join(docker_data_directory, "training", "logfile_nyc_squad_training_commands*")
destination_dir = logs_directory
# Find all files matching the pattern
log_files = glob.glob(source_pattern)
# Move each found file to the destination directory
for file_path in log_files:
    os.system(f'mv "{file_path}" "{destination_dir}"')
    print(f"moved: {file_path} to: {destination_dir}")

copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/training/nyc_squad_training_commands_20251009_075658.dat to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/training/nyc_squad_training_commands_20251009_075658.dat


In [44]:
#print the command to run the testing
print("#run the testing")
print(
    f'docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app data/testing/logfile_nyc_squad_testing_commands.txt data/training/{training_filename.replace(".txt", ".dat")} data/testing/{testing_filename}')

#run the testing
docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app data/testing/logfile_nyc_squad_testing_commands.txt data/training/nyc_squad_training_commands_20251009_075658.dat data/testing/nyc_squad_testing_commands_20251009_075658.txt


In [45]:
#Copy the testing logs back to the experiment directory
source_pattern = os.path.join(docker_data_directory, "testing", "logfile_nyc_squad_testing_commands*")
destination_dir = logs_directory
# Find all files matching the pattern
log_files = glob.glob(source_pattern)
# Copy each found file to the destination directory
for file_path in log_files:
    os.system(f'cp "{file_path}" "{destination_dir}"')
    print(f"copied: {file_path} to: {destination_dir}")

copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/logfile_nyc_squad_testing_commands_2025-10-10_06-56-59.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/logs


### Run the testing notebook to evaluate the results

## Appendix - Debugging cells

In [7]:
id = "56cfe293234ae51400d9c007"
test_df = nyc_squad_df[nyc_squad_df["id"] == id]
#add a new column to the dataframe with the created list of commands
result = test_df.apply(create_list_of_commands, axis=1)
result.values[0]

['# ID: 56cfe293234ae51400d9c007',
 'Brownstone rowhouse -s make up most of the large resident -ial district -s of NYC',
 '? what type of house -ing structure make -s up most of the large resident -ial district -s of NYC',
 '.wg type',
 '.wg house',
 '.wg structure',
 '.wg make',
 '.wg large',
 '.wg resident',
 '.wg district',
 '.wg NYC',
 '.ph Brownstone rowhouse -s make up most of the large resident -ial district -s of NYC',
 '.wg Brownstone rowhouse -s',
 '.rw']

In [8]:
from dataset_processing import remove_stopwords

result = remove_stopwords(
    '? what type of house -ing structure make -s up most of the large resident -ial district -s of NYC')
result

'? type house -ing structure make -s large resident -ial district -s NYC'