# This notebook creates the datasets for pretraining, training and testing the ANNABELL model using the NYC dataset derived from SQuAD.

In [15]:
import os
import glob
import platform
import datetime
import shutil

from dataset_processing import create_list_of_commands, write_training_file, write_testing_file, \
    read_dataset, merge_categories, select_pretraining_data, write_pretraining_file, \
    write_testing_file, write_training_file


# Experiment settings that are expected to change between runs.
experiment_number = "14"
percentage_of_pretraining_samples = 50
use_manual_pretraining_data = False
maximum_number_of_words = 20
maximum_word_length = 25
dataset_filename = "response_formatted_20250924_174653.jsonl"

# platform.system() name -> (base_directory, pycharm_projects_directory)
directories_by_os = {
    'Windows': (
        "G:\\My Drive\\Shared with Julia\\Education\\Kent University\\PhD\\work\\annabell",
        "C:\\Users\\cjwal\\PycharmProjects\\Training-and-evaluating-cognitive-language-models",
    ),
    'Linux': (
        "/home/chris/gdrive/work/annabell/",
        "/home/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models",
    ),
    'Darwin': (  # macOS
        "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/",
        "/Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models",
    ),
}

# Pick the directory pair for the machine running the notebook; any other
# platform is rejected up front so no path is built from undefined names.
operating_system = platform.system()
if operating_system not in directories_by_os:
    raise Exception("unsupported OS")
base_directory, pycharm_projects_directory = directories_by_os[operating_system]

# Suffix shared by every file generated in this run, so the pretraining,
# training and testing files of one run can be matched by name.
timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")
#timestamp = "_20251014_063732"  #use a fixed timestamp if notebook is stopped part way through

# Input files.
data_directory = os.path.join(base_directory, "experiments", "data")
docker_data_directory = os.path.join(pycharm_projects_directory, "docker", "shared_data")
dataset_filepath = os.path.join(data_directory, dataset_filename)
categorised_questions_filename = "llm_question_categorisation_results.jsonl"
categorised_questions_filepath = os.path.join(data_directory, "prompts", categorised_questions_filename)
categorised_sentences_filename = "llm_sentence_categorisation_results.jsonl"
categorised_sentences_filepath = os.path.join(data_directory, "prompts", categorised_sentences_filename)

# Output directories: one set per experiment number, plus a dataframes
# directory shared by all experiments.
experiment_directory = os.path.join(base_directory, "experiments", experiment_number)
pretraining_directory = os.path.join(experiment_directory, "pre_training")
training_directory = os.path.join(experiment_directory, "training")
testing_directory = os.path.join(experiment_directory, "testing")
logs_directory = os.path.join(experiment_directory, "logs")
dataframe_directory = os.path.join(base_directory, "experiments", "dataframes")

# exist_ok=True makes the cell safe to re-run and avoids the race between a
# separate existence check and the creation call. It still raises if one of
# the paths exists but is not a directory.
for output_directory in (pretraining_directory, training_directory, testing_directory,
                         logs_directory, dataframe_directory):
    os.makedirs(output_directory, exist_ok=True)

# Output files, all carrying the run timestamp.
pretraining_filename = "nyc_squad_pretraining_commands" + timestamp + ".txt"
pretraining_filepath = os.path.join(pretraining_directory, pretraining_filename)
training_filename = "nyc_squad_training_commands" + timestamp + ".txt"
training_filepath = os.path.join(training_directory, training_filename)
testing_filename = "nyc_squad_testing_commands" + timestamp + ".txt"
testing_filepath = os.path.join(testing_directory, testing_filename)
pretraining_validation_testing_filename = "nyc_squad_pretraining_validation_testing_commands" + timestamp + ".txt"
pretraining_validation_testing_filepath = os.path.join(testing_directory, pretraining_validation_testing_filename)
dataframe_filename = "nyc_squad_with_pretraining_commands" + timestamp + ".jsonl"
dataframe_filepath = os.path.join(dataframe_directory, dataframe_filename)

In [16]:
# Read the formatted dataset; the two limits are passed straight to
# read_dataset (presumably used to filter long samples - confirm in
# dataset_processing).
nyc_squad_df = read_dataset(dataset_filepath, maximum_number_of_words, maximum_word_length)
# Combine the dataset with the question and sentence categorisation results.
nyc_squad_df = merge_categories(nyc_squad_df, categorised_questions_filepath, categorised_sentences_filepath)
nyc_dataframe = select_pretraining_data(nyc_squad_df, use_manual_pretraining_data,
                                        percentage_of_pretraining_samples)
# Later cells read `nyc_squad_df["is_pretraining"]`, so bind the returned frame
# to that name as well. This is a no-op if select_pretraining_data modifies its
# argument in place, and it keeps downstream cells correct if it returns a copy.
# NOTE(review): assumes the helper returns the full frame (pretraining and
# non-pretraining rows), as the displayed output suggests - confirm.
nyc_squad_df = nyc_dataframe
nyc_dataframe

Question categories: ['Wh-Subject Question' 'Wh-Adverbial Question' 'Wh-in-situ Question'
 'Quantitative Wh-Question' 'Wh-Object/Complement Question']
Sentence categories: ['Subject-Verb-Complement (SVC)' 'Passive Construction'
 'Subject-Verb-Object (SVO/SVOA)' 'Subject-Verb-Adverbial (SVA)'
 'Existential Clause' 'Subject-Verb-Object-Complement (SVOC)']
Number of pretraining samples: 364
Samples per category: 33
Pretraining samples by question category:
question_category
Quantitative Wh-Question         62
Wh-Adverbial Question            40
Wh-Object/Complement Question    38
Wh-Subject Question              38
Wh-in-situ Question              33
Name: count, dtype: int64
Pretraining samples by sentence category:
sentence_category
Subject-Verb-Complement (SVC)            67
Passive Construction                     38
Subject-Verb-Object (SVO/SVOA)           34
Subject-Verb-Adverbial (SVA)             33
Existential Clause                       29
Subject-Verb-Object-Complement (SVOC) 

Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category
0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC)
1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction
2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction
3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA)
4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New-York-City,the major gateway for immigrate -ion has been ...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC)
...,...,...,...,...,...,...,...,...,...,...,...,...
724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction
725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA)
726,56d1218c17492d1400aaba20,How much more money does the city give to the ...,How much more money does the city give to the ...,$ 11 billion,The city give –s $ 11 billion more money to th...,$11 billion,? how much more money does the city give to th...,11 billion,the city give -s 11 billion more money to the...,True,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA)
727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA)


### Create the pretraining data
Generate a list of commands for every row, split the rows into a pretraining set (`is_pretraining` is true) and a training set (`is_pretraining` is false), and save the full dataframe to a JSON Lines file.

In [17]:
# Add a "created_commands" column holding the list of commands built for each row.
if use_manual_pretraining_data:
    # TODO: load the pretraining commands from a file and add them to the dataframe
    # NotImplementedError is still an Exception, so existing handlers keep working.
    raise NotImplementedError("not implemented yet")
nyc_squad_df["created_commands"] = nyc_squad_df.apply(create_list_of_commands, axis=1)

# Split on the is_pretraining flag. The explicit comparisons are kept on
# purpose: a row whose flag is missing matches neither True nor False and so
# lands in neither subset.
nyc_squad_training_df = nyc_squad_df[nyc_squad_df["is_pretraining"] == False]
nyc_squad_pretraining_df = nyc_squad_df[nyc_squad_df["is_pretraining"] == True]

# Save the complete dataframe (all rows, including the new column) as a JSON Lines file.
nyc_squad_df.to_json(dataframe_filepath, orient="records",
                     lines=True)

### Write the files containing the commands to perform pretraining, training and testing on ANNABELL

In [18]:
# Each entry is (writer, output path, rows to write). The testing file is built
# from the training rows, and the pretraining-validation testing file from the
# pretraining rows. The order matches the order the files are written in.
command_file_jobs = (
    (write_pretraining_file, pretraining_filepath, nyc_squad_pretraining_df),
    (write_training_file, training_filepath, nyc_squad_training_df),
    (write_testing_file, testing_filepath, nyc_squad_training_df),
    (write_testing_file, pretraining_validation_testing_filepath, nyc_squad_pretraining_df),
)
for write_commands, output_filepath, rows_df in command_file_jobs:
    write_commands(output_filepath, rows_df)

Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/pre_training/nyc_squad_pretraining_commands_20251022_085512.txt
Number of reward lines: 211
Number of commands: 2828
# ID: 56cf5d41aab44d1400b89130
the major gateway for immigrate -ion has been New-York-City
? the major gateway for immigrate -ion has been which US city
.wg major
.wg gateway
.wg immigrate
.wg US
.wg city
.ph the major gateway for immigrate -ion has been New-York-City
.wg New-York-City
.rw


# ID: 56cf5d41aab44d1400b89131
the most populate -d city in the United-States is New-York-City
? the most populate -d city in the United-States is which city
.wg populate
.wg city
.wg United-States
.wg city
file written: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/training/nyc_squad_training_commands_20251022_085

## Generate the command line instructions for running the experiments

In [19]:
# Define destination directories using os.path.join for cross-platform compatibility
pre_training_dest = os.path.join(docker_data_directory, 'pre-training')
training_dest = os.path.join(docker_data_directory, 'training')
testing_dest = os.path.join(docker_data_directory, 'testing')

# shutil.copy only copies *into* dst when dst is an existing directory. If the
# directory is missing, dst is treated as the target file name and a file called
# e.g. "testing" is written instead. Create the directories first so the copies
# always land inside them under their original file names.
for destination in (pre_training_dest, training_dest, testing_dest):
    os.makedirs(destination, exist_ok=True)

# Copy the data files to the docker shared data directory for processing
shutil.copy(pretraining_filepath, pre_training_dest)
shutil.copy(training_filepath, training_dest)
shutil.copy(testing_filepath, testing_dest)
shutil.copy(pretraining_validation_testing_filepath, testing_dest)

print("Files copied to docker shared data directory.")

Files copied to docker shared data directory.


In [20]:
# Build and print (not execute) the docker compose command for pre-training.
# The weights file name is the commands file name with ".txt" swapped for ".dat".
pretraining_weights_filename = pretraining_filename.replace(".txt", ".dat")
pretraining_command_parts = (
    'docker compose run --remove-orphans --entrypoint ./pre_train_annabell_squad_nyc.sh app',
    'data/pre-training/logfile_nyc_squad_pretraining_commands.txt',
    f'data/pre-training/{pretraining_filename}',
    f'data/pre-training/{pretraining_weights_filename}',
)
print(" ".join(pretraining_command_parts))

docker compose run --remove-orphans --entrypoint ./pre_train_annabell_squad_nyc.sh app data/pre-training/logfile_nyc_squad_pretraining_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251022_085512.txt data/pre-training/nyc_squad_pretraining_commands_20251022_085512.dat


In [13]:
# Copy the pre-trained weights (.dat, named after the commands file) from the
# docker shared data directory into the experiment's pre-training directory.
weights_filename = pretraining_filename.replace(".txt", ".dat")
source_path = os.path.join(docker_data_directory, "pre-training", weights_filename)
destination_path = os.path.join(pretraining_directory, weights_filename)

# Failures are reported and the cell carries on, so a missing weights file
# does not stop the log files below from being collected.
try:
    shutil.copy(source_path, destination_path)
except FileNotFoundError:
    print(f"Error: Source file not found at {source_path}")
except Exception as copy_error:
    print(f"An error occurred: {copy_error}")
else:
    print(f"copied: {source_path} to: {destination_path}")

# Move (not copy) every pre-training log file out of the docker shared data
# directory and into logs_directory.
destination_dir = logs_directory
source_pattern = os.path.join(docker_data_directory, "pre-training", "logfile_nyc_squad_pretraining_commands*")
log_files = glob.glob(source_pattern)
if len(log_files) == 0:
    print(f"No log files found matching pattern: {source_pattern}")

for file_path in log_files:
    try:
        shutil.move(file_path, destination_dir)
    except FileNotFoundError:
        print(f"Error: Log file not found at {file_path}")
    except Exception as move_error:
        print(f"An error occurred while moving {file_path}: {move_error}")
    else:
        print(f"moved: {file_path} to: {destination_dir}")

Error: Source file not found at /Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models/docker/shared_data/pre-training/nyc_squad_pretraining_commands_20251021_120346.dat
No log files found matching pattern: /Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models/docker/shared_data/pre-training/logfile_nyc_squad_pretraining_commands*


In [14]:
# Count the "# ID:" marker lines in a pre-training log of experiment 13.
# The path is built from base_directory so the cell also works on the other
# supported platforms; on macOS it resolves to the same file as before.
# NOTE(review): the file name starts with "logfil_" (not "logfile_"). It is kept
# unchanged because the cell was run against that name - confirm it is intended.
log_file = os.path.join(base_directory, "experiments", "13", "logs",
                        "logfil_nyc_squad_pretraining_commands_20251021_120346.txt")
with open(log_file, 'r') as f:
    log_contents = f.readlines()
# Keep only the marker lines (each still ends with its newline).
ids = [id_line for id_line in log_contents if id_line.startswith('# ID:')]
print(len(ids))
ids

304


['# ID: 56cf5d41aab44d1400b89130\n',
 '# ID: 56cf5d41aab44d1400b89131\n',
 '# ID: 56ce3124aab44d1400b8852c\n',
 '# ID: 56cf9d81234ae51400d9be1d\n',
 '# ID: 56ce31baaab44d1400b8853a\n',
 '# ID: 56ce31baaab44d1400b8853e\n',
 '# ID: 56cf9df0234ae51400d9be29\n',
 '# ID: 56ce32e7aab44d1400b88551\n',
 '# ID: 56cf9f16234ae51400d9be30\n',
 '# ID: 56ce3348aab44d1400b88560\n',
 '# ID: 56cf9f4b234ae51400d9be35\n',
 '# ID: 56ce345caab44d1400b88580\n',
 '# ID: 56ce345caab44d1400b88581\n',
 '# ID: 56ce345caab44d1400b88583\n',
 '# ID: 56ce345caab44d1400b88584\n',
 '# ID: 56cfa06a234ae51400d9be39\n',
 '# ID: 56cfa06a234ae51400d9be3a\n',
 '# ID: 56cfa06a234ae51400d9be3b\n',
 '# ID: 56ce34c7aab44d1400b88596\n',
 '# ID: 56cfa4d8234ae51400d9be3f\n',
 '# ID: 56cfa4d8234ae51400d9be40\n',
 '# ID: 56ce3569aab44d1400b885ae\n',
 '# ID: 56cfab96234ae51400d9be43\n',
 '# ID: 56cfab96234ae51400d9be44\n',
 '# ID: 56ce362aaab44d1400b885bc\n',
 '# ID: 56cfabed234ae51400d9be4b\n',
 '# ID: 56cedbb9aab44d1400b88b0f\n',
 

In [12]:
# Print (not execute) the docker compose command that tests the pre-trained
# weights against the pretraining-validation testing commands.
print("#run the pretraining validation testing")
validation_command_parts = (
    'docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app',
    'data/testing/logfile_nyc_squad_pretraining_validation_testing_commands.txt',
    f'data/pre-training/{pretraining_filename.replace(".txt", ".dat")}',
    f'data/testing/{pretraining_validation_testing_filename}',
)
print(" ".join(validation_command_parts))

#run the pretraining validation testing
docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app data/testing/logfile_nyc_squad_pretraining_validation_testing_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251014_063732.dat data/testing/nyc_squad_pretraining_validation_testing_commands_20251014_063732.txt


In [9]:
# Copy (the originals stay in place) every pretraining-validation testing log
# from the docker shared data directory into logs_directory.
destination_dir = logs_directory
source_pattern = os.path.join(
    docker_data_directory,
    "testing",
    "logfile_nyc_squad_pretraining_validation_testing_commands*",
)
log_files = glob.glob(source_pattern)

# A failure on one file is reported and the remaining files are still copied.
for file_path in log_files:
    try:
        shutil.copy(file_path, destination_dir)
    except FileNotFoundError:
        print(f"Error: Source file not found at {file_path}")
    except Exception as copy_error:
        print(f"An error occurred: {copy_error}")
    else:
        print(f"copied: {file_path} to: {destination_dir}")

copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/logfile_nyc_squad_pretraining_validation_testing_commands_2025-10-09_16-57-43.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/12/logs
copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/logfile_nyc_squad_pretraining_validation_testing_commands_2025-10-15_05-21-58.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/12/logs


### perform the testing using the "test annabell" notebook

In [13]:
# Print (not execute) the docker compose command for the training run.
# Both weights file names are the matching commands file name with ".txt"
# swapped for ".dat".
pretraining_weights_filename = pretraining_filename.replace(".txt", ".dat")
training_weights_filename = training_filename.replace(".txt", ".dat")
training_command_parts = (
    'docker compose run --remove-orphans --entrypoint ./train_annabell_squad.sh app',
    'data/training/logfile_nyc_squad_training_commands.txt',
    f'data/pre-training/{pretraining_weights_filename}',
    f'data/training/{training_weights_filename}',
    f'data/training/{training_filename}',
)
print(" ".join(training_command_parts))

docker compose run --remove-orphans --entrypoint ./train_annabell_squad.sh app data/training/logfile_nyc_squad_training_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251014_063732.dat data/training/nyc_squad_training_commands_20251014_063732.dat data/training/nyc_squad_training_commands_20251014_063732.txt


In [14]:
# Copy the trained weights (.dat, named after the training commands file) from
# the docker shared data directory into the experiment's training directory.
training_weights_filename = training_filename.replace(".txt", ".dat")
source_path = os.path.join(docker_data_directory, "training", training_weights_filename)
destination_path = os.path.join(training_directory, training_weights_filename)
try:
    shutil.copy(source_path, destination_path)
except FileNotFoundError:
    print(f"Error: Source file not found at {source_path}")
except Exception as copy_error:
    print(f"An error occurred: {copy_error}")
else:
    print(f"copied: {source_path} to: {destination_path}")

# Move (not copy) every training log file out of the docker shared data
# directory and into logs_directory. A failure on one file is reported and the
# remaining files are still processed.
destination_dir = logs_directory
source_pattern = os.path.join(docker_data_directory, "training", "logfile_nyc_squad_training_commands*")
log_files = glob.glob(source_pattern)
for file_path in log_files:
    try:
        shutil.move(file_path, destination_dir)
    except FileNotFoundError:
        print(f"Error: Log file not found at {file_path}")
    except Exception as move_error:
        print(f"An error occurred while moving {file_path}: {move_error}")
    else:
        print(f"moved: {file_path} to: {destination_dir}")

copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/training/nyc_squad_training_commands_20251014_063732.dat to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/12/training/nyc_squad_training_commands_20251014_063732.dat
moved: /Users/chris/PycharmProjects/dataset/docker/shared_data/training/logfile_nyc_squad_training_commands_2025-10-15_05-30-27.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/12/logs
moved: /Users/chris/PycharmProjects/dataset/docker/shared_data/training/logfile_nyc_squad_training_commands_2025-10-15_05-42-17.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/12/logs


In [16]:
# Print (not execute) the docker compose command that tests the trained
# weights against the testing commands.
print("#run the testing")
testing_command_parts = (
    'docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app',
    'data/testing/logfile_nyc_squad_testing_commands.txt',
    f'data/training/{training_filename.replace(".txt", ".dat")}',
    f'data/testing/{testing_filename}',
)
print(" ".join(testing_command_parts))

#run the testing
docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app data/testing/logfile_nyc_squad_testing_commands.txt data/training/nyc_squad_training_commands_20251014_063732.dat data/testing/nyc_squad_testing_commands_20251014_063732.txt


In [17]:
# Copy (the originals stay in place) every testing log from the docker shared
# data directory into logs_directory.
destination_dir = logs_directory
source_pattern = os.path.join(docker_data_directory, "testing", "logfile_nyc_squad_testing_commands*")
log_files = glob.glob(source_pattern)

# A failure on one file is reported and the remaining files are still copied.
for file_path in log_files:
    try:
        shutil.copy(file_path, destination_dir)
    except FileNotFoundError:
        print(f"Error: Source file not found at {file_path}")
    except Exception as copy_error:
        print(f"An error occurred: {copy_error}")
    else:
        print(f"copied: {file_path} to: {destination_dir}")

copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/logfile_nyc_squad_testing_commands_2025-10-15_08-31-02.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/12/logs
copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/logfile_nyc_squad_testing_commands_2025-10-15_07-07-08.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/12/logs
copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/logfile_nyc_squad_testing_commands_2025-10-15_08-27-13.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/12/logs
copied: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/logfile_nyc_squad_testing_commands_20

### Run the testing notebook to evaluate the results

## Appendix - Debugging cells

In [7]:
# Debug cell: rebuild the command list for a single dataset row, looked up by
# the value in its "id" column.
# Named sample_id so the built-in id() is not shadowed for the rest of the
# kernel session.
sample_id = "56cfe293234ae51400d9c007"
test_df = nyc_squad_df[nyc_squad_df["id"] == sample_id]
# Apply the same row-wise command builder used for the full dataframe.
result = test_df.apply(create_list_of_commands, axis=1)
# Show the command list of the first (expected: only) matching row.
result.values[0]

['# ID: 56cfe293234ae51400d9c007',
 'Brownstone rowhouse -s make up most of the large resident -ial district -s of NYC',
 '? what type of house -ing structure make -s up most of the large resident -ial district -s of NYC',
 '.wg type',
 '.wg house',
 '.wg structure',
 '.wg make',
 '.wg large',
 '.wg resident',
 '.wg district',
 '.wg NYC',
 '.ph Brownstone rowhouse -s make up most of the large resident -ial district -s of NYC',
 '.wg Brownstone rowhouse -s',
 '.rw']

In [8]:
from dataset_processing import remove_stopwords

# Debug cell: show what remove_stopwords returns for one formatted question.
formatted_question = '? what type of house -ing structure make -s up most of the large resident -ial district -s of NYC'
result = remove_stopwords(formatted_question)
result

'? type house -ing structure make -s large resident -ial district -s NYC'