# This notebook creates the datasets for pretraining, training and testing the ANNABELL model using the NYC dataset derived from SQuAD.

In [1]:
import os
import glob
import datetime
import shutil

from dataset_processing import AnnabellCommandGenerator, \
    DatasetPreProcessor, merge_categories, select_pretraining_data, write_pretraining_file, \
    write_testing_file, write_training_file

from config.global_config import GlobalConfig

# Timestamp suffix of the form "_YYYYMMDD_HHMMSS", taken from the local clock when this cell runs.
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

# Single configuration object for the experiment. The values below are read from it once and bound
# to notebook-level names so that later cells can use them directly.
global_config = GlobalConfig()
# Preprocessing parameters (exact semantics are defined by GlobalConfig — TODO confirm units/ranges).
percentage_of_pretraining_samples = global_config.percentage_of_pre_training_samples()
maximum_number_of_words = global_config.maximum_number_of_words()
maximum_word_length = global_config.maximum_word_length()
# Input files: the prepared dataset and the categorised questions / statements.
dataset_filepath = global_config.prepared_dataset_filepath()
categorised_questions_filepath = global_config.categorised_questions_filepath()
categorised_sentences_filepath = global_config.categorised_statements_filepath()
# Output files written later in the notebook by the write_*_file helpers.
pretraining_filepath = global_config.pre_training_filepath()
training_filepath = global_config.training_filepath()
testing_filepath = global_config.testing_filepath()
pretraining_validation_testing_filepath = global_config.pretraining_validation_testing_filepath()
# Directory that the log files produced by the docker runs are moved/copied into.
logs_directory = global_config.log_archive_directory()

2025-11-05 06:44:27,477 - root - INFO - Logging initialized. Log file: /home/chris/logs/cognitive_language_model_logs/run_20251105_064427.log


In [2]:
# Echo the prepared-dataset path read from the configuration so the input file is visible in the output.
dataset_filepath

'/home/chris/gdrive/work/annabell/experiments/data/response_formatted_20250924_174653.jsonl'

In [3]:
# Build the preprocessor from the configured dataset path and word limits, then run it.
datasetPreProcessor = DatasetPreProcessor(
    global_config.prepared_dataset_filepath(),
    global_config.maximum_number_of_words(),
    global_config.maximum_word_length(),
)
datasetPreProcessor.preprocess_data()

# Start from the preprocessed frame, then merge in the question / statement categories.
nyc_squad_df = datasetPreProcessor.dataset
nyc_squad_df = merge_categories(
    nyc_squad_df,
    global_config.categorised_questions_filepath(),
    global_config.categorised_statements_filepath(),
)

# Mark the configured percentage of samples for pretraining.
# NOTE(review): the result is bound to `nyc_dataframe`, but later cells read
# `nyc_squad_df["is_pretraining"]` — presumably select_pretraining_data adds the column to the
# frame it is given; verify against dataset_processing.
nyc_dataframe = select_pretraining_data(
    nyc_squad_df,
    global_config.percentage_of_pre_training_samples(),
)
nyc_dataframe

  categorised_sentences_df = pd.read_json(categorised_sentences_filepath, lines=True)


ValueError: Expected object or value

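### Create the pretraining data
Generate the list of ANNABELL commands for every row of the dataset, split the rows into a pretraining subset (`is_pretraining` is true) and a training subset (`is_pretraining` is false), and save the full dataframe, including the generated commands, to a JSON Lines file.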

In [3]:
def _commands_for_row(row):
    """Return the list of ANNABELL commands generated from one dataset row.

    The row supplies the sample id together with its formatted declarative
    sentence, question and answer.
    """
    generator = AnnabellCommandGenerator(
        row['id'],
        row['response_declarative_sentence_formatted'],
        row['response_question_formatted'],
        row['response_answer_formatted'],
    )
    return generator.create_list_of_commands()


# Store the generated command list for every row in a new "created_commands" column.
nyc_squad_df["created_commands"] = nyc_squad_df.apply(_commands_for_row, axis=1)

# Split on the "is_pretraining" flag. The element-wise comparisons against False / True are kept
# (rather than a boolean negation) so rows whose flag is neither value fall into neither subset.
is_pretraining_flag = nyc_squad_df["is_pretraining"]
nyc_squad_training_df = nyc_squad_df[is_pretraining_flag.eq(False)]
nyc_squad_pretraining_df = nyc_squad_df[is_pretraining_flag.eq(True)]

# Persist the complete frame (all rows, including the new column) as JSON Lines.
nyc_squad_df.to_json(
    global_config.prepared_dataset_with_commands_filepath(),
    orient="records",
    lines=True,
)

### Write the files containing the commands to perform pretraining, training and testing on ANNABELL

In [4]:
# Command files to produce, in order: (writer, output path, rows to write).
# The training rows feed both the training file and the testing file.
for write_commands, output_path, rows in (
    (write_pretraining_file, pretraining_filepath, nyc_squad_pretraining_df),
    (write_training_file, training_filepath, nyc_squad_training_df),
    (write_testing_file, testing_filepath, nyc_squad_training_df),
):
    write_commands(output_path, rows)

# The pretraining rows are also written as a test file, used to validate the pretraining.
# Left as the cell's trailing expression so that any returned value is still displayed.
write_testing_file(pretraining_validation_testing_filepath, nyc_squad_pretraining_df)

Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/15/pre_training/nyc_squad_pretraining_commands_20251026_091610.txt
Number of reward lines: 42
Number of commands: 773
#id: 56ce32e7aab44d1400b88551
there are 469 station -s operate -d by the New-York-City-Subway


? how many station -s are operate -d by the
New-York-City-Subway
.sctx ? how many station -s are operate -d by the
.wg many
.wg station
.wg operate
.sctx New-York-City-Subway
.wg New-York-City-Subway
.ph there are 469 station -s operate -d by the New-York-City-Subway
.wg 469
.rw


#id: 56ce345caab44d1400b88584
Giovanni da Verrazzano call -ed the area Nouvelle-Angouleme when he
stake -d a claim on it
file written: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/15/training/nyc_squad_training_commands_20251026_091610

## Generate the command line instructions for running the experiments

In [5]:
# Stage each generated command file in the matching directory shared with the docker container.
# The directory getters are called inside the loop, immediately before each copy.
for command_file, resolve_docker_directory in (
    (pretraining_filepath, global_config.docker_pre_training_directory),
    (training_filepath, global_config.docker_training_directory),
    (testing_filepath, global_config.docker_testing_directory),
    (pretraining_validation_testing_filepath, global_config.docker_testing_directory),
):
    shutil.copy(command_file, resolve_docker_directory())

print("Files copied to docker shared data directory.")

Files copied to docker shared data directory.


In [6]:
# Print the docker command for the pre-training run. Its three arguments are the log file,
# the pre-training commands file, and the weights (.dat) file named after the commands file.
pretraining_commands_name = global_config.pre_training_filename()
pretraining_weights_name = global_config.pre_training_filename().replace(".txt", ".dat")
print(
    "docker compose run --remove-orphans --entrypoint ./pre_train_annabell_squad_nyc.sh app "
    "data/pre-training/logfile_nyc_squad_pretraining_commands.txt "
    f"data/pre-training/{pretraining_commands_name} "
    f"data/pre-training/{pretraining_weights_name}"
)

docker compose run --remove-orphans --entrypoint ./pre_train_annabell_squad_nyc.sh app data/pre-training/logfile_nyc_squad_pretraining_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251026_091610.txt data/pre-training/nyc_squad_pretraining_commands_20251026_091610.dat


In [21]:
# Copy the weights (.dat) file out of the docker "pre-training" folder.
# NOTE(review): the destination is the root of the docker data directory, whereas the output
# recorded under this cell shows the weights landing in the experiment's pre_training directory —
# confirm which target is intended.
weights_filename = global_config.pre_training_filename().replace(".txt", ".dat")
source_path = os.path.join(global_config.get_docker_data_directory(), "pre-training", weights_filename)
destination_path = os.path.join(global_config.get_docker_data_directory(), weights_filename)

try:
    shutil.copy(source_path, destination_path)
except FileNotFoundError:
    print(f"Error: Source file not found at {source_path}")
except Exception as copy_error:
    # Best effort: report the problem and carry on so the log files are still collected.
    print(f"An error occurred: {copy_error}")
else:
    print(f"copied: {source_path} to: {destination_path}")

# Move the pre-training log files out of the docker folder and into the logs directory.
source_pattern = os.path.join(global_config.get_docker_data_directory(), "pre-training",
                              "logfile_nyc_squad_pretraining_commands*")
destination_dir = logs_directory

log_files = glob.glob(source_pattern)
if log_files:
    for file_path in log_files:
        try:
            shutil.move(file_path, destination_dir)
        except FileNotFoundError:
            print(f"Error: Log file not found at {file_path}")
        except Exception as move_error:
            print(f"An error occurred while moving {file_path}: {move_error}")
        else:
            print(f"moved: {file_path} to: {destination_dir}")
else:
    print(f"No log files found matching pattern: {source_pattern}")

copied: /Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models/docker/shared_data/pre-training/nyc_squad_pretraining_commands_20251022_085512.dat to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/pre_training/nyc_squad_pretraining_commands_20251022_085512.dat
moved: /Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models/docker/shared_data/pre-training/logfile_nyc_squad_pretraining_commands_2025-10-22_06-56-50.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/logs


In [22]:
# Print the docker command that tests the pre-trained weights against the pretraining
# validation test file. Arguments: log file, pre-trained weights (.dat), test commands file.
pretrained_weights_name = global_config.pre_training_filename().replace(".txt", ".dat")
validation_commands_name = global_config.pre_training_validation_testing_filename()
print(
    "docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app "
    "data/testing/logfile_nyc_squad_pretraining_validation_testing_commands.txt "
    f"data/pre-training/{pretrained_weights_name} "
    f"data/testing/{validation_commands_name}"
)

docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app data/testing/logfile_nyc_squad_pretraining_validation_testing_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251022_085512.dat data/testing/nyc_squad_pretraining_validation_testing_commands_20251022_085512.txt


In [23]:
# Copy the pretraining-validation test logs from the docker "testing" folder into the logs
# directory. The originals are left in place (copy, not move).
source_pattern = os.path.join(
    global_config.get_docker_data_directory(),
    "testing",
    "logfile_nyc_squad_pretraining_validation_testing_commands*",
)
destination_dir = logs_directory

# Every file matching the pattern is copied; a failure on one file does not stop the others.
log_files = glob.glob(source_pattern)
for file_path in log_files:
    try:
        shutil.copy(file_path, destination_dir)
    except FileNotFoundError:
        print(f"Error: Source file not found at {file_path}")
    except Exception as copy_error:
        print(f"An error occurred: {copy_error}")
    else:
        print(f"copied: {file_path} to: {destination_dir}")

copied: /Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models/docker/shared_data/testing/logfile_nyc_squad_pretraining_validation_testing_commands_2025-10-22_16-38-14.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/logs


### perform the testing using the "test annabell" notebook

In [28]:
# Print the docker command for the training run. Arguments: log file, pre-trained weights
# (.dat), training weights (.dat) named after the training commands file, and that commands file.
pretrained_weights_name = global_config.pre_training_filename().replace(".txt", ".dat")
training_weights_name = global_config.training_filename().replace(".txt", ".dat")
training_commands_name = global_config.training_filename()
print(
    "docker compose run --remove-orphans --entrypoint ./train_annabell_squad.sh app "
    "data/training/logfile_nyc_squad_training_commands.txt "
    f"data/pre-training/{pretrained_weights_name} "
    f"data/training/{training_weights_name} "
    f"data/training/{training_commands_name}"
)

docker compose run --remove-orphans --entrypoint ./train_annabell_squad.sh app data/training/logfile_nyc_squad_training_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251022_085512.dat data/training/nyc_squad_training_commands_20251022_085512.dat data/training/nyc_squad_training_commands_20251022_085512.txt


In [29]:
# Copy the training weights (.dat) from the docker "training" folder to the training directory.
source_path = os.path.join(
    global_config.get_docker_data_directory(),
    "training",
    global_config.training_filename().replace(".txt", ".dat"),
)
destination_path = os.path.join(
    global_config.training_directory(),
    global_config.training_filename().replace(".txt", ".dat"),
)
try:
    shutil.copy(source_path, destination_path)
except FileNotFoundError:
    print(f"Error: Source file not found at {source_path}")
except Exception as copy_error:
    # Best effort: report the problem and carry on so the log files are still collected.
    print(f"An error occurred: {copy_error}")
else:
    print(f"copied: {source_path} to: {destination_path}")

# Move (not copy) the training log files out of the docker folder and into the logs directory.
source_pattern = os.path.join(
    global_config.get_docker_data_directory(),
    "training",
    "logfile_nyc_squad_training_commands*",
)
destination_dir = logs_directory
log_files = glob.glob(source_pattern)
for file_path in log_files:
    try:
        shutil.move(file_path, destination_dir)
    except FileNotFoundError:
        print(f"Error: Log file not found at {file_path}")
    except Exception as move_error:
        print(f"An error occurred while moving {file_path}: {move_error}")
    else:
        print(f"moved: {file_path} to: {destination_dir}")

copied: /Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models/docker/shared_data/training/nyc_squad_training_commands_20251022_085512.dat to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/training/nyc_squad_training_commands_20251022_085512.dat
moved: /Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models/docker/shared_data/training/logfile_nyc_squad_training_commands_2025-10-23_05-46-23.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/logs


In [30]:
# Print the docker command that runs the test set against the trained weights.
# Arguments: log file, training weights (.dat), testing commands file.
# The weights filename comes from global_config.training_filename(), as in the training-command
# and weight-copy cells. A bare `training_filename` variable is not defined by any cell in this
# notebook, so the previous form raised NameError on a fresh kernel.
print("#run the testing")
print(
    f'docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app data/testing/logfile_nyc_squad_testing_commands.txt data/training/{global_config.training_filename().replace(".txt", ".dat")} data/testing/{global_config.testing_filename()}')

#run the testing
docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app data/testing/logfile_nyc_squad_testing_commands.txt data/training/nyc_squad_training_commands_20251022_085512.dat data/testing/nyc_squad_testing_commands_20251022_085512.txt


In [31]:
# Copy the test-run logs from the docker "testing" folder into the logs directory.
# The originals are left in place (copy, not move).
source_pattern = os.path.join(
    global_config.get_docker_data_directory(),
    "testing",
    "logfile_nyc_squad_testing_commands*",
)
destination_dir = logs_directory

# Every file matching the pattern is copied; a failure on one file does not stop the others.
log_files = glob.glob(source_pattern)
for file_path in log_files:
    try:
        shutil.copy(file_path, destination_dir)
    except FileNotFoundError:
        print(f"Error: Source file not found at {file_path}")
    except Exception as copy_error:
        print(f"An error occurred: {copy_error}")
    else:
        print(f"copied: {file_path} to: {destination_dir}")

copied: /Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models/docker/shared_data/testing/logfile_nyc_squad_testing_commands_2025-10-23_05-48-29.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/logs


### Run the testing notebook to evaluate the results