# This notebook creates the datasets for pretraining, training and testing the ANNABELL model using the NYC dataset derived from SQuAD.

In [4]:
import os
import glob
import datetime
import shutil

from dataset_processing import AnnabellCommandGenerator, \
    DatasetPreProcessor, merge_categories, select_pretraining_data, write_pretraining_file, \
    write_testing_file, write_training_file

from config.global_config import GlobalConfig

# Timestamp suffix of the form "_YYYYMMDD_HHMMSS", captured once when this cell runs.
timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

# Single configuration object; every value bound below is read from one of its getters.
global_config = GlobalConfig()
# NOTE(review): in the visible cells only `logs_directory` is referenced again; the other
# cells call the `global_config` getters directly rather than using these names.
# Limits and sampling percentage.
percentage_of_pretraining_samples = global_config.percentage_of_pre_training_samples()
maximum_number_of_words = global_config.maximum_number_of_words()
maximum_word_length = global_config.maximum_word_length()
# Input files.
dataset_filepath = global_config.prepared_dataset_filepath()
categorised_questions_filepath = global_config.categorised_questions_filepath()
categorised_sentences_filepath = global_config.categorised_statements_filepath()
# Output command files.
pretraining_filepath = global_config.pre_training_filepath()
training_filepath = global_config.training_filepath()
testing_filepath = global_config.testing_filepath()
pretraining_validation_testing_filepath = global_config.pre_training_validation_testing_filepath()
# Directory that the log-moving cell further down uses as its destination.
logs_directory = global_config.log_archive_directory()

In [None]:
# Preprocess the prepared dataset, passing the configured maximum word count and word length.
datasetPreProcessor = DatasetPreProcessor(
    global_config.prepared_dataset_filepath(),
    global_config.maximum_number_of_words(),
    global_config.maximum_word_length(),
)
datasetPreProcessor.preprocess_data()

# Merge in the categorised questions and statements files.
nyc_squad_df = merge_categories(
    datasetPreProcessor.dataset,
    global_config.categorised_questions_filepath(),
    global_config.categorised_statements_filepath(),
)

# Select the configured percentage of rows for pretraining.
# NOTE(review): the result is bound to `nyc_dataframe`, but the later cells read
# `nyc_squad_df["is_pretraining"]` - presumably select_pretraining_data adds that column
# to the frame it is given in place; TODO confirm.
nyc_dataframe = select_pretraining_data(
    nyc_squad_df,
    global_config.percentage_of_pre_training_samples(),
)
nyc_dataframe

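### Create the pretraining and training data
Generate a list of commands for every row, split the rows into pretraining and training sets using the `is_pretraining` flag, and save the full dataframe (including the commands) to a JSON Lines file.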

In [6]:
def _commands_for_row(row):
    """Build the list of ANNABELL commands for a single dataset row."""
    generator = AnnabellCommandGenerator(
        row['id'],
        row['response_declarative_sentence_formatted'],
        row['response_question_formatted'],
        row['response_answer_formatted'],
    )
    return generator.create_list_of_commands()


# Store each row's generated commands in a new "created_commands" column.
nyc_squad_df["created_commands"] = nyc_squad_df.apply(_commands_for_row, axis=1)

# Split on the is_pretraining flag; both subsets are taken after the new column is added.
# The explicit comparisons are kept so a row whose flag is neither True nor False
# lands in neither subset.
is_pretraining_flag = nyc_squad_df["is_pretraining"]
nyc_squad_training_df = nyc_squad_df[is_pretraining_flag == False]
nyc_squad_pretraining_df = nyc_squad_df[is_pretraining_flag == True]

# Save the complete dataframe as a JSON Lines file (one record per line).
nyc_squad_df.to_json(
    global_config.prepared_dataset_with_commands_filepath(),
    orient="records",
    lines=True,
)

### Write the files containing the commands to perform pretraining, training and testing on ANNABELL

In [7]:
# One entry per command file: (writer function, output-path getter, source dataframe).
# Each getter is called immediately before its writer, in the order listed.
command_file_jobs = [
    (write_pretraining_file, global_config.pre_training_filepath, nyc_squad_pretraining_df),
    (write_training_file, global_config.training_filepath, nyc_squad_training_df),
    # The testing writer is used for both the training rows and the pretraining rows.
    (write_testing_file, global_config.testing_filepath, nyc_squad_training_df),
    (write_testing_file, global_config.pre_training_validation_testing_filepath, nyc_squad_pretraining_df),
]
for write_commands, get_output_path, commands_df in command_file_jobs:
    write_commands(get_output_path(), commands_df)

2025-11-05 08:19:10,925 - dataset_processing - INFO - Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/15/pre_training/nyc_squad_pretraining_commands.txt
2025-11-05 08:19:10,927 - dataset_processing - INFO - Number of reward lines: 42
2025-11-05 08:19:10,927 - dataset_processing - INFO - Number of commands: 773
2025-11-05 08:19:10,930 - dataset_processing - INFO - file written: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/15/training/nyc_squad_training_commands.txt
2025-11-05 08:19:10,931 - dataset_processing - INFO - Number of commands: 2130
2025-11-05 08:19:10,933 - dataset_processing - INFO - file written: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/15/testin

## Generate the command line instructions for running the experiments

In [8]:
# Copy the data files to the docker shared data directory for processing.
# Each entry is (source file path, destination directory).
docker_copy_jobs = [
    (global_config.pre_training_filepath(), global_config.docker_pre_training_directory()),
    (global_config.training_filepath(), global_config.docker_training_directory()),
    (global_config.testing_filepath(), global_config.docker_testing_directory()),
    (global_config.pre_training_validation_testing_filepath(), global_config.docker_testing_directory()),
]
for source_file, target_directory in docker_copy_jobs:
    # shutil.copy treats a destination that does not exist as a *file name*, so without
    # this the data would be written to a file called e.g. "pre-training" instead of
    # landing inside that directory. exist_ok makes the call a no-op on re-runs.
    os.makedirs(target_directory, exist_ok=True)
    shutil.copy(source_file, target_directory)

print("Files copied to docker shared data directory.")

Files copied to docker shared data directory.


In [9]:
# Create the pre-training command.
# The printed line is assembled from adjacent string pieces; every path it names sits
# under data/pre-training/.
pretraining_commands_name = global_config.pre_training_filename()
pretraining_weights_name = global_config.pre_training_filename().replace(".txt", ".dat")
print(
    'docker compose run --remove-orphans --entrypoint ./pre_train_annabell_squad_nyc.sh app '
    'data/pre-training/logfile_nyc_squad_pretraining_commands.txt '
    f'data/pre-training/{pretraining_commands_name} '
    f'data/pre-training/{pretraining_weights_name}'
)

docker compose run --remove-orphans --entrypoint ./pre_train_annabell_squad_nyc.sh app data/pre-training/logfile_nyc_squad_pretraining_commands.txt data/pre-training/nyc_squad_pretraining_commands.txt data/pre-training/nyc_squad_pretraining_commands.dat


In [None]:
# Copy the pre-trained weights (.dat) from the docker "pre-training" sub-directory up into
# the docker data directory itself.
weights_filename = global_config.pre_training_filename().replace(".txt", ".dat")
source_path = os.path.join(global_config.get_docker_data_directory(), "pre-training", weights_filename)
destination_path = os.path.join(global_config.get_docker_data_directory(), weights_filename)

try:
    shutil.copy(source_path, destination_path)
    print("copied: " + source_path + " to: " + destination_path)
except FileNotFoundError:
    print(f"Error: Source file not found at {source_path}")
except Exception as e:
    # Best effort: report the problem and carry on to the log files.
    print(f"An error occurred: {e}")

# Move the pre-training logfile(s) out of the docker pre-training directory and into the
# log archive directory (`logs_directory`, read from global_config in the first cell).
source_pattern = os.path.join(global_config.docker_pre_training_directory(),
                              "logfile_nyc_squad_pretraining_commands*")
destination_dir = logs_directory

log_files = glob.glob(source_pattern)
if not log_files:
    print(f"No log files found matching pattern: {source_pattern}")

for file_path in log_files:
    try:
        # shutil.move renames the file *to* destination_dir when that directory does not
        # exist, so make sure it is a real directory first (no-op when already present).
        os.makedirs(destination_dir, exist_ok=True)
        shutil.move(file_path, destination_dir)
        print(f"moved: {file_path} to: {destination_dir}")
    except FileNotFoundError:
        print(f"Error: Log file not found at {file_path}")
    except Exception as e:
        # Includes shutil.Error, raised when a same-named file is already archived.
        print(f"An error occurred while moving {file_path}: {e}")

In [None]:
# Print the command that runs the pretraining validation testing.
validation_command_parts = [
    'docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app',
    'data/testing/logfile_nyc_squad_pretraining_validation_testing_commands.txt',
    # Weights file: the pre-training filename with its .txt extension swapped for .dat.
    'data/pre-training/' + global_config.pre_training_filename().replace(".txt", ".dat"),
    'data/testing/' + global_config.pre_training_validation_testing_filename(),
]
print(' '.join(validation_command_parts))

In [None]:
# Copy the pretraining-validation testing logs from the docker testing directory to the
# log archive directory.
source_pattern = os.path.join(global_config.docker_testing_directory(),
                              "logfile_nyc_squad_pretraining_validation_testing_commands*")
destination_dir = global_config.log_archive_directory()

# Find all files matching the pattern.
log_files = glob.glob(source_pattern)
if not log_files:
    # Say so explicitly; otherwise the cell finishes silently and looks like it succeeded.
    print(f"No log files found matching pattern: {source_pattern}")

# Copy each found file to the destination directory.
for file_path in log_files:
    try:
        # shutil.copy treats a non-existent destination as a file name, so every log would
        # be written over the same path; make sure the directory exists first.
        os.makedirs(destination_dir, exist_ok=True)
        shutil.copy(file_path, destination_dir)
        print(f"copied: {file_path} to: {destination_dir}")
    except FileNotFoundError:
        print(f"Error: Source file not found at {file_path}")
    except Exception as e:
        # Best effort: report and continue with the remaining files.
        print(f"An error occurred: {e}")

### Perform the testing using the "test annabell" notebook

In [28]:
# Print the command to run the training.
training_command_parts = [
    'docker compose run --remove-orphans --entrypoint ./train_annabell_squad.sh app',
    'data/training/logfile_nyc_squad_training_commands.txt',
    # Pre-training weights file (.txt extension swapped for .dat).
    'data/pre-training/' + global_config.pre_training_filename().replace(".txt", ".dat"),
    # Training weights file, then the training commands file.
    'data/training/' + global_config.training_filename().replace(".txt", ".dat"),
    'data/training/' + global_config.training_filename(),
]
print(' '.join(training_command_parts))

docker compose run --remove-orphans --entrypoint ./train_annabell_squad.sh app data/training/logfile_nyc_squad_training_commands.txt data/pre-training/nyc_squad_pretraining_commands_20251022_085512.dat data/training/nyc_squad_training_commands_20251022_085512.dat data/training/nyc_squad_training_commands_20251022_085512.txt


In [None]:
# Copy the training weights (.dat) from the docker training directory back to the
# experiment's training directory.
training_weights_filename = global_config.training_filename().replace(".txt", ".dat")
source_path = os.path.join(global_config.docker_training_directory(), training_weights_filename)
destination_path = os.path.join(global_config.training_directory(), training_weights_filename)
try:
    # A missing destination directory also raises FileNotFoundError, which would be
    # reported below as a missing *source*; creating it first keeps that message accurate.
    os.makedirs(os.path.dirname(destination_path), exist_ok=True)
    shutil.copy(source_path, destination_path)
    print("copied: " + source_path + " to: " + destination_path)
except FileNotFoundError:
    print(f"Error: Source file not found at {source_path}")
except Exception as e:
    # Best effort: report the problem and carry on to the log files.
    print(f"An error occurred: {e}")

# Move the training logfile(s) from the docker training directory to the log archive
# directory.
source_pattern = os.path.join(global_config.docker_training_directory(),
                              "logfile_nyc_squad_training_commands*")
destination_dir = global_config.log_archive_directory()

# Find all files matching the pattern.
log_files = glob.glob(source_pattern)
if not log_files:
    # Say so explicitly; otherwise the cell finishes silently and looks like it succeeded.
    print(f"No log files found matching pattern: {source_pattern}")

# Move each found file to the destination directory.
for file_path in log_files:
    try:
        # shutil.move renames the file *to* destination_dir when that directory does not
        # exist, so make sure it is a real directory first (no-op when already present).
        os.makedirs(destination_dir, exist_ok=True)
        shutil.move(file_path, destination_dir)
        print(f"moved: {file_path} to: {destination_dir}")
    except FileNotFoundError:
        print(f"Error: Log file not found at {file_path}")
    except Exception as e:
        # Includes shutil.Error, raised when a same-named file is already archived.
        print(f"An error occurred while moving {file_path}: {e}")

In [None]:
# Print the command to run the testing, preceded by a "#run the testing" marker line.
testing_command_parts = [
    'docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app',
    'data/testing/logfile_nyc_squad_testing_commands.txt',
    # Training weights file (.txt extension swapped for .dat).
    'data/training/' + global_config.training_filename().replace(".txt", ".dat"),
    'data/testing/' + global_config.testing_filename(),
]
print("#run the testing")
print(' '.join(testing_command_parts))

In [None]:
# Copy the testing logs from the docker testing directory back to the log archive
# directory.
source_pattern = os.path.join(global_config.docker_testing_directory(),
                              "logfile_nyc_squad_testing_commands*")
destination_dir = global_config.log_archive_directory()

# Find all files matching the pattern.
log_files = glob.glob(source_pattern)
if not log_files:
    # Say so explicitly; otherwise the cell finishes silently and looks like it succeeded.
    print(f"No log files found matching pattern: {source_pattern}")

# Copy each found file to the destination directory.
for file_path in log_files:
    try:
        # shutil.copy treats a non-existent destination as a file name, so every log would
        # be written over the same path; make sure the directory exists first.
        os.makedirs(destination_dir, exist_ok=True)
        shutil.copy(file_path, destination_dir)
        print(f"copied: {file_path} to: {destination_dir}")
    except FileNotFoundError:
        print(f"Error: Source file not found at {file_path}")
    except Exception as e:
        # Best effort: report and continue with the remaining files.
        print(f"An error occurred: {e}")

### Run the testing notebook to evaluate the results