# This notebook tests the NYC pretraining

## Prerequisite steps
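1. Pretraining has already been completed for the NYC pretraining dataset, producing the `pretraining_nyc_squad.dat` file that is passed to the testing script in the Processing section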

In [5]:
import os
import shutil

## Create a testing file

In [6]:
# Read the pretraining data file into a list of raw lines. Rows that begin
# with "?" (the questions) are extracted from this list in a later cell.
# The experiment directory defaults to the original location but can be
# overridden with the ANNABELL_EXPERIMENT_DIR environment variable, so the
# notebook is not tied to one machine's absolute path.
base_directory = os.environ.get(
    "ANNABELL_EXPERIMENT_DIR",
    "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/7",
)
pretraining_data_directory = os.path.join(base_directory, "pre_training")
pretraining_data_filename = "pre_training_nyc_samples_updated.txt"
pretraining_data_filepath = os.path.join(pretraining_data_directory, pretraining_data_filename)
# Reset first so a failed read cannot leave lines from an earlier run in the kernel.
pretraining_data_lines = []
# Explicit encoding: without it open() falls back to the platform's locale encoding.
with open(pretraining_data_filepath, 'r', encoding="utf-8") as pretraining_data_file:
    pretraining_data_lines = pretraining_data_file.readlines()
print("length of pretraining data lines: " + str(len(pretraining_data_lines)))
# Show the first few lines as a sanity check.
pretraining_data_lines[:5]

length of pretraining data lines: 343


['#statements\n',
 '#\n',
 '#category: ATTRIBUTE_IS_VALUE\n',
 'the number of leader terrorists of Al Quada directly involved in the 911 attacks that day was ten\n',
 'the area of land in New York City is 304 point 8 square miles\n']

In [7]:
# Preview a bounded window of the lines after the first ten; an open-ended
# slice would print every remaining line of the file into the cell output.
pretraining_data_lines[10:30]

['there are 120 school and universities in NYC\n',
 'there are more than 200 newspaper offices located in New York\n',
 'there are 24000 restaurants in New York\n',
 '#\n',
 '#category: PASSIVE_CONSTRUCTION\n',
 'the United Nations is headquartered in New York\n',
 'New York City is located in the Northeastern region of the United States\n',
 'the sculptures of eagles are located on the 61st floor of the Chrysler Building\n',
 'the Two highest advertising agencies in the world located in NYC are called Omnicom Group and Interpublic Group\n',
 'Manhattan is located in Silicon Alley\n',
 'the Second Department of the Supreme Court is located in Brooklyn\n',
 '#\n',
 '#category: PREPOSITIONAL_PHRASE_FRONTING\n',
 'after the building of new courthouse African burial grounds were discovered in the 1990s\n',
 'during the Draft Riots of 1863 approximately 120 people died\n',
 'in the early 1920s London was the second most highly populated city in the world\n',
 'in January New York City exper

In [8]:
# Collect the question rows: keep only the lines whose first character is "?",
# then trim surrounding whitespace (including the trailing newline) from each.
pretraining_question_lines = list(
    map(str.strip, filter(lambda row: row.startswith("?"), pretraining_data_lines))
)
print("length of pretraining question lines: ", len(pretraining_question_lines), sep="")
# Show the first few questions as a sanity check.
pretraining_question_lines[:5]

length of pretraining question lines: 33


['? how many school and universities are in nyc',
 '? how many leader terrorists of Al Quada were involved with the 911 attacks directly that day',
 '? how many square miles are land in nyc',
 '? the mean snowfall between 1981 and 2010 in nyc has been how many inches',
 '? how many Hispanic people live in the New York metropolitan area']

In [9]:
# Write a testing file containing every question, each followed by a ".x" line.
test_input_directory = os.path.join(base_directory, "testing")
test_input_filename = "test_nyc_pretrain_questions.txt"
test_input_filepath = os.path.join(test_input_directory, test_input_filename)
# Create the output directory on first use; open(..., 'w') does not create missing parents.
os.makedirs(test_input_directory, exist_ok=True)
# Explicit encoding so the file contents do not depend on the platform locale.
with open(test_input_filepath, 'w', encoding="utf-8") as test_input_file:
    for line in pretraining_question_lines:
        test_input_file.write(line + "\n")
        test_input_file.write(".x\n")  # add the .x line after each question
print("test input file written to: " + test_input_filepath)
# Read the file back to check it looks correct.
with open(test_input_filepath, 'r', encoding="utf-8") as test_input_file:
    test_input_lines = test_input_file.readlines()
# Each question must be paired with exactly one ".x" line.
assert len(test_input_lines) == 2 * len(pretraining_question_lines), "unexpected number of lines in test input file"
print("length of test input lines: " + str(len(test_input_lines)))
test_input_lines[:10]

test input file written to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/7/testing/test_nyc_pretrain_questions.txt
length of test input lines: 66


['? how many school and universities are in nyc\n',
 '.x\n',
 '? how many leader terrorists of Al Quada were involved with the 911 attacks directly that day\n',
 '.x\n',
 '? how many square miles are land in nyc\n',
 '.x\n',
 '? the mean snowfall between 1981 and 2010 in nyc has been how many inches\n',
 '.x\n',
 '? how many Hispanic people live in the New York metropolitan area\n',
 '.x\n']

In [10]:
# Copy the test input file into the "testing" folder of the Docker shared data directory.
# NOTE(review): presumably this directory is mounted into the container as data/ — confirm against the compose file.
# The location defaults to the original path but can be overridden with the
# ANNABELL_DOCKER_DATA_DIR environment variable, so the notebook is not tied
# to one machine's absolute path.
docker_data_directory = os.environ.get(
    "ANNABELL_DOCKER_DATA_DIR",
    "/Users/chris/PycharmProjects/dataset/docker/shared_data",
)
docker_test_input_directory = os.path.join(docker_data_directory, "testing")
# shutil.copy needs the destination directory to exist.
os.makedirs(docker_test_input_directory, exist_ok=True)
docker_test_input_filepath = os.path.join(docker_test_input_directory, test_input_filename)
shutil.copy(test_input_filepath, docker_test_input_filepath)
print("test input file copied to: " + docker_test_input_filepath)

test input file copied to: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/test_nyc_pretrain_questions.txt


## Processing
1. Run the testing file using the testing script.

docker compose run --remove-orphans --entrypoint ./test_annabell_squad.sh app \
  data/testing/test_nyc_pretrain_log.txt \
  data/pre-training/pretraining_nyc_squad.dat \
  data/testing/test_nyc_pretrain_questions.txt

2. Copy the log file(s) written by the testing script back from the Docker data directory to the experiment's `logs` directory

In [15]:
# Copy the test log files back from the Docker data directory into the
# experiment's "logs" directory.
test_log_directory = os.path.join(base_directory, "logs")
docker_test_log_directory = os.path.join(docker_data_directory, "testing")
# shutil.copy needs the destination directory to exist.
os.makedirs(test_log_directory, exist_ok=True)
# Only files named test_nyc_pretrain_log_*.txt are copied.
# NOTE(review): presumably the test script appends a timestamp to the log name
# given on its command line — confirm the prefix matches what it really writes.
# sorted() keeps the copy order and printed output deterministic; os.listdir order is arbitrary.
for filename in sorted(os.listdir(docker_test_log_directory)):
    if filename.startswith("test_nyc_pretrain_log_") and filename.endswith(".txt"):
        docker_test_log_filepath = os.path.join(docker_test_log_directory, filename)
        test_log_filepath = os.path.join(test_log_directory, filename)
        shutil.copy(docker_test_log_filepath, test_log_filepath)
        print("test log file copied from: " + docker_test_log_filepath + " to: " + test_log_filepath)

test log file copied from: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/test_nyc_log_2025-09-13_15-37-50.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/7/logs/test_nyc_log_2025-09-13_15-37-50.txt
test log file copied from: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/test_nyc_log_2025-09-14_16-14-53.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/7/logs/test_nyc_log_2025-09-14_16-14-53.txt
test log file copied from: /Users/chris/PycharmProjects/dataset/docker/shared_data/testing/test_nyc_log_2025-09-19_06-51-33.txt to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/7/logs/test_nyc_log_2025-09-19_06-51-33.txt


3. Use test_annabell.ipynb to analyse the results in the log file