# This notebook is used to test the ANNABELL model on the SQuAD dataset.


In [15]:
from dataset_processing import any_word_match, embedding_for_sentence, \
    cosine_distance, ids_questions_answers_from_log_file
import os
import platform
import datetime
import pandas as pd
from tqdm import tqdm

In [16]:
# Number of the experiment whose files are analysed by this notebook.
experiment_number = 14

# Default project location for each supported operating system, keyed by the
# value returned by platform.system().
default_base_directories = {
    'Windows': "G:\\My Drive\\Shared with Julia\\Education\\Kent University\\PhD\\work\\annabell",
    'Linux': "/home/chris/gdrive/work/annabell",
    'Darwin': "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/",  #macOS
}

operating_system = platform.system()
# Setting the ANNABELL_BASE_DIR environment variable overrides the defaults
# above, so the notebook can run on another machine without being edited.
base_directory = os.environ.get("ANNABELL_BASE_DIR") or default_base_directories.get(operating_system)
if base_directory is None:
    raise RuntimeError(f"unsupported OS: {operating_system!r}")

#These parameters need to be set to match the files being used
referenced_files_timestamp = "20251022_085512"
test_log_filename = "logfile_nyc_squad_testing_commands_2025-10-23_05-48-29.txt"
#test_input_filename = "nyc_squad_pretraining_validation_testing_commands_" + referenced_files_timestamp + ".txt"
test_input_filename = "nyc_squad_testing_commands_" + referenced_files_timestamp + ".txt"

# Suffix that makes the output files of this run unique.
timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

# Directories shared by all experiments, and those belonging to this experiment.
experiments_dir = os.path.join(base_directory, "experiments")
experiment_dir = os.path.join(experiments_dir, str(experiment_number))
test_input_dir = os.path.join(experiment_dir, "testing")
test_log_dir = os.path.join(experiment_dir, "logs")
# Each path component is passed separately so the platform separator is used throughout.
data_dir = os.path.join(experiments_dir, "data")
dataframe_directory = os.path.join(experiments_dir, "dataframes")
test_results_dir = os.path.join(experiment_dir, "results")
# exist_ok avoids the check-then-create race and makes re-running the cell safe.
os.makedirs(test_results_dir, exist_ok=True)
pretraining_dir = os.path.join(experiment_dir, "pre_training")

dataset_filename = "nyc_squad_with_pretraining_commands_" + referenced_files_timestamp + ".jsonl"
pretraining_filename = "nyc_squad_pretraining_commands_" + referenced_files_timestamp + ".txt"

# Input files read by the rest of the notebook.
test_log_filepath = os.path.join(test_log_dir, test_log_filename)
test_input_filepath = os.path.join(test_input_dir, test_input_filename)
dataset_filepath = os.path.join(dataframe_directory, dataset_filename)
pretraining_filepath = os.path.join(pretraining_dir, pretraining_filename)

# Output file for the per-answer frequency summary.
test_answer_summary_filename = "test_nyc_answer_summary" + timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, test_answer_summary_filename)

# Parse the test log into (id, question, answer) tuples.
ids_questions_answers = ids_questions_answers_from_log_file(test_log_filepath)

# Replace missing answers with an explicit placeholder so that they are counted
# as a distinct outcome instead of being dropped as nulls later on.
no_answer_placeholder = "NO ANSWER GIVEN"
for index, each_tuple in enumerate(ids_questions_answers):
    answer = each_tuple[-1]
    # identity test for None (PEP 8); checked first so "" is only compared to real values
    if answer is None or answer == "":
        ids_questions_answers[index] = (each_tuple[0], each_tuple[1], no_answer_placeholder)

print("length of log file questions and answers: " + str(len(ids_questions_answers)))

# Load both command files up front; the raw lines are kept in case they are
# needed again further down the notebook.
with open(test_input_filepath, 'r') as test_input_file:
    test_input_lines = test_input_file.readlines()
with open(pretraining_filepath, 'r') as pretraining_file:
    pretraining_lines = pretraining_file.readlines()

# A test sample is introduced by a line starting with "#id:".
total_number_of_test_samples = sum(
    1 for input_line in test_input_lines if input_line.startswith("#id:"))
print(f"total number of test samples in input file: {total_number_of_test_samples}")

# A pretraining sample is introduced by a line starting with "# ID:".
total_number_of_pretraining_samples = sum(
    1 for command_line in pretraining_lines if command_line.startswith("# ID:"))
print(f"total number of pretraining samples in input file: {total_number_of_pretraining_samples}")
# Load the dataset (one JSON record per line) that the test commands were built from.
response_formatted_df = pd.read_json(dataset_filepath, lines=True)

#add the test questions to the dataframe
# A set gives constant-time membership tests instead of rescanning the id
# column for every logged answer.
known_ids = set(response_formatted_df["id"])
answers_by_id = {}
questions_not_found = []
for the_id, question, answer in ids_questions_answers:
    if the_id in known_ids:
        # a later log entry for the same id replaces an earlier one
        answers_by_id[the_id] = answer
    else:
        questions_not_found.append(question)
# map() always creates the column (NaN where there is no logged answer), so the
# dropna below cannot fail with a KeyError when no id matches.
response_formatted_df["test_answer"] = response_formatted_df["id"].map(answers_by_id)
print(f"number of test samples not found in training data: {len(questions_not_found)}")
print("test samples not found in training data: " + str(questions_not_found[:5]) + " ...")

#drop any rows that are not in the test samples
# reset_index() keeps the original row position in an "index" column.
response_formatted_df = response_formatted_df.dropna(subset=["test_answer"]).reset_index()
response_formatted_df

length of log file questions and answers: 518
total number of test samples in input file: 518
total number of pretraining samples in input file: 211
number of test samples not found in training data: 0
test samples not found in training data: [] ...


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56ce304daab44d1400b8850e, the city in t...",New-York-City-Department of Education
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",New-York
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction,"[# ID: 56ce304daab44d1400b88510, New-York has ...",capture
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce304daab44d1400b88511, New-York is t...",Exploitation number of updates >= 4000
4,6,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New-York-City,five,New-York-City is comprise -d of five borough -s,False,Quantitative Wh-Question,Passive Construction,"[# ID: 56ce3124aab44d1400b8852a, New-York-City...",Exploitation number of updates >= 4000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,723,56d1204617492d1400aab9fc,What federal district court has jurisdiction o...,What federal district court has jurisdiction o...,the District Court for the Eastern District of...,The District Court for the Eastern District of...,the District Court for the Eastern District of...,? what federal district court has jurisdiction...,the District-Court for the Eastern-District of...,the District-Court for the Eastern-District of...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1204617492d1400aab9fc, the District-...",-s
514,724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56d1204617492d1400aab9fd, the New-York-...",NO ANSWER GIVEN
515,725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba1f, New-York-City...",Exploitation number of updates >= 4000
516,727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba21, New-York-City...",1625


In [17]:
#generate embeddings for the test answer and the response_answer_formatted columns and compare them using cosine distance
# Each job is (progress bar label, column holding the text, column receiving the embedding).
embedding_jobs = [
    ("Generating test answer embeddings",
     "test_answer", "test_answer_embedding"),
    ("Generating response answer embeddings",
     "response_answer_formatted", "response_answer_formatted_embedding"),
]
for progress_label, text_column, embedding_column in embedding_jobs:
    # tqdm.pandas() has to be called again to change the progress bar description
    tqdm.pandas(desc=progress_label)
    # null cells get no embedding
    response_formatted_df[embedding_column] = response_formatted_df[text_column].progress_apply(
        lambda text: None if pd.isnull(text) else embedding_for_sentence(text))

Generating test answer embeddings: 100%|██████████| 518/518 [00:45<00:00, 11.44it/s]
Generating response answer embeddings: 100%|██████████| 518/518 [00:46<00:00, 11.03it/s]


In [18]:
# cosine_distance is applied row by row (axis=1); one value per row is stored
# alongside the two embedding columns.
row_cosine_distances = response_formatted_df.apply(cosine_distance, axis=1)
response_formatted_df["test_answer_cosine_distance"] = row_cosine_distances
response_formatted_df

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56ce304daab44d1400b8850e, the city in t...",New-York-City-Department of Education,"[-0.1762507, 0.026600152, 0.057974998, 0.01327...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.080072
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",New-York,"[-0.1972834, 0.00014738683, 0.026876723, 0.000...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.000000
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction,"[# ID: 56ce304daab44d1400b88510, New-York has ...",capture,"[-0.2194085, -0.0031048143, 0.013731885, -0.00...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.100256
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce304daab44d1400b88511, New-York is t...",Exploitation number of updates >= 4000,"[-0.15511338, 0.028740732, 0.008751592, 0.0579...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.306653
4,6,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New-York-City,five,New-York-City is comprise -d of five borough -s,False,Quantitative Wh-Question,Passive Construction,"[# ID: 56ce3124aab44d1400b8852a, New-York-City...",Exploitation number of updates >= 4000,"[-0.15511338, 0.028740732, 0.008751592, 0.0579...","[-0.21607131, 0.0066142594, 0.021365147, -0.00...",0.312307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,723,56d1204617492d1400aab9fc,What federal district court has jurisdiction o...,What federal district court has jurisdiction o...,the District Court for the Eastern District of...,The District Court for the Eastern District of...,the District Court for the Eastern District of...,? what federal district court has jurisdiction...,the District-Court for the Eastern-District of...,the District-Court for the Eastern-District of...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1204617492d1400aab9fc, the District-...",-s,"[-0.2126666, -0.0055902475, 0.011536866, 0.025...","[-0.1400723, 0.008040977, 0.020870337, 0.00184...",0.316981
514,724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56d1204617492d1400aab9fd, the New-York-...",NO ANSWER GIVEN,"[-0.17646857, 0.017760377, 0.028869705, 0.0407...","[-0.20684911, -0.003379386, 0.038716637, -0.00...",0.180568
515,725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba1f, New-York-City...",Exploitation number of updates >= 4000,"[-0.15511338, 0.028740732, 0.008751592, 0.0579...","[-0.19080083, -0.0140541615, -0.002487832, 0.0...",0.357599
516,727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba21, New-York-City...",1625,"[-0.18477677, 0.010238579, 0.020874035, -0.010...","[-0.13429901, -0.012208445, 0.041993335, 0.012...",0.299060


In [19]:
# Get the counts for each unique value in the 'test_answer' column
test_answer_summary = response_formatted_df['test_answer'].value_counts().reset_index()
# Rename the columns for clarity
test_answer_summary.columns = ['test_answer', 'count']
# Sort the results by count in descending order
test_answer_summary.sort_values(by='count', ascending=False, inplace=True)
test_answer_summary

Unnamed: 0,test_answer,count
0,NO ANSWER GIVEN,171
1,Exploitation number of updates >= 4000,110
2,Manhattan,8
3,New-York-City-Department of Education,7
4,Brooklyn,7
...,...,...
79,468 point 9,1
78,two,1
77,38 point 4,1
76,Estevao-Gomess,1


In [20]:
#write the results dataframe to a tsv file
test_answer_summary.to_csv(tsv_results_filepath, sep="\t", index=False)
#count the number of results where the test answer is > 20 words
num_long_answers = response_formatted_df["test_answer"].apply(
    lambda x: len(x.split()) > 20 if pd.notnull(x) else False).sum()
print(f"number of test answers longer than 20 words: {num_long_answers}")
response_formatted_df["test_answer_correct"] = response_formatted_df["test_answer"] == response_formatted_df[
    "response_answer_formatted"]
number_correct = response_formatted_df["test_answer_correct"].sum()
print(f"number correct = {number_correct} out of {len(response_formatted_df)}")
percentage_correct = response_formatted_df["test_answer_correct"].mean() * 100
print(f"percentage correct = {percentage_correct} %")

response_formatted_df["test_answer_any_matching_word"] = response_formatted_df.apply(any_word_match, axis=1)
percentage_any_word_matches = response_formatted_df["test_answer_any_matching_word"].mean() * 100
number_any_word_matches = response_formatted_df["test_answer_any_matching_word"].sum()
print(f"number any word matches = {number_any_word_matches} out of {len(response_formatted_df)}")
print(f"percentage any word matches = {percentage_any_word_matches} %")

cosine_distance_threshold = 0.1
#create a dataframe with the rows where the cosine distance is less than the threshold
close_cosine_distance_df = response_formatted_df[
    response_formatted_df["test_answer_cosine_distance"] < cosine_distance_threshold]
print(f"number of rows with cosine distance less than {cosine_distance_threshold}: {len(close_cosine_distance_df)}")
print("percentage of total: " + str(len(close_cosine_distance_df) / len(response_formatted_df) * 100) + " %")

#create a dataframe with the rows where the cosine distance is less than the threshold and any matching answer is correct
close_cosine_distance_correct_df = close_cosine_distance_df[
    close_cosine_distance_df["test_answer_any_matching_word"]]
print(
    f"number of rows with cosine distance less than {cosine_distance_threshold} and any matching answer correct: {len(close_cosine_distance_correct_df)}")
print("percentage of total: " + str(len(close_cosine_distance_correct_df) / len
(response_formatted_df) * 100) + " %")

close_cosine_distance_correct_df

number of test answers longer than 20 words: 0
number correct = 19 out of 518
percentage correct = 3.667953667953668 %
number any word matches = 28 out of 518
percentage any word matches = 5.405405405405405 %
number of rows with cosine distance less than 0.1: 59
percentage of total: 11.389961389961389 %
number of rows with cosine distance less than 0.1 and any matching answer correct: 21
percentage of total: 4.054054054054054 %


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",New-York,"[-0.1972834, 0.00014738683, 0.026876723, 0.000...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.0,True,True
13,18,56ce31baaab44d1400b8853c,In what year did New York become the largest c...,In what year did New York become the large –st...,1790,New York become the large –st city in the Unit...,1790,? in what year did New-York become the large -...,1790,New-York become the large -st city in the Unit...,False,Wh-Adverbial Question,Subject-Verb-Complement (SVC),"[# ID: 56ce31baaab44d1400b8853c, New-York beco...",1790,"[-0.17822242, -0.003509889, 0.058659993, 0.012...","[-0.17822242, -0.003509889, 0.058659993, 0.012...",0.0,True,True
19,27,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall-Street locate -d,Manhattan,Wall-Street is locate -d in the borough of Man...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce32e7aab44d1400b88552, Wall-Street i...",Manhattan,"[-0.20684911, -0.003379386, 0.038716637, -0.00...","[-0.20684911, -0.003379386, 0.038716637, -0.00...",0.0,True,True
31,44,56ce34c7aab44d1400b88598,What was the name of Estêvão Gomes's ship?,What was the name of Estêvão Gomes's ship?,La Anunciada,The name of Estêvão Gomes's ship was La Anunci...,La Anunciada,? what was the name of Estevao-Gomess ship,La-Anunciada,the name of Estevao-Gomess ship was La-Anunciada,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56ce34c7aab44d1400b88598, the name of E...",La-Anunciada,"[-0.20099996, -0.0032679276, 0.023376267, -0.0...","[-0.20099996, -0.0032679276, 0.023376267, -0.0...",0.0,True,True
57,88,56ceddd9aab44d1400b88b61,Near what square was the African Burial Ground...,Near what square was the African Burial Ground...,Foley Square,The African Burial Ground was unearth –ed near...,Foley Square,? Near what square was the African-Burial-Grou...,Foley-Square,the African-Burial-Ground was unearth -ed near...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ceddd9aab44d1400b88b61, the African-B...",Foley-Square,"[-0.19269383, -0.00928507, -0.012149676, 0.003...","[-0.19269383, -0.00928507, -0.012149676, 0.003...",0.0,True,True
67,101,56cedf11aab44d1400b88b94,In what borough did the Battle of Long Island ...,In what borough did the Battle of Long Island ...,Brooklyn,The Battle of Long Island occur –ed in Brooklyn.,Brooklyn,? in what borough did the Battle of Long-Islan...,Brooklyn,the Battle of Long-Island occur -ed in Brooklyn,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cedf11aab44d1400b88b94, the Battle of...",Brooklyn,"[-0.21150663, -0.015454139, 0.015124913, -0.00...","[-0.21150663, -0.015454139, 0.015124913, -0.00...",0.0,True,True
104,155,56cee70daab44d1400b88c51,In what year did the General Slocum disaster o...,In what year did the General Slocum disaster o...,1904,The General Slocum disaster occur –ed in 1904.,1904,? in what year did the General-Slocum disaster...,1904,the General-Slocum disaster occur -ed in 1904,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee70daab44d1400b88c51, the General-S...",1904,"[-0.18449205, -0.008385724, 0.03736758, 0.0120...","[-0.18449205, -0.008385724, 0.03736758, 0.0120...",0.0,True,True
114,168,56cee90caab44d1400b88c86,In what neighborhood did the Stonewall riots o...,In what neighborhood did the Stonewall riots o...,Greenwich Village,The Stonewall riots occur –ed in the Greenwich...,Greenwich Village,? in what neighborhood did the Stonewall riots...,Greenwich-Village,the Stonewall riots occur -ed in the Greenwich...,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee90caab44d1400b88c86, the Stonewall...",Greenwich-Village,"[-0.20341873, -0.0015141508, 0.010141497, 0.01...","[-0.20341873, -0.0015141508, 0.010141497, 0.01...",0.0,True,True
193,278,56cf306baab44d1400b88de9,In what neighborhood is the Stonewall Inn loca...,In what neighborhood is the Stonewall Inn loca...,Greenwich Village,The Stonewall Inn is locate –d in the Greenwic...,Greenwich Village,? in what neighborhood is the Stonewall-Inn lo...,Greenwich-Village,the Stonewall-Inn is locate -d in the Greenwic...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cf306baab44d1400b88de9, the Stonewall...",Greenwich-Village,"[-0.20341873, -0.0015141508, 0.010141497, 0.01...","[-0.20341873, -0.0015141508, 0.010141497, 0.01...",0.0,True,True
212,303,56cfe836234ae51400d9c064,When was Fort Hamilton built?,When was Fort Hamilton built?,1825,Fort Hamilton was built in 1825.,1825,? when was Fort-Hamilton built,1825,Fort-Hamilton was built in 1825,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cfe836234ae51400d9c064, Fort-Hamilton...",1825,"[-0.18570381, -0.0046451828, 0.03343887, -0.00...","[-0.18570381, -0.0046451828, 0.03343887, -0.00...",0.0,True,True


In [21]:
# Rows whose test answer is exactly equal to the expected formatted answer.
is_exact_match = response_formatted_df["test_answer_correct"]
correct_matches = response_formatted_df.loc[is_exact_match]
correct_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",New-York,"[-0.1972834, 0.00014738683, 0.026876723, 0.000...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.0,True,True
13,18,56ce31baaab44d1400b8853c,In what year did New York become the largest c...,In what year did New York become the large –st...,1790,New York become the large –st city in the Unit...,1790,? in what year did New-York become the large -...,1790,New-York become the large -st city in the Unit...,False,Wh-Adverbial Question,Subject-Verb-Complement (SVC),"[# ID: 56ce31baaab44d1400b8853c, New-York beco...",1790,"[-0.17822242, -0.003509889, 0.058659993, 0.012...","[-0.17822242, -0.003509889, 0.058659993, 0.012...",0.0,True,True
19,27,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall-Street locate -d,Manhattan,Wall-Street is locate -d in the borough of Man...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce32e7aab44d1400b88552, Wall-Street i...",Manhattan,"[-0.20684911, -0.003379386, 0.038716637, -0.00...","[-0.20684911, -0.003379386, 0.038716637, -0.00...",0.0,True,True
31,44,56ce34c7aab44d1400b88598,What was the name of Estêvão Gomes's ship?,What was the name of Estêvão Gomes's ship?,La Anunciada,The name of Estêvão Gomes's ship was La Anunci...,La Anunciada,? what was the name of Estevao-Gomess ship,La-Anunciada,the name of Estevao-Gomess ship was La-Anunciada,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56ce34c7aab44d1400b88598, the name of E...",La-Anunciada,"[-0.20099996, -0.0032679276, 0.023376267, -0.0...","[-0.20099996, -0.0032679276, 0.023376267, -0.0...",0.0,True,True
57,88,56ceddd9aab44d1400b88b61,Near what square was the African Burial Ground...,Near what square was the African Burial Ground...,Foley Square,The African Burial Ground was unearth –ed near...,Foley Square,? Near what square was the African-Burial-Grou...,Foley-Square,the African-Burial-Ground was unearth -ed near...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ceddd9aab44d1400b88b61, the African-B...",Foley-Square,"[-0.19269383, -0.00928507, -0.012149676, 0.003...","[-0.19269383, -0.00928507, -0.012149676, 0.003...",0.0,True,True
67,101,56cedf11aab44d1400b88b94,In what borough did the Battle of Long Island ...,In what borough did the Battle of Long Island ...,Brooklyn,The Battle of Long Island occur –ed in Brooklyn.,Brooklyn,? in what borough did the Battle of Long-Islan...,Brooklyn,the Battle of Long-Island occur -ed in Brooklyn,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cedf11aab44d1400b88b94, the Battle of...",Brooklyn,"[-0.21150663, -0.015454139, 0.015124913, -0.00...","[-0.21150663, -0.015454139, 0.015124913, -0.00...",0.0,True,True
104,155,56cee70daab44d1400b88c51,In what year did the General Slocum disaster o...,In what year did the General Slocum disaster o...,1904,The General Slocum disaster occur –ed in 1904.,1904,? in what year did the General-Slocum disaster...,1904,the General-Slocum disaster occur -ed in 1904,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee70daab44d1400b88c51, the General-S...",1904,"[-0.18449205, -0.008385724, 0.03736758, 0.0120...","[-0.18449205, -0.008385724, 0.03736758, 0.0120...",0.0,True,True
114,168,56cee90caab44d1400b88c86,In what neighborhood did the Stonewall riots o...,In what neighborhood did the Stonewall riots o...,Greenwich Village,The Stonewall riots occur –ed in the Greenwich...,Greenwich Village,? in what neighborhood did the Stonewall riots...,Greenwich-Village,the Stonewall riots occur -ed in the Greenwich...,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee90caab44d1400b88c86, the Stonewall...",Greenwich-Village,"[-0.20341873, -0.0015141508, 0.010141497, 0.01...","[-0.20341873, -0.0015141508, 0.010141497, 0.01...",0.0,True,True
193,278,56cf306baab44d1400b88de9,In what neighborhood is the Stonewall Inn loca...,In what neighborhood is the Stonewall Inn loca...,Greenwich Village,The Stonewall Inn is locate –d in the Greenwic...,Greenwich Village,? in what neighborhood is the Stonewall-Inn lo...,Greenwich-Village,the Stonewall-Inn is locate -d in the Greenwic...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cf306baab44d1400b88de9, the Stonewall...",Greenwich-Village,"[-0.20341873, -0.0015141508, 0.010141497, 0.01...","[-0.20341873, -0.0015141508, 0.010141497, 0.01...",0.0,True,True
212,303,56cfe836234ae51400d9c064,When was Fort Hamilton built?,When was Fort Hamilton built?,1825,Fort Hamilton was built in 1825.,1825,? when was Fort-Hamilton built,1825,Fort-Hamilton was built in 1825,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cfe836234ae51400d9c064, Fort-Hamilton...",1825,"[-0.18570381, -0.0046451828, 0.03343887, -0.00...","[-0.18570381, -0.0046451828, 0.03343887, -0.00...",0.0,True,True


In [22]:
# Rows flagged by the any-matching-word check.
has_matching_word = response_formatted_df["test_answer_any_matching_word"]
any_matches = response_formatted_df.loc[has_matching_word]
any_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",New-York,"[-0.1972834, 0.00014738683, 0.026876723, 0.000...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.0,True,True
13,18,56ce31baaab44d1400b8853c,In what year did New York become the largest c...,In what year did New York become the large –st...,1790,New York become the large –st city in the Unit...,1790,? in what year did New-York become the large -...,1790,New-York become the large -st city in the Unit...,False,Wh-Adverbial Question,Subject-Verb-Complement (SVC),"[# ID: 56ce31baaab44d1400b8853c, New-York beco...",1790,"[-0.17822242, -0.003509889, 0.058659993, 0.012...","[-0.17822242, -0.003509889, 0.058659993, 0.012...",0.0,True,True
19,27,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall-Street locate -d,Manhattan,Wall-Street is locate -d in the borough of Man...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce32e7aab44d1400b88552, Wall-Street i...",Manhattan,"[-0.20684911, -0.003379386, 0.038716637, -0.00...","[-0.20684911, -0.003379386, 0.038716637, -0.00...",0.0,True,True
31,44,56ce34c7aab44d1400b88598,What was the name of Estêvão Gomes's ship?,What was the name of Estêvão Gomes's ship?,La Anunciada,The name of Estêvão Gomes's ship was La Anunci...,La Anunciada,? what was the name of Estevao-Gomess ship,La-Anunciada,the name of Estevao-Gomess ship was La-Anunciada,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56ce34c7aab44d1400b88598, the name of E...",La-Anunciada,"[-0.20099996, -0.0032679276, 0.023376267, -0.0...","[-0.20099996, -0.0032679276, 0.023376267, -0.0...",0.0,True,True
57,88,56ceddd9aab44d1400b88b61,Near what square was the African Burial Ground...,Near what square was the African Burial Ground...,Foley Square,The African Burial Ground was unearth –ed near...,Foley Square,? Near what square was the African-Burial-Grou...,Foley-Square,the African-Burial-Ground was unearth -ed near...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ceddd9aab44d1400b88b61, the African-B...",Foley-Square,"[-0.19269383, -0.00928507, -0.012149676, 0.003...","[-0.19269383, -0.00928507, -0.012149676, 0.003...",0.0,True,True
67,101,56cedf11aab44d1400b88b94,In what borough did the Battle of Long Island ...,In what borough did the Battle of Long Island ...,Brooklyn,The Battle of Long Island occur –ed in Brooklyn.,Brooklyn,? in what borough did the Battle of Long-Islan...,Brooklyn,the Battle of Long-Island occur -ed in Brooklyn,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cedf11aab44d1400b88b94, the Battle of...",Brooklyn,"[-0.21150663, -0.015454139, 0.015124913, -0.00...","[-0.21150663, -0.015454139, 0.015124913, -0.00...",0.0,True,True
104,155,56cee70daab44d1400b88c51,In what year did the General Slocum disaster o...,In what year did the General Slocum disaster o...,1904,The General Slocum disaster occur –ed in 1904.,1904,? in what year did the General-Slocum disaster...,1904,the General-Slocum disaster occur -ed in 1904,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee70daab44d1400b88c51, the General-S...",1904,"[-0.18449205, -0.008385724, 0.03736758, 0.0120...","[-0.18449205, -0.008385724, 0.03736758, 0.0120...",0.0,True,True
114,168,56cee90caab44d1400b88c86,In what neighborhood did the Stonewall riots o...,In what neighborhood did the Stonewall riots o...,Greenwich Village,The Stonewall riots occur –ed in the Greenwich...,Greenwich Village,? in what neighborhood did the Stonewall riots...,Greenwich-Village,the Stonewall riots occur -ed in the Greenwich...,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee90caab44d1400b88c86, the Stonewall...",Greenwich-Village,"[-0.20341873, -0.0015141508, 0.010141497, 0.01...","[-0.20341873, -0.0015141508, 0.010141497, 0.01...",0.0,True,True
122,183,56cef300aab44d1400b88cff,On what date did the Occupy Wall Street protes...,On what date did the Occupy Wall Street protes...,"September 17, 2011",The Occupy Wall Street protest –s commence –d ...,"September 17, 2011",? on what date did the Occupy-Wall-Street prot...,September 17 2011,the Occupy-Wall-Street protest -s commence -d ...,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cef300aab44d1400b88cff, the Occupy-Wa...",June 24 2011,"[-0.17310381, 0.010886849, -0.0018171272, 0.00...","[-0.16370903, 0.010387576, 0.0017466204, -0.00...",0.101669,False,True
145,210,56cef84faab44d1400b88d54,"In square miles, how much of the city's total ...","In square mile –s, how much of the city's tota...",164.1,164.1 square mile –s of the city's total area ...,164.1,? in square mile -s how much of the citys tota...,164 point 1,164 point 1 square mile -s of the citys total ...,False,Quantitative Wh-Question,Passive Construction,"[# ID: 56cef84faab44d1400b88d54, 164 point 1 s...",304 point 8,"[-0.1775424, 0.0134160165, 0.007534124, 0.0282...","[-0.16510823, 0.014871427, 0.03769997, 0.00227...",0.183619,False,True


In [23]:
# Keep only the rows whose "test_answer_correct" flag is NOT set,
# then show the selection as the cell output.
incorrect_matches = response_formatted_df.loc[
    ~response_formatted_df["test_answer_correct"]
]
incorrect_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56ce304daab44d1400b8850e, the city in t...",New-York-City-Department of Education,"[-0.1762507, 0.026600152, 0.057974998, 0.01327...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.080072,False,False
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction,"[# ID: 56ce304daab44d1400b88510, New-York has ...",capture,"[-0.2194085, -0.0031048143, 0.013731885, -0.00...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.100256,False,False
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce304daab44d1400b88511, New-York is t...",Exploitation number of updates >= 4000,"[-0.15511338, 0.028740732, 0.008751592, 0.0579...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.306653,False,False
4,6,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New-York-City,five,New-York-City is comprise -d of five borough -s,False,Quantitative Wh-Question,Passive Construction,"[# ID: 56ce3124aab44d1400b8852a, New-York-City...",Exploitation number of updates >= 4000,"[-0.15511338, 0.028740732, 0.008751592, 0.0579...","[-0.21607131, 0.0066142594, 0.021365147, -0.00...",0.312307,False,False
5,7,56ce3124aab44d1400b8852b,In what year were the five boroughs combined i...,In what year were the five borough –s combine ...,1898,The five borough –s were combine –d into one c...,1898,? in what year were the five borough -s combin...,1898,the five borough -s were combine -d into one c...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce3124aab44d1400b8852b, the five boro...",NO ANSWER GIVEN,"[-0.17646857, 0.017760377, 0.028869705, 0.0407...","[-0.1872016, 0.0097118905, 0.052524198, -0.017...",0.259467,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,723,56d1204617492d1400aab9fc,What federal district court has jurisdiction o...,What federal district court has jurisdiction o...,the District Court for the Eastern District of...,The District Court for the Eastern District of...,the District Court for the Eastern District of...,? what federal district court has jurisdiction...,the District-Court for the Eastern-District of...,the District-Court for the Eastern-District of...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1204617492d1400aab9fc, the District-...",-s,"[-0.2126666, -0.0055902475, 0.011536866, 0.025...","[-0.1400723, 0.008040977, 0.020870337, 0.00184...",0.316981,False,False
514,724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56d1204617492d1400aab9fd, the New-York-...",NO ANSWER GIVEN,"[-0.17646857, 0.017760377, 0.028869705, 0.0407...","[-0.20684911, -0.003379386, 0.038716637, -0.00...",0.180568,False,False
515,725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba1f, New-York-City...",Exploitation number of updates >= 4000,"[-0.15511338, 0.028740732, 0.008751592, 0.0579...","[-0.19080083, -0.0140541615, -0.002487832, 0.0...",0.357599,False,False
516,727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba21, New-York-City...",1625,"[-0.18477677, 0.010238579, 0.020874035, -0.010...","[-0.13429901, -0.012208445, 0.041993335, 0.012...",0.299060,False,False


In [24]:
# Export the full results dataframe to a TSV file and write a plain-text summary
# of the run: headline metrics first, then one markdown table per result subset.
# `datetime` and `os` come from the notebook's import cell at the top.

# Take the timestamp once so the TSV and the summary always share the same suffix,
# even if the clock ticks over to the next second between the two writes.
results_timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

tsv_results_filename = "test_nyc_results" + results_timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, tsv_results_filename)
response_formatted_df.to_csv(tsv_results_filepath, sep="\t", index=False)

results_summary_filename = "test_nyc_results_summary" + results_timestamp + ".txt"
results_summary_filepath = os.path.join(test_results_dir, results_summary_filename)

# Columns shown in the per-row tables of the summary.
summary_columns = ["response_question", "response_answer", "test_answer"]
summary_columns_with_distance = summary_columns + ["test_answer_cosine_distance"]

# The default text encoding differs between the operating systems this notebook
# supports, and the questions/answers contain non-ASCII text (e.g. "Estêvão"),
# so the summary is written explicitly as UTF-8 to keep the file portable.
with open(results_summary_filepath, 'w', encoding="utf-8") as results_file:
    # headline metrics, one "name<TAB>value" pair per line
    results_file.write(f"total number of samples\t{total_number_of_test_samples}\n")
    results_file.write(f"number_of_test_answers\t{len(response_formatted_df)}\n")
    results_file.write(f"total_number_of_pretraining_samples\t{total_number_of_pretraining_samples}\n")
    results_file.write(f"percentage_correct\t{percentage_correct}\n")
    results_file.write(f"percentage_any_word_matches\t{percentage_any_word_matches}\n")
    results_file.write(f"percentage_close_cosine_distance\t{len(close_cosine_distance_df) / len(response_formatted_df) * 100}\n")
    results_file.write(f"percentage_close_cosine_distance_and_any_word_match\t{len(close_cosine_distance_correct_df) / len(response_formatted_df) * 100}\n")
    results_file.write(f"number of test answers longer than 20 words (removed)\t{num_long_answers}\n")

    # rows that were exact matches
    results_file.write("\nRows with exact matches:\n")
    results_file.write(correct_matches[summary_columns].to_markdown(index=False))

    # rows where any word matched
    results_file.write("\nRows with any word matches:\n")
    results_file.write(any_matches[summary_columns].to_markdown(index=False))

    # rows with a close cosine distance
    results_file.write(f"\nRows with cosine distance less than {cosine_distance_threshold}:\n")
    results_file.write(
        close_cosine_distance_df[summary_columns_with_distance].to_markdown(index=False))

    # rows with a close cosine distance and any word match
    results_file.write(f"\nRows with cosine distance less than {cosine_distance_threshold} and any word match:\n")
    results_file.write(
        close_cosine_distance_correct_df[summary_columns_with_distance].to_markdown(index=False))

    # exact-match rows, this time including their cosine distance
    # NOTE(review): correct_matches is written as-is; it is not filtered by
    # cosine_distance_threshold here, although the header mentions the threshold.
    # Presumably exact matches always fall under it - confirm.
    results_file.write(f"\nRows with cosine distance less than {cosine_distance_threshold} and exact match:\n")
    results_file.write(
        correct_matches[summary_columns_with_distance].to_markdown(index=False))

print(f"results written to {tsv_results_filepath} and {results_summary_filepath}")

results written to /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/results/test_nyc_results_20251023_094027.tsv and /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/results/test_nyc_results_summary_20251023_094027.txt
