# This notebook is used to test the ANNABELL model on the SQuAD dataset.


In [14]:
from dataset_processing import any_word_match, embedding_for_sentence, \
    cosine_distance, ids_questions_answers_from_log_file
import os
import platform
import datetime
import pandas as pd
from tqdm import tqdm

In [15]:
# --- Experiment configuration -------------------------------------------------
# Everything a reader may need to change to point the notebook at another run.
experiment_number = 11

# Root of the ANNABELL working tree. The paths below are machine-specific, so
# the location is selected from the host operating system.
operating_system = platform.system()
if operating_system == 'Windows':
    raise Exception("not used on Windows yet")
elif operating_system == 'Linux':
    base_directory = "/home/chris/gdrive/work/annabell"
elif operating_system == 'Darwin':  #macOS
    base_directory = "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/"
else:
    raise Exception("unsupported OS")

# Suffix that makes every results file written by this run unique.
timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

# Per-experiment directories all live under <base>/experiments/<experiment_number>/.
experiment_dir = os.path.join(base_directory, "experiments", str(experiment_number))
test_input_dir = os.path.join(experiment_dir, "testing")
test_log_dir = os.path.join(experiment_dir, "logs")
data_dir = os.path.join(base_directory, "experiments/data")
test_results_dir = os.path.join(experiment_dir, "results")
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(test_results_dir, exist_ok=True)

# Input artefacts: the log written during testing, the commands that were fed
# to the model, and the dataset the commands were generated from.
test_log_filename = "logfile_nyc_squad_testing_commands_2025-10-10_06-56-59.txt"
test_input_filename = "nyc_squad_testing_commands_20251009_075658.txt"
dataset_filename = "nyc_squad_with_pretraining_commands_20251009_075658.jsonl"

test_log_filepath = os.path.join(test_log_dir, test_log_filename)
test_input_filepath = os.path.join(test_input_dir, test_input_filename)
dataset_filepath = os.path.join(data_dir, dataset_filename)

# Output artefact: a timestamped TSV summarising the answers given.
test_answer_summary_filename = "test_nyc_answer_summary" + timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, test_answer_summary_filename)

# Parse (id, question, answer) triples out of the test log. A missing or empty
# answer is replaced by an explicit placeholder so that it still counts as a
# (wrong) answer instead of being dropped as a null further down.
ids_questions_answers = [
    (the_id, question, "NO ANSWER GIVEN" if answer is None or answer == "" else answer)
    for the_id, question, answer in ids_questions_answers_from_log_file(test_log_filepath)
]

print("length of log file questions and answers: " + str(len(ids_questions_answers)))
with open(test_input_filepath, 'r') as test_input_file:
    test_input_lines = test_input_file.readlines()
dataset_df = pd.read_json(dataset_filepath, lines=True)

# Attach each logged answer to its dataset row by id. Building the lookup once
# replaces a full scan of the id column per log entry, and mapping through it
# always creates the "test_answer" column - even when no id matches - so the
# dropna() below cannot fail with a KeyError. If an id occurs more than once in
# the log, the last answer wins (same as repeated assignment would).
answer_by_id = {the_id: answer for the_id, _question, answer in ids_questions_answers}
known_ids = set(dataset_df["id"])
questions_not_found = [
    question for the_id, question, _answer in ids_questions_answers if the_id not in known_ids
]
print(f"number of test samples not found in training data: {len(questions_not_found)}")
print("test samples not found in training data: " + str(questions_not_found[:5]) + " ...")

# Keep only the rows that were actually tested. reset_index() deliberately
# keeps the old row labels in an "index" column, which records each row's
# position in the full dataset.
response_formatted_df = (
    dataset_df
    .assign(test_answer=dataset_df["id"].map(answer_by_id))
    .dropna(subset=["test_answer"])
    .reset_index()
)
response_formatted_df

length of log file questions and answers: 670
number of test samples not found in training data: 0
test samples not found in training data: [] ...


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56ce304daab44d1400b8850e, the city in t...",William-III
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",in
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction,"[# ID: 56ce304daab44d1400b88510, New-York has ...",One-World-Trade-Center
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce304daab44d1400b88511, New-York is t...",90 percent
4,4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New-York-City,the major gateway for immigrate -ion has been ...,False,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89130, the major gat...",the
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56d1204617492d1400aab9fd, the New-York-...",NO ANSWER GIVEN
666,725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba1f, New-York-City...",NO ANSWER GIVEN
667,726,56d1218c17492d1400aaba20,How much more money does the city give to the ...,How much more money does the city give to the ...,$ 11 billion,The city give –s $ 11 billion more money to th...,$11 billion,? how much more money does the city give to th...,11 billion,the city give -s 11 billion more money to the...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba20, the city give...",14 mile -s
668,727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba21, New-York-City...",Exploitation number of updates >= 4000


In [16]:
# Embed the answer given during testing and the formatted reference answer so
# that the two can later be compared using cosine distance.
def _embedding_or_none(text):
    """Return embedding_for_sentence(text), or None when text is null."""
    if pd.notnull(text):
        return embedding_for_sentence(text)
    return None


for progress_label, source_column, embedding_column in (
    ("Generating test answer embeddings",
     "test_answer", "test_answer_embedding"),
    ("Generating response answer embeddings",
     "response_answer_formatted", "response_answer_formatted_embedding"),
):
    # tqdm.pandas() must be re-registered to change the progress-bar label.
    tqdm.pandas(desc=progress_label)
    response_formatted_df[embedding_column] = (
        response_formatted_df[source_column].progress_apply(_embedding_or_none)
    )

Generating test answer embeddings: 100%|██████████| 670/670 [00:57<00:00, 11.69it/s]
Generating response answer embeddings: 100%|██████████| 670/670 [00:55<00:00, 12.02it/s]


In [17]:
# Score every row with the project's cosine_distance helper (applied row-wise;
# presumably it compares the two embedding columns - confirm in dataset_processing).
row_cosine_distances = response_formatted_df.apply(cosine_distance, axis=1)
response_formatted_df["test_answer_cosine_distance"] = row_cosine_distances
response_formatted_df

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56ce304daab44d1400b8850e, the city in t...",William-III,"[-0.20163444, -0.0094867805, 0.022896027, -0.0...","[-0.19727688, 0.0001464649, 0.026875416, 0.000...",0.113534
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",in,"[-0.20956264, -0.0064942883, 0.035192005, -0.0...","[-0.19727688, 0.0001464649, 0.026875416, 0.000...",0.099051
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction,"[# ID: 56ce304daab44d1400b88510, New-York has ...",One-World-Trade-Center,"[-0.16668645, 0.012506176, 0.05196567, 0.00260...","[-0.19727688, 0.0001464649, 0.026875416, 0.000...",0.207564
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce304daab44d1400b88511, New-York is t...",90 percent,"[-0.1786337, 0.020912088, 0.0043227947, 0.0069...","[-0.19727688, 0.0001464649, 0.026875416, 0.000...",0.190136
4,4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New-York-City,the major gateway for immigrate -ion has been ...,False,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89130, the major gat...",the,"[-0.19715294, 0.0027368844, 0.017215747, 0.004...","[-0.19554919, 0.006619052, 0.039511867, -0.000...",0.145936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56d1204617492d1400aab9fd, the New-York-...",NO ANSWER GIVEN,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.180551
666,725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba1f, New-York-City...",NO ANSWER GIVEN,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.19078888, -0.014057412, -0.002494617, 0.01...",0.262505
667,726,56d1218c17492d1400aaba20,How much more money does the city give to the ...,How much more money does the city give to the ...,$ 11 billion,The city give –s $ 11 billion more money to th...,$11 billion,? how much more money does the city give to th...,11 billion,the city give -s 11 billion more money to the...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba20, the city give...",14 mile -s,"[-0.18709867, -0.0013902837, 0.033759546, 0.01...","[-0.16274187, -0.02008049, 0.05266985, 0.00129...",0.217211
668,727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba21, New-York-City...",Exploitation number of updates >= 4000,"[-0.15502132, 0.028481387, 0.009052947, 0.0581...","[-0.13401222, -0.0122689605, 0.04238571, 0.012...",0.390273


In [18]:
# Frequency table of the answers the model gave: one row per distinct
# 'test_answer' value with the number of times it occurred.
answer_counts = response_formatted_df['test_answer'].value_counts()
test_answer_summary = answer_counts.reset_index()
# Name the columns explicitly rather than relying on reset_index() defaults.
test_answer_summary.columns = ['test_answer', 'count']
# Most frequent answers first.
test_answer_summary = test_answer_summary.sort_values(by='count', ascending=False)
test_answer_summary

Unnamed: 0,test_answer,count
0,NO ANSWER GIVEN,255
1,Exploitation number of updates >= 4000,110
2,1825,20
3,25000,14
4,1 point 1 million,11
...,...,...
76,borough,1
77,seven,1
79,build,1
80,New-Amsterdam,1


In [19]:
# Persist the per-answer frequency table as a tab-separated file.
test_answer_summary.to_csv(tsv_results_filepath, sep="\t", index=False)

total_rows = len(response_formatted_df)


def _has_more_than_20_words(answer):
    """True when a non-null answer splits into more than 20 whitespace-separated words."""
    if pd.notnull(answer):
        return len(answer.split()) > 20
    return False


# How many answers are suspiciously long.
num_long_answers = response_formatted_df["test_answer"].apply(_has_more_than_20_words).sum()
print(f"number of test answers longer than 20 words: {num_long_answers}")

# Exact match: the test answer equals the formatted reference answer.
exact_match = response_formatted_df["test_answer"] == response_formatted_df["response_answer_formatted"]
response_formatted_df["test_answer_correct"] = exact_match
number_correct = response_formatted_df["test_answer_correct"].sum()
print(f"number correct = {number_correct} out of {total_rows}")
percentage_correct = response_formatted_df["test_answer_correct"].mean() * 100
print(f"percentage correct = {percentage_correct} %")

# Looser match: the project's any_word_match helper, evaluated per row.
any_word_flags = response_formatted_df.apply(any_word_match, axis=1)
response_formatted_df["test_answer_any_matching_word"] = any_word_flags
percentage_any_word_matches = response_formatted_df["test_answer_any_matching_word"].mean() * 100
number_any_word_matches = response_formatted_df["test_answer_any_matching_word"].sum()
print(f"number any word matches = {number_any_word_matches} out of {total_rows}")
print(f"percentage any word matches = {percentage_any_word_matches} %")

# Rows whose cosine distance falls strictly below the threshold.
cosine_distance_threshold = 0.1
is_close = response_formatted_df["test_answer_cosine_distance"] < cosine_distance_threshold
close_cosine_distance_df = response_formatted_df.loc[is_close]
print(f"number of rows with cosine distance less than {cosine_distance_threshold}: {len(close_cosine_distance_df)}")
close_percentage = len(close_cosine_distance_df) / total_rows * 100
print("percentage of total: " + str(close_percentage) + " %")

# Of those close rows, the ones that also share a word with the reference answer.
close_and_word_match = close_cosine_distance_df["test_answer_any_matching_word"]
close_cosine_distance_correct_df = close_cosine_distance_df.loc[close_and_word_match]
print(
    f"number of rows with cosine distance less than {cosine_distance_threshold} and any matching answer correct: {len(close_cosine_distance_correct_df)}")
close_correct_percentage = len(close_cosine_distance_correct_df) / total_rows * 100
print("percentage of total: " + str(close_correct_percentage) + " %")

close_cosine_distance_correct_df

number of test answers longer than 20 words: 0
number correct = 13 out of 670
percentage correct = 1.9402985074626864 %
number any word matches = 18 out of 670
percentage any word matches = 2.6865671641791042 %
number of rows with cosine distance less than 0.1: 65
percentage of total: 9.701492537313433 %
number of rows with cosine distance less than 0.1 and any matching answer correct: 14
percentage of total: 2.0895522388059704 %


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
14,14,56cf9d81234ae51400d9be1e,What is the population of New York City as of ...,What is the populate –ion of New York City as ...,8491079,The populate –ion of New York City as of 2014 ...,8491079,? what is the populate -ion of New-York-City a...,8491079,the populate -ion of New-York-City as of 2014 ...,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cf9d81234ae51400d9be1e, the populate ...",8491079,"[-0.16185391, 0.009814939, 0.027025407, 0.0109...","[-0.16185391, 0.009814939, 0.027025407, 0.0109...",0.0,True,True
40,41,56ce34c7aab44d1400b88595,Who commanded the Spanish expedition?,Who command –ed the Spanish expedite –ion?,Estêvão Gomes,Estêvão Gomes command –ed the Spanish expedite...,Estêvão Gomes,? who command -ed the Spanish expedite -ion,Estevao-Gomes,Estevao-Gomes command -ed the Spanish expedite...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce34c7aab44d1400b88595, Estevao-Gomes...",Estevao-Gomes,"[-0.18101516, 0.017185545, 0.03445143, -0.0088...","[-0.18101516, 0.017185545, 0.03445143, -0.0088...",0.0,True,True
61,65,56cedbb9aab44d1400b88b13,In what year did the Dutch buy Manhattan?,In what year did the Dutch buy Manhattan?,1626,The Dutch bought Manhattan in 1626.,1626,? in what year did the Dutch buy Manhattan,1626,the Dutch bought Manhattan in 1626,False,Wh-Adverbial Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56cedbb9aab44d1400b88b13, the Dutch bou...",1626,"[-0.18711904, 0.0031177902, 0.024501463, -0.01...","[-0.18711904, 0.0031177902, 0.024501463, -0.01...",0.0,True,True
86,92,56cede40aab44d1400b88b71,In what year was John Peter Zenger tried?,In what year was John Peter Zenger try –ed?,1735,John Peter Zenger was try –ed in 1735.,1735,? in what year was John-Peter-Zenger try -ed,1735,John-Peter-Zenger was try -ed in 1735,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b71, John-Peter-Ze...",1735,"[-0.19004145, -0.008742259, 0.05413161, 0.0107...","[-0.19004145, -0.008742259, 0.05413161, 0.0107...",0.0,True,True
87,93,56cede40aab44d1400b88b72,On what island did Zenger's trial occur?,On what island did Zenger's trial occur?,Manhattan,Zenger's trial occur –ed on Manhattan.,Manhattan,? on what island did Zengers trial occur,Manhattan,Zengers trial occur -ed on Manhattan,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cede40aab44d1400b88b72, Zengers trial...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
88,94,56cede40aab44d1400b88b73,In what year was Columbia University chartered?,In what year was Columbia University charter –ed?,1754,Columbia University was charter –ed in 1754.,1754,? in what year was Columbia-University charter...,1754,Columbia-University was charter -ed in 1754,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b73, Columbia-Univ...",1754,"[-0.18632932, -0.00466555, 0.0492731, 0.002800...","[-0.18632932, -0.00466555, 0.0492731, 0.002800...",0.0,True,True
103,109,56cee30faab44d1400b88bf2,Who was the British representative at the Conf...,Who was the British representative at the Conf...,Lord Howe,The British representative at the Conference H...,Lord Howe,? who was the British representative at the Co...,Lord-Howe,the British representative at the Conference-H...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56cee30faab44d1400b88bf2, the British r...",Lord-Howe,"[-0.16088948, -0.0116037205, 0.017029908, 0.01...","[-0.16088948, -0.0116037205, 0.017029908, 0.01...",0.0,True,True
124,131,56cee4d1aab44d1400b88c10,In what year did the Erie Canal finish building?,In what year did the Erie Canal finish build –...,1825,The Erie Canal finish –ed build –ing in 1825.,1825,? in what year did the Erie-Canal finish build...,1825,the Erie-Canal finish -ed build -ing in 1825,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee4d1aab44d1400b88c10, the Erie-Cana...",1825,"[-0.18570255, -0.004654048, 0.03343313, -0.004...","[-0.18570255, -0.004654048, 0.03343313, -0.004...",0.0,True,True
261,279,56cf306baab44d1400b88dea,What movement is the Stonewall Inn most famous...,What move –ment is the Stonewall Inn most famo...,gay rights movement,The Stonewall Inn is most famous –ly associate...,gay rights movement,? what move -ment is the Stonewall-Inn most fa...,gay rights movement,the Stonewall-Inn is most famous -ly associate...,False,Wh-Object/Complement Question,Passive Construction,"[# ID: 56cf306baab44d1400b88dea, the Stonewall...",gay,"[-0.21181943, -0.0070518795, 0.018363109, -0.0...","[-0.20972589, -0.012156877, 0.024234217, 0.004...",0.039101,False,True
279,299,56cf3297aab44d1400b88e13,In what borough is Fort Hamilton located?,In what borough is Fort Hamilton locate –d?,Brooklyn,Fort Hamilton is locate –d in Brooklyn.,Brooklyn,? in what borough is Fort-Hamilton locate -d,Brooklyn,Fort-Hamilton is locate -d in Brooklyn,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cf3297aab44d1400b88e13, Fort-Hamilton...",Brooklyn,"[-0.21150173, -0.015448935, 0.01512959, -0.005...","[-0.21150173, -0.015448935, 0.01512959, -0.005...",0.0,True,True


In [20]:
# Rows where the test answer exactly equals the formatted reference answer.
exact_match_mask = response_formatted_df["test_answer_correct"]
correct_matches = response_formatted_df.loc[exact_match_mask]
correct_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
14,14,56cf9d81234ae51400d9be1e,What is the population of New York City as of ...,What is the populate –ion of New York City as ...,8491079,The populate –ion of New York City as of 2014 ...,8491079,? what is the populate -ion of New-York-City a...,8491079,the populate -ion of New-York-City as of 2014 ...,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cf9d81234ae51400d9be1e, the populate ...",8491079,"[-0.16185391, 0.009814939, 0.027025407, 0.0109...","[-0.16185391, 0.009814939, 0.027025407, 0.0109...",0.0,True,True
40,41,56ce34c7aab44d1400b88595,Who commanded the Spanish expedition?,Who command –ed the Spanish expedite –ion?,Estêvão Gomes,Estêvão Gomes command –ed the Spanish expedite...,Estêvão Gomes,? who command -ed the Spanish expedite -ion,Estevao-Gomes,Estevao-Gomes command -ed the Spanish expedite...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce34c7aab44d1400b88595, Estevao-Gomes...",Estevao-Gomes,"[-0.18101516, 0.017185545, 0.03445143, -0.0088...","[-0.18101516, 0.017185545, 0.03445143, -0.0088...",0.0,True,True
61,65,56cedbb9aab44d1400b88b13,In what year did the Dutch buy Manhattan?,In what year did the Dutch buy Manhattan?,1626,The Dutch bought Manhattan in 1626.,1626,? in what year did the Dutch buy Manhattan,1626,the Dutch bought Manhattan in 1626,False,Wh-Adverbial Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56cedbb9aab44d1400b88b13, the Dutch bou...",1626,"[-0.18711904, 0.0031177902, 0.024501463, -0.01...","[-0.18711904, 0.0031177902, 0.024501463, -0.01...",0.0,True,True
86,92,56cede40aab44d1400b88b71,In what year was John Peter Zenger tried?,In what year was John Peter Zenger try –ed?,1735,John Peter Zenger was try –ed in 1735.,1735,? in what year was John-Peter-Zenger try -ed,1735,John-Peter-Zenger was try -ed in 1735,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b71, John-Peter-Ze...",1735,"[-0.19004145, -0.008742259, 0.05413161, 0.0107...","[-0.19004145, -0.008742259, 0.05413161, 0.0107...",0.0,True,True
87,93,56cede40aab44d1400b88b72,On what island did Zenger's trial occur?,On what island did Zenger's trial occur?,Manhattan,Zenger's trial occur –ed on Manhattan.,Manhattan,? on what island did Zengers trial occur,Manhattan,Zengers trial occur -ed on Manhattan,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cede40aab44d1400b88b72, Zengers trial...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
88,94,56cede40aab44d1400b88b73,In what year was Columbia University chartered?,In what year was Columbia University charter –ed?,1754,Columbia University was charter –ed in 1754.,1754,? in what year was Columbia-University charter...,1754,Columbia-University was charter -ed in 1754,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b73, Columbia-Univ...",1754,"[-0.18632932, -0.00466555, 0.0492731, 0.002800...","[-0.18632932, -0.00466555, 0.0492731, 0.002800...",0.0,True,True
103,109,56cee30faab44d1400b88bf2,Who was the British representative at the Conf...,Who was the British representative at the Conf...,Lord Howe,The British representative at the Conference H...,Lord Howe,? who was the British representative at the Co...,Lord-Howe,the British representative at the Conference-H...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56cee30faab44d1400b88bf2, the British r...",Lord-Howe,"[-0.16088948, -0.0116037205, 0.017029908, 0.01...","[-0.16088948, -0.0116037205, 0.017029908, 0.01...",0.0,True,True
124,131,56cee4d1aab44d1400b88c10,In what year did the Erie Canal finish building?,In what year did the Erie Canal finish build –...,1825,The Erie Canal finish –ed build –ing in 1825.,1825,? in what year did the Erie-Canal finish build...,1825,the Erie-Canal finish -ed build -ing in 1825,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee4d1aab44d1400b88c10, the Erie-Cana...",1825,"[-0.18570255, -0.004654048, 0.03343313, -0.004...","[-0.18570255, -0.004654048, 0.03343313, -0.004...",0.0,True,True
279,299,56cf3297aab44d1400b88e13,In what borough is Fort Hamilton located?,In what borough is Fort Hamilton locate –d?,Brooklyn,Fort Hamilton is locate –d in Brooklyn.,Brooklyn,? in what borough is Fort-Hamilton locate -d,Brooklyn,Fort-Hamilton is locate -d in Brooklyn,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cf3297aab44d1400b88e13, Fort-Hamilton...",Brooklyn,"[-0.21150173, -0.015448935, 0.01512959, -0.005...","[-0.21150173, -0.015448935, 0.01512959, -0.005...",0.0,True,True
286,306,56cf331faab44d1400b88e1b,What was the population of New York City in 2014?,What was the populate –ion of New York City in...,8491079,The populate –ion of New York City in 2014 was...,8491079,? what was the populate -ion of New-York-City ...,8491079,the populate -ion of New-York-City in 2014 was...,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cf331faab44d1400b88e1b, the populate ...",8491079,"[-0.16185391, 0.009814939, 0.027025407, 0.0109...","[-0.16185391, 0.009814939, 0.027025407, 0.0109...",0.0,True,True


In [21]:
# Rows flagged by any_word_match.
any_word_mask = response_formatted_df["test_answer_any_matching_word"]
any_matches = response_formatted_df.loc[any_word_mask]
any_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
14,14,56cf9d81234ae51400d9be1e,What is the population of New York City as of ...,What is the populate –ion of New York City as ...,8491079,The populate –ion of New York City as of 2014 ...,8491079,? what is the populate -ion of New-York-City a...,8491079,the populate -ion of New-York-City as of 2014 ...,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cf9d81234ae51400d9be1e, the populate ...",8491079,"[-0.16185391, 0.009814939, 0.027025407, 0.0109...","[-0.16185391, 0.009814939, 0.027025407, 0.0109...",0.0,True,True
40,41,56ce34c7aab44d1400b88595,Who commanded the Spanish expedition?,Who command –ed the Spanish expedite –ion?,Estêvão Gomes,Estêvão Gomes command –ed the Spanish expedite...,Estêvão Gomes,? who command -ed the Spanish expedite -ion,Estevao-Gomes,Estevao-Gomes command -ed the Spanish expedite...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce34c7aab44d1400b88595, Estevao-Gomes...",Estevao-Gomes,"[-0.18101516, 0.017185545, 0.03445143, -0.0088...","[-0.18101516, 0.017185545, 0.03445143, -0.0088...",0.0,True,True
61,65,56cedbb9aab44d1400b88b13,In what year did the Dutch buy Manhattan?,In what year did the Dutch buy Manhattan?,1626,The Dutch bought Manhattan in 1626.,1626,? in what year did the Dutch buy Manhattan,1626,the Dutch bought Manhattan in 1626,False,Wh-Adverbial Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56cedbb9aab44d1400b88b13, the Dutch bou...",1626,"[-0.18711904, 0.0031177902, 0.024501463, -0.01...","[-0.18711904, 0.0031177902, 0.024501463, -0.01...",0.0,True,True
86,92,56cede40aab44d1400b88b71,In what year was John Peter Zenger tried?,In what year was John Peter Zenger try –ed?,1735,John Peter Zenger was try –ed in 1735.,1735,? in what year was John-Peter-Zenger try -ed,1735,John-Peter-Zenger was try -ed in 1735,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b71, John-Peter-Ze...",1735,"[-0.19004145, -0.008742259, 0.05413161, 0.0107...","[-0.19004145, -0.008742259, 0.05413161, 0.0107...",0.0,True,True
87,93,56cede40aab44d1400b88b72,On what island did Zenger's trial occur?,On what island did Zenger's trial occur?,Manhattan,Zenger's trial occur –ed on Manhattan.,Manhattan,? on what island did Zengers trial occur,Manhattan,Zengers trial occur -ed on Manhattan,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cede40aab44d1400b88b72, Zengers trial...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
88,94,56cede40aab44d1400b88b73,In what year was Columbia University chartered?,In what year was Columbia University charter –ed?,1754,Columbia University was charter –ed in 1754.,1754,? in what year was Columbia-University charter...,1754,Columbia-University was charter -ed in 1754,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b73, Columbia-Univ...",1754,"[-0.18632932, -0.00466555, 0.0492731, 0.002800...","[-0.18632932, -0.00466555, 0.0492731, 0.002800...",0.0,True,True
103,109,56cee30faab44d1400b88bf2,Who was the British representative at the Conf...,Who was the British representative at the Conf...,Lord Howe,The British representative at the Conference H...,Lord Howe,? who was the British representative at the Co...,Lord-Howe,the British representative at the Conference-H...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56cee30faab44d1400b88bf2, the British r...",Lord-Howe,"[-0.16088948, -0.0116037205, 0.017029908, 0.01...","[-0.16088948, -0.0116037205, 0.017029908, 0.01...",0.0,True,True
124,131,56cee4d1aab44d1400b88c10,In what year did the Erie Canal finish building?,In what year did the Erie Canal finish build –...,1825,The Erie Canal finish –ed build –ing in 1825.,1825,? in what year did the Erie-Canal finish build...,1825,the Erie-Canal finish -ed build -ing in 1825,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee4d1aab44d1400b88c10, the Erie-Cana...",1825,"[-0.18570255, -0.004654048, 0.03343313, -0.004...","[-0.18570255, -0.004654048, 0.03343313, -0.004...",0.0,True,True
261,279,56cf306baab44d1400b88dea,What movement is the Stonewall Inn most famous...,What move –ment is the Stonewall Inn most famo...,gay rights movement,The Stonewall Inn is most famous –ly associate...,gay rights movement,? what move -ment is the Stonewall-Inn most fa...,gay rights movement,the Stonewall-Inn is most famous -ly associate...,False,Wh-Object/Complement Question,Passive Construction,"[# ID: 56cf306baab44d1400b88dea, the Stonewall...",gay,"[-0.21181943, -0.0070518795, 0.018363109, -0.0...","[-0.20972589, -0.012156877, 0.024234217, 0.004...",0.039101,False,True
279,299,56cf3297aab44d1400b88e13,In what borough is Fort Hamilton located?,In what borough is Fort Hamilton locate –d?,Brooklyn,Fort Hamilton is locate –d in Brooklyn.,Brooklyn,? in what borough is Fort-Hamilton locate -d,Brooklyn,Fort-Hamilton is locate -d in Brooklyn,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cf3297aab44d1400b88e13, Fort-Hamilton...",Brooklyn,"[-0.21150173, -0.015448935, 0.01512959, -0.005...","[-0.21150173, -0.015448935, 0.01512959, -0.005...",0.0,True,True


In [22]:
# Keep only the rows whose "test_answer_correct" flag is False, then display them
# as the cell output.
incorrect_matches = response_formatted_df.loc[
    lambda results: ~results["test_answer_correct"]
]
incorrect_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56ce304daab44d1400b8850e, the city in t...",William-III,"[-0.20163444, -0.0094867805, 0.022896027, -0.0...","[-0.19727688, 0.0001464649, 0.026875416, 0.000...",0.113534,False,False
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",in,"[-0.20956264, -0.0064942883, 0.035192005, -0.0...","[-0.19727688, 0.0001464649, 0.026875416, 0.000...",0.099051,False,False
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction,"[# ID: 56ce304daab44d1400b88510, New-York has ...",One-World-Trade-Center,"[-0.16668645, 0.012506176, 0.05196567, 0.00260...","[-0.19727688, 0.0001464649, 0.026875416, 0.000...",0.207564,False,False
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce304daab44d1400b88511, New-York is t...",90 percent,"[-0.1786337, 0.020912088, 0.0043227947, 0.0069...","[-0.19727688, 0.0001464649, 0.026875416, 0.000...",0.190136,False,False
4,4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New-York-City,the major gateway for immigrate -ion has been ...,False,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89130, the major gat...",the,"[-0.19715294, 0.0027368844, 0.017215747, 0.004...","[-0.19554919, 0.006619052, 0.039511867, -0.000...",0.145936,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56d1204617492d1400aab9fd, the New-York-...",NO ANSWER GIVEN,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.180551,False,False
666,725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba1f, New-York-City...",NO ANSWER GIVEN,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.19078888, -0.014057412, -0.002494617, 0.01...",0.262505,False,False
667,726,56d1218c17492d1400aaba20,How much more money does the city give to the ...,How much more money does the city give to the ...,$ 11 billion,The city give –s $ 11 billion more money to th...,$11 billion,? how much more money does the city give to th...,11 billion,the city give -s 11 billion more money to the...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba20, the city give...",14 mile -s,"[-0.18709867, -0.0013902837, 0.033759546, 0.01...","[-0.16274187, -0.02008049, 0.05266985, 0.00129...",0.217211,False,False
668,727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba21, New-York-City...",Exploitation number of updates >= 4000,"[-0.15502132, 0.028481387, 0.009052947, 0.0581...","[-0.13401222, -0.0122689605, 0.04238571, 0.012...",0.390273,False,False


In [23]:
# Export the full results dataframe to a TSV file and write a plain-text summary
# report (headline numbers followed by markdown tables) next to it.

# Take the timestamp once so both output files carry the same suffix; two separate
# datetime.now() calls can land on either side of a second boundary.
results_timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

tsv_results_filename = "test_nyc_results" + results_timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, tsv_results_filename)
response_formatted_df.to_csv(tsv_results_filepath, sep="\t", index=False)

results_summary_filename = "test_nyc_results_summary" + results_timestamp + ".txt"
results_summary_filepath = os.path.join(test_results_dir, results_summary_filename)

# Column selections shared by the summary tables.
answer_columns = ["response_question", "response_answer", "test_answer"]
answer_distance_columns = answer_columns + ["test_answer_cosine_distance"]

# Each entry is (heading text, dataframe, columns to show); the tables are written in
# this order. Every heading starts with "\n" so it begins on a fresh line after the
# previous table.
# NOTE(review): the last two headings describe filters that are applied where these
# dataframes are built (outside this cell) — confirm that close_cosine_distance_correct_df
# really is the "any word match" subset and that correct_matches is the intended source
# for the final "exact match" table.
summary_sections = [
    ("\nRows with exact matches:\n",
     correct_matches, answer_columns),
    ("\nRows with any word matches:\n",
     any_matches, answer_columns),
    (f"\nRows with cosine distance less than {cosine_distance_threshold}:\n",
     close_cosine_distance_df, answer_distance_columns),
    (f"\nRows with cosine distance less than {cosine_distance_threshold} and any word match:\n",
     close_cosine_distance_correct_df, answer_distance_columns),
    (f"\nRows with cosine distance less than {cosine_distance_threshold} and exact match:\n",
     correct_matches, answer_distance_columns),
]

# The question/answer text contains non-ASCII characters (e.g. the "–" morpheme marker),
# so the encoding is set explicitly instead of relying on the platform's locale default.
# UTF-8 is also what DataFrame.to_csv uses by default for the TSV above.
with open(results_summary_filepath, 'w', encoding="utf-8") as results_file:
    # Headline numbers, one tab-separated "name<TAB>value" pair per line.
    # The sample count is the number of test input lines halved (true division, so it
    # is written as a float) — presumably two input lines per sample; TODO confirm.
    results_file.write(f"total number of samples\t{len(test_input_lines) / 2}\n")
    results_file.write(f"number_of_test_answers\t{len(response_formatted_df)}\n")
    results_file.write(f"percentage_correct\t{percentage_correct}\n")
    results_file.write(f"percentage_any_word_matches\t{percentage_any_word_matches}\n")
    results_file.write(f"number of test answers longer than 20 words (removed)\t{num_long_answers}\n")

    # One markdown table per section, each preceded by its heading.
    for section_heading, section_df, section_columns in summary_sections:
        results_file.write(section_heading)
        results_file.write(section_df[section_columns].to_markdown(index=False))

print(f"results written to {tsv_results_filepath} and {results_summary_filepath}")

results written to /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/results/test_nyc_results_20251010_094532.tsv and /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/11/results/test_nyc_results_summary_20251010_094532.txt
