# This notebook is used to test the ANNABELL model on the SQuAD dataset.


In [8]:
from dataset_processing import question_and_answer_pairs_from_log_file, any_word_match, embedding_for_sentence, \
    cosine_distance, ids_questions_answers_from_log_file
import os
import platform
import datetime
import pandas as pd
from tqdm import tqdm

In [31]:
# --- Experiment configuration ------------------------------------------------
experiment_number = 9
operating_system = platform.system()
if operating_system == 'Windows':
    raise Exception("not used on Windows yet")
elif operating_system == 'Linux':
    base_directory = "/home/chris/gdrive/work/annabell"
elif operating_system == 'Darwin':  # macOS
    base_directory = "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/"
else:
    raise Exception("unsupported OS")

# Suffix used for the answer-summary file name built below.
timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

# Per-experiment directories; the dataset directory is shared between experiments.
experiment_dir = os.path.join(base_directory, "experiments", str(experiment_number))
test_input_dir = os.path.join(experiment_dir, "testing")
test_log_dir = os.path.join(experiment_dir, "logs")
data_dir = os.path.join(base_directory, "experiments/data")
test_results_dir = os.path.join(experiment_dir, "results")

test_log_filename = "test_nyc_samples_log_2025-09-29_14-00-53.txt"
test_input_filename = "nyc_squad_testing_commands_20250929_075818.txt"
dataset_filename = "nyc_squad_with_pretraining_commands_20250928_094914.jsonl"

test_log_filepath = os.path.join(test_log_dir, test_log_filename)
test_input_filepath = os.path.join(test_input_dir, test_input_filename)
dataset_filepath = os.path.join(data_dir, dataset_filename)

test_answer_summary_filename = "test_nyc_answer_summary" + timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, test_answer_summary_filename)

# --- Load the logged answers ---------------------------------------------------
# Each entry is unpacked as (id, question, answer); a missing or empty answer is
# replaced by an explicit placeholder so it stays visible in the results.
ids_questions_answers = [
    (the_id, question, "NO ANSWER GIVEN" if answer is None or answer == "" else answer)
    for the_id, question, answer in ids_questions_answers_from_log_file(test_log_filepath)
]

print("length of log file questions and answers: " + str(len(ids_questions_answers)))
with open(test_input_filepath, 'r') as test_input_file:
    test_input_lines = test_input_file.readlines()

# --- Attach the logged answers to the dataset rows -----------------------------
response_formatted_df = pd.read_json(dataset_filepath, lines=True)

# Build the id -> answer lookup once (a later log entry for the same id wins) and
# apply it in a single pass instead of scanning the whole id column per log entry.
answers_by_id = {the_id: answer for the_id, _question, answer in ids_questions_answers}
dataset_ids = set(response_formatted_df["id"])
questions_not_found = [
    question for the_id, question, _answer in ids_questions_answers if the_id not in dataset_ids
]
# Rows whose id has no logged answer get NaN. The column is always created, so the
# dropna below cannot fail with a KeyError when no id matches.
response_formatted_df["test_answer"] = response_formatted_df["id"].map(answers_by_id)

print(f"number of test samples not found in training data: {len(questions_not_found)}")
print("test samples not found in training data: " + str(questions_not_found[:5]) + " ...")

# Keep only the rows that were actually tested. reset_index() deliberately keeps the
# old row labels as an "index" column, so the exported column layout is unchanged.
response_formatted_df = response_formatted_df.dropna(subset=["test_answer"]).reset_index()
response_formatted_df

length of log file questions and answers: 527
number of test samples not found in training data: 0
test samples not found in training data: [] ...


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,created_commands,test_answer
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United States has the high ...,New York,the city in the United States with the high -e...,False,"[# ID: 56ce304daab44d1400b8850e, the city in t...",Exploitation number of updates >= 4000
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United Nations base -d,New York,the United Nations is base -d in New York,False,"[# ID: 56ce304daab44d1400b8850f, the United Na...",NO ANSWER GIVEN
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,False,"[# ID: 56ce304daab44d1400b88510, New York has ...",New York
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New York,New York is the American city that welcome -s ...,False,"[# ID: 56ce304daab44d1400b88511, New York is t...",Exploitation number of updates >= 4000
4,4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New York City,the major gateway for immigrate -ion has been ...,False,"[# ID: 56cf5d41aab44d1400b89130, the major gat...",1 point 95 billion
...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,624,56d10e4617492d1400aab841,How many Major League baseball league pennants...,How many Major League baseball league pennant ...,73,New York team –s have won 73 Major League base...,73,? how many Major League baseball league pennan...,73,New York team -s have won 73 Major League base...,False,"[# ID: 56d10e4617492d1400aab841, New York team...",NO ANSWER GIVEN
523,627,56d10e4617492d1400aab844,What is the nickname for a World Series where ...,What is the nickname for a World Series where ...,Subway Series,The nickname for a World Series where two New ...,Subway Series,? what is the nickname for a World Series wher...,Subway Series,the nickname for a World Series where two New ...,False,"[# ID: 56d10e4617492d1400aab844, the nickname ...",NO ANSWER GIVEN
524,628,56d005a1234ae51400d9c27e,The New York Giants and the New York Jets plac...,The New York Giants and the New York Jets plac...,MetLife Stadium,The New York Giants and the New York Jets play...,MetLife Stadium,? the New York Giants and the New York Jets pl...,MetLife Stadium,the New York Giants and the New York Jets play...,False,"[# ID: 56d005a1234ae51400d9c27e, the New York ...",NO ANSWER GIVEN
525,629,56d005a1234ae51400d9c27f,When was the most recent superbowl held in NYC...,When was the most recent superbowl held in NYC...,2014,The most recent superbowl held in NYC for foot...,2014,? when was the most recent superbowl held in N...,2014,the most recent superbowl held in NYC for foot...,False,"[# ID: 56d005a1234ae51400d9c27f, the most rece...",NO ANSWER GIVEN


In [32]:
# Embed the model's answer and the reference answer for every row; the next cell
# compares the two embeddings row by row.
def _embedding_or_none(sentence):
    """Return embedding_for_sentence(sentence), or None when the value is null."""
    if pd.notnull(sentence):
        return embedding_for_sentence(sentence)
    return None


for progress_label, text_column, embedding_column in (
    ("Generating test answer embeddings", "test_answer", "test_answer_embedding"),
    ("Generating response answer embeddings", "response_answer_formatted",
     "response_answer_formatted_embedding"),
):
    # tqdm.pandas() is called again for each column so the progress bar shows its own label.
    tqdm.pandas(desc=progress_label)
    response_formatted_df[embedding_column] = response_formatted_df[text_column].progress_apply(
        _embedding_or_none)

Generating test answer embeddings: 100%|██████████| 527/527 [01:03<00:00,  8.24it/s]
Generating response answer embeddings: 100%|██████████| 527/527 [01:00<00:00,  8.77it/s]


In [33]:
# Row-wise distance between the two embedding columns created in the previous cell.
# NOTE(review): cosine_distance comes from dataset_processing; which columns it reads
# is not visible here - confirm against that module.
row_cosine_distances = response_formatted_df.apply(cosine_distance, axis=1)
response_formatted_df["test_answer_cosine_distance"] = row_cosine_distances
response_formatted_df

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United States has the high ...,New York,the city in the United States with the high -e...,False,"[# ID: 56ce304daab44d1400b8850e, the city in t...",Exploitation number of updates >= 4000,"[-0.15502132, 0.028481387, 0.009052947, 0.0581...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.309620
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United Nations base -d,New York,the United Nations is base -d in New York,False,"[# ID: 56ce304daab44d1400b8850f, the United Na...",NO ANSWER GIVEN,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.168470
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,False,"[# ID: 56ce304daab44d1400b88510, New York has ...",New York,"[-0.2009252, 0.0065351515, 0.024541078, -0.001...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.000000
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New York,New York is the American city that welcome -s ...,False,"[# ID: 56ce304daab44d1400b88511, New York is t...",Exploitation number of updates >= 4000,"[-0.15502132, 0.028481387, 0.009052947, 0.0581...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.309620
4,4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New York City,the major gateway for immigrate -ion has been ...,False,"[# ID: 56cf5d41aab44d1400b89130, the major gat...",1 point 95 billion,"[-0.16469762, 0.016392894, 0.043153655, -0.001...","[-0.1971769, 0.014245291, 0.030869555, 0.00048...",0.219910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,624,56d10e4617492d1400aab841,How many Major League baseball league pennants...,How many Major League baseball league pennant ...,73,New York team –s have won 73 Major League base...,73,? how many Major League baseball league pennan...,73,New York team -s have won 73 Major League base...,False,"[# ID: 56d10e4617492d1400aab841, New York team...",NO ANSWER GIVEN,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.19190602, -0.00080314564, 0.007982052, 0.0...",0.193329
523,627,56d10e4617492d1400aab844,What is the nickname for a World Series where ...,What is the nickname for a World Series where ...,Subway Series,The nickname for a World Series where two New ...,Subway Series,? what is the nickname for a World Series wher...,Subway Series,the nickname for a World Series where two New ...,False,"[# ID: 56d10e4617492d1400aab844, the nickname ...",NO ANSWER GIVEN,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.203238, 0.00032487154, -0.00096490426, -0....",0.199532
524,628,56d005a1234ae51400d9c27e,The New York Giants and the New York Jets plac...,The New York Giants and the New York Jets plac...,MetLife Stadium,The New York Giants and the New York Jets play...,MetLife Stadium,? the New York Giants and the New York Jets pl...,MetLife Stadium,the New York Giants and the New York Jets play...,False,"[# ID: 56d005a1234ae51400d9c27e, the New York ...",NO ANSWER GIVEN,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.19918741, -0.006811507, 0.034609053, 0.003...",0.184822
525,629,56d005a1234ae51400d9c27f,When was the most recent superbowl held in NYC...,When was the most recent superbowl held in NYC...,2014,The most recent superbowl held in NYC for foot...,2014,? when was the most recent superbowl held in N...,2014,the most recent superbowl held in NYC for foot...,False,"[# ID: 56d005a1234ae51400d9c27f, the most rece...",NO ANSWER GIVEN,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.18289578, 0.012464214, 0.0044358545, 0.011...",0.212417


In [34]:
# Frequency table of the distinct answers the model produced.
answer_counts = response_formatted_df['test_answer'].value_counts()
test_answer_summary = answer_counts.reset_index()
# Give the two columns explicit names.
test_answer_summary.columns = ['test_answer', 'count']
# Most frequent answers first.
test_answer_summary = test_answer_summary.sort_values(by='count', ascending=False)
test_answer_summary

Unnamed: 0,test_answer,count
0,NO ANSWER GIVEN,169
1,Exploitation number of updates >= 4000,77
2,Lenapehoking,14
3,200000,12
4,Queens Borough Public Library,12
...,...,...
69,was La,1
70,the state,1
71,146 people,1
73,21 million,1


In [35]:
def _percent_of_total(part, total):
    """Return part / total as a percentage, or NaN when total is zero."""
    return part / total * 100 if total else float("nan")


# Write the answer frequency table to a TSV file.
# The path is rebuilt from the config cell's directory and file name instead of reading
# `tsv_results_filepath`: the export cell at the end of the notebook rebinds that name to
# the full-results file, so re-running this cell afterwards would overwrite that file.
os.makedirs(test_results_dir, exist_ok=True)
test_answer_summary_filepath = os.path.join(test_results_dir, test_answer_summary_filename)
test_answer_summary.to_csv(test_answer_summary_filepath, sep="\t", index=False)

total_rows = len(response_formatted_df)

# Count the test answers that are longer than 20 words.
num_long_answers = response_formatted_df["test_answer"].apply(
    lambda x: len(x.split()) > 20 if pd.notnull(x) else False).sum()
print(f"number of test answers longer than 20 words: {num_long_answers}")

# Exact match: the logged answer is string-identical to the formatted reference answer.
response_formatted_df["test_answer_correct"] = (
    response_formatted_df["test_answer"] == response_formatted_df["response_answer_formatted"]
)
number_correct = response_formatted_df["test_answer_correct"].sum()
print(f"number correct = {number_correct} out of {total_rows}")
percentage_correct = response_formatted_df["test_answer_correct"].mean() * 100
print(f"percentage correct = {percentage_correct} %")

# Partial match, decided row by row by any_word_match.
# NOTE(review): any_word_match comes from dataset_processing; its exact matching rule is
# not visible here - confirm against that module.
response_formatted_df["test_answer_any_matching_word"] = response_formatted_df.apply(any_word_match, axis=1)
percentage_any_word_matches = response_formatted_df["test_answer_any_matching_word"].mean() * 100
number_any_word_matches = response_formatted_df["test_answer_any_matching_word"].sum()
print(f"number any word matches = {number_any_word_matches} out of {total_rows}")
print(f"percentage any word matches = {percentage_any_word_matches} %")

cosine_distance_threshold = 0.1
# Rows whose cosine distance is below the threshold.
close_cosine_distance_df = response_formatted_df[
    response_formatted_df["test_answer_cosine_distance"] < cosine_distance_threshold]
print(f"number of rows with cosine distance less than {cosine_distance_threshold}: {len(close_cosine_distance_df)}")
print(f"percentage of total: {_percent_of_total(len(close_cosine_distance_df), total_rows)} %")

# Rows that are below the threshold AND share at least one word with the reference answer.
close_cosine_distance_correct_df = close_cosine_distance_df[
    close_cosine_distance_df["test_answer_any_matching_word"]]
print(
    f"number of rows with cosine distance less than {cosine_distance_threshold} and any matching answer correct: {len(close_cosine_distance_correct_df)}")
print(f"percentage of total: {_percent_of_total(len(close_cosine_distance_correct_df), total_rows)} %")

close_cosine_distance_correct_df

number of test answers longer than 20 words: 0
number correct = 10 out of 527
percentage correct = 1.8975332068311195 %
number any word matches = 27 out of 527
percentage any word matches = 5.1233396584440225 %
number of rows with cosine distance less than 0.1: 64
percentage of total: 12.144212523719165 %
number of rows with cosine distance less than 0.1 and any matching answer correct: 17
percentage of total: 3.225806451612903 %


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,False,"[# ID: 56ce304daab44d1400b88510, New York has ...",New York,"[-0.2009252, 0.0065351515, 0.024541078, -0.001...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.0,True,True
26,27,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall Street locate -d,Manhattan,Wall Street is locate -d in the borough of Man...,False,"[# ID: 56ce32e7aab44d1400b88552, Wall Street i...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
38,44,56ce34c7aab44d1400b88596,What was the nationality of Estêvão Gomes?,What was the national –ity of Estêvão Gomes?,Portuguese,The national –ity of Estêvão Gomes was Portugu...,Portuguese,? what was the national -ity of Estevao Gomes,Portuguese,the national -ity of Estevao Gomes was Portuguese,False,"[# ID: 56ce34c7aab44d1400b88596, the national ...",Portuguese,"[-0.19723552, -0.017073331, 0.018096691, 0.000...","[-0.19723552, -0.017073331, 0.018096691, 0.000...",0.0,True,True
75,91,56ceddd9aab44d1400b88b60,"In 1730, what American city had the highest pe...","In 1730, what American city had the high –est ...","Charleston, South Carolina","In 1730, Charleston, South Carolina had the hi...","Charleston, South Carolina",? in 1730 what American city had the high -est...,Charleston South Carolina,in 1730 Charleston South Carolina had the high...,False,"[# ID: 56ceddd9aab44d1400b88b60, in 1730 Charl...",Charleston South,"[-0.20465769, -0.026038276, 0.017783925, 0.012...","[-0.20349422, -0.02054222, 0.014909617, 0.0118...",0.005934,False,True
81,98,56cede40aab44d1400b88b73,In what year was Columbia University chartered?,In what year was Columbia University charter –ed?,1754,Columbia University was charter –ed in 1754.,1754,? in what year was Columbia University charter...,1754,Columbia University was charter -ed in 1754,False,"[# ID: 56cede40aab44d1400b88b73, Columbia Univ...",in 1754,"[-0.15577044, -0.004608084, 0.05936995, -0.022...","[-0.18632932, -0.00466555, 0.0492731, 0.002800...",0.08415,False,True
157,181,56cee90caab44d1400b88c88,In what borough did the Stonewall riots happen?,In what borough did the Stonewall riots happen?,Manhattan,The Stonewall riots happen –ed in Manhattan.,Manhattan,? in what borough did the Stonewall riots happen,Manhattan,the Stonewall riots happen -ed in Manhattan,False,"[# ID: 56cee90caab44d1400b88c88, the Stonewall...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
166,193,56ceeb94aab44d1400b88cb4,On what date did the World Trade Center PATH b...,On what date did the World Trade Center PATH b...,"July 19, 1909",The World Trade Center PATH began operate –ion...,"July 19, 1909",? on what date did the World Trade Center PATH...,July 19 1909,the World Trade Center PATH began operate -ion...,False,"[# ID: 56ceeb94aab44d1400b88cb4, the World Tra...",July 19 1909,"[-0.1697763, 0.030600576, 0.010123951, -0.0168...","[-0.1697763, 0.030600576, 0.010123951, -0.0168...",0.0,True,True
181,216,56cef613aab44d1400b88d2c,What river flows between the Hudson and East R...,What river flow –s between the Hudson and East...,Harlem River,The Harlem River flow –s between the Hudson an...,Harlem River,? what river flow -s between the Hudson and Ea...,Harlem River,the Harlem River flow -s between the Hudson an...,False,"[# ID: 56cef613aab44d1400b88d2c, the Harlem Ri...",Hudson River,"[-0.21610382, -0.009038954, 0.0075779464, 0.02...","[-0.20373935, -0.02124149, 0.04380335, 0.01637...",0.064366,False,True
215,253,56cfe201234ae51400d9bffe,When was the Empire State Building constructed?,When was the Empire State Building construct –ed?,1931,The Empire State Building was construct –ed in...,1931,? when was the Empire State Building construct...,1931,the Empire State Building was construct -ed in...,False,"[# ID: 56cfe201234ae51400d9bffe, the Empire St...",1931,"[-0.19556679, 0.011965578, 0.045552537, -0.001...","[-0.19556679, 0.011965578, 0.045552537, -0.001...",0.0,True,True
233,279,56cf2b33aab44d1400b88db0,What is New York City's daily January mean tem...,What is New York City's dai –ly January mean t...,0.3,New York City's dai –ly January mean temperatu...,0.3,? what is New York Citys dai -ly January mean ...,0 point 3,New York Citys dai -ly January mean temperatur...,False,"[# ID: 56cf2b33aab44d1400b88db0, New York City...",0 point 5,"[-0.1849458, 0.013786762, 0.040066436, 0.00974...","[-0.18568538, 0.015396277, 0.039072547, 0.0084...",0.07196,False,True


In [36]:
# Rows where the logged answer is string-identical to the formatted reference answer.
is_exact_match = response_formatted_df["test_answer_correct"]
correct_matches = response_formatted_df.loc[is_exact_match]
correct_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,False,"[# ID: 56ce304daab44d1400b88510, New York has ...",New York,"[-0.2009252, 0.0065351515, 0.024541078, -0.001...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.0,True,True
26,27,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall Street locate -d,Manhattan,Wall Street is locate -d in the borough of Man...,False,"[# ID: 56ce32e7aab44d1400b88552, Wall Street i...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
38,44,56ce34c7aab44d1400b88596,What was the nationality of Estêvão Gomes?,What was the national –ity of Estêvão Gomes?,Portuguese,The national –ity of Estêvão Gomes was Portugu...,Portuguese,? what was the national -ity of Estevao Gomes,Portuguese,the national -ity of Estevao Gomes was Portuguese,False,"[# ID: 56ce34c7aab44d1400b88596, the national ...",Portuguese,"[-0.19723552, -0.017073331, 0.018096691, 0.000...","[-0.19723552, -0.017073331, 0.018096691, 0.000...",0.0,True,True
157,181,56cee90caab44d1400b88c88,In what borough did the Stonewall riots happen?,In what borough did the Stonewall riots happen?,Manhattan,The Stonewall riots happen –ed in Manhattan.,Manhattan,? in what borough did the Stonewall riots happen,Manhattan,the Stonewall riots happen -ed in Manhattan,False,"[# ID: 56cee90caab44d1400b88c88, the Stonewall...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
166,193,56ceeb94aab44d1400b88cb4,On what date did the World Trade Center PATH b...,On what date did the World Trade Center PATH b...,"July 19, 1909",The World Trade Center PATH began operate –ion...,"July 19, 1909",? on what date did the World Trade Center PATH...,July 19 1909,the World Trade Center PATH began operate -ion...,False,"[# ID: 56ceeb94aab44d1400b88cb4, the World Tra...",July 19 1909,"[-0.1697763, 0.030600576, 0.010123951, -0.0168...","[-0.1697763, 0.030600576, 0.010123951, -0.0168...",0.0,True,True
215,253,56cfe201234ae51400d9bffe,When was the Empire State Building constructed?,When was the Empire State Building construct –ed?,1931,The Empire State Building was construct –ed in...,1931,? when was the Empire State Building construct...,1931,the Empire State Building was construct -ed in...,False,"[# ID: 56cfe201234ae51400d9bffe, the Empire St...",1931,"[-0.19556679, 0.011965578, 0.045552537, -0.001...","[-0.19556679, 0.011965578, 0.045552537, -0.001...",0.0,True,True
286,340,56cfe890234ae51400d9c07d,What is the population of NYC as of 2014?,What is the populate –ion of NYC as of 2014?,8491079,"The populate –ion of NYC as of 2014 is 8,491,079.",8491079,? what is the populate -ion of NYC as of 2014,8491079,the populate -ion of NYC as of 2014 is 8491079,False,"[# ID: 56cfe890234ae51400d9c07d, the populate ...",8491079,"[-0.16185391, 0.009814939, 0.027025407, 0.0109...","[-0.16185391, 0.009814939, 0.027025407, 0.0109...",0.0,True,True
368,435,56cf4a29aab44d1400b88f58,In what borough is Godiva based?,In what borough is Godiva base –d?,Manhattan,Godiva is base –d in Manhattan.,Manhattan,? in what borough is Godiva base -d,Manhattan,Godiva is base -d in Manhattan,False,"[# ID: 56cf4a29aab44d1400b88f58, Godiva is bas...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
386,455,56cf55aeaab44d1400b89030,In what borough is Silicon Alley located?,In what borough is Silicon Alley locate –d?,Manhattan,Silicon Alley is locate –d in the borough of M...,Manhattan,? in what borough is Silicon Alley locate -d,Manhattan,Silicon Alley is locate -d in the borough of M...,False,"[# ID: 56cf55aeaab44d1400b89030, Silicon Alley...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
507,601,56d003cd234ae51400d9c257,How many people attended Broadway shows during...,How many people attend –ed Broadway show –s du...,12.21 million,12.21 million people attend –ed Broadway show ...,12.21 million,? how many people attend -ed Broadway show -s ...,12 point 21 million,12 point 21 million people attend -ed Broadway...,False,"[# ID: 56d003cd234ae51400d9c257, 12 point 21 m...",12 point 21 million,"[-0.158016, 0.004409022, 0.038073786, 0.009596...","[-0.158016, 0.004409022, 0.038073786, 0.009596...",0.0,True,True


In [38]:
# Rows flagged by the any-word-match column computed in the metrics cell.
has_any_word_match = response_formatted_df["test_answer_any_matching_word"]
any_matches = response_formatted_df.loc[has_any_word_match]
any_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,False,"[# ID: 56ce304daab44d1400b88510, New York has ...",New York,"[-0.2009252, 0.0065351515, 0.024541078, -0.001...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.0,True,True
26,27,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall Street locate -d,Manhattan,Wall Street is locate -d in the borough of Man...,False,"[# ID: 56ce32e7aab44d1400b88552, Wall Street i...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
38,44,56ce34c7aab44d1400b88596,What was the nationality of Estêvão Gomes?,What was the national –ity of Estêvão Gomes?,Portuguese,The national –ity of Estêvão Gomes was Portugu...,Portuguese,? what was the national -ity of Estevao Gomes,Portuguese,the national -ity of Estevao Gomes was Portuguese,False,"[# ID: 56ce34c7aab44d1400b88596, the national ...",Portuguese,"[-0.19723552, -0.017073331, 0.018096691, 0.000...","[-0.19723552, -0.017073331, 0.018096691, 0.000...",0.0,True,True
75,91,56ceddd9aab44d1400b88b60,"In 1730, what American city had the highest pe...","In 1730, what American city had the high –est ...","Charleston, South Carolina","In 1730, Charleston, South Carolina had the hi...","Charleston, South Carolina",? in 1730 what American city had the high -est...,Charleston South Carolina,in 1730 Charleston South Carolina had the high...,False,"[# ID: 56ceddd9aab44d1400b88b60, in 1730 Charl...",Charleston South,"[-0.20465769, -0.026038276, 0.017783925, 0.012...","[-0.20349422, -0.02054222, 0.014909617, 0.0118...",0.005934,False,True
81,98,56cede40aab44d1400b88b73,In what year was Columbia University chartered?,In what year was Columbia University charter –ed?,1754,Columbia University was charter –ed in 1754.,1754,? in what year was Columbia University charter...,1754,Columbia University was charter -ed in 1754,False,"[# ID: 56cede40aab44d1400b88b73, Columbia Univ...",in 1754,"[-0.15577044, -0.004608084, 0.05936995, -0.022...","[-0.18632932, -0.00466555, 0.0492731, 0.002800...",0.08415,False,True
125,146,56cee5a1aab44d1400b88c26,"In 1860, what percentage of the city populatio...","In 1860, what percentage of the city populate ...",25%,"In 1860, 25% of the city's populate –ion was c...",25%,? in 1860 what percentage of the city populate...,25 percent,in 1860 25 percent of the citys populate -ion ...,False,"[# ID: 56cee5a1aab44d1400b88c26, in 1860 25 pe...",11 point 4 percent,"[-0.15624279, -0.015117809, 0.031368643, 0.021...","[-0.17993657, -0.0011515665, -0.0049429797, 0....",0.207593,False,True
157,181,56cee90caab44d1400b88c88,In what borough did the Stonewall riots happen?,In what borough did the Stonewall riots happen?,Manhattan,The Stonewall riots happen –ed in Manhattan.,Manhattan,? in what borough did the Stonewall riots happen,Manhattan,the Stonewall riots happen -ed in Manhattan,False,"[# ID: 56cee90caab44d1400b88c88, the Stonewall...",Manhattan,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0,True,True
166,193,56ceeb94aab44d1400b88cb4,On what date did the World Trade Center PATH b...,On what date did the World Trade Center PATH b...,"July 19, 1909",The World Trade Center PATH began operate –ion...,"July 19, 1909",? on what date did the World Trade Center PATH...,July 19 1909,the World Trade Center PATH began operate -ion...,False,"[# ID: 56ceeb94aab44d1400b88cb4, the World Tra...",July 19 1909,"[-0.1697763, 0.030600576, 0.010123951, -0.0168...","[-0.1697763, 0.030600576, 0.010123951, -0.0168...",0.0,True,True
181,216,56cef613aab44d1400b88d2c,What river flows between the Hudson and East R...,What river flow –s between the Hudson and East...,Harlem River,The Harlem River flow –s between the Hudson an...,Harlem River,? what river flow -s between the Hudson and Ea...,Harlem River,the Harlem River flow -s between the Hudson an...,False,"[# ID: 56cef613aab44d1400b88d2c, the Harlem Ri...",Hudson River,"[-0.21610382, -0.009038954, 0.0075779464, 0.02...","[-0.20373935, -0.02124149, 0.04380335, 0.01637...",0.064366,False,True
184,219,56cef613aab44d1400b88d2f,Between New York City and what city is the Hud...,Between New York City and what city is the Hud...,"Troy, New York",The Hudson River is an estuary between New Yor...,"Troy, New York",? between New York City and what city is the H...,Troy New York,the Hudson River is an estuary between New Yor...,False,"[# ID: 56cef613aab44d1400b88d2f, the Hudson Ri...",the New York Times,"[-0.16780667, 0.015810816, 0.039937757, 0.0117...","[-0.19643416, 0.0020477828, 0.029460974, 0.016...",0.110069,False,True


In [39]:
# Export the full per-question results as TSV and write a plain-text summary report.

# One timestamp for both file names, so the two files of a run always share a suffix
# (two separate now() calls can straddle a second boundary).
results_timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")
os.makedirs(test_results_dir, exist_ok=True)

# NOTE: this rebinds `tsv_results_filepath`, which the config cell first set to the
# answer-summary path.
tsv_results_filename = "test_nyc_results" + results_timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, tsv_results_filename)
response_formatted_df.to_csv(tsv_results_filepath, sep="\t", index=False)

results_summary_filename = "test_nyc_results_summary" + results_timestamp + ".txt"
results_summary_filepath = os.path.join(test_results_dir, results_summary_filename)

qa_columns = ["response_question", "response_answer", "test_answer"]
qa_distance_columns = qa_columns + ["test_answer_cosine_distance"]

# The last report section is labelled "cosine distance less than ... and exact match",
# so apply the distance filter explicitly instead of writing every exact match.
close_exact_matches = correct_matches[
    correct_matches["test_answer_cosine_distance"] < cosine_distance_threshold]

# (heading, rows, columns) for each markdown table, in the order they are written.
report_sections = [
    ("Rows with exact matches", correct_matches, qa_columns),
    ("Rows with any word matches", any_matches, qa_columns),
    (f"Rows with cosine distance less than {cosine_distance_threshold}",
     close_cosine_distance_df, qa_distance_columns),
    (f"Rows with cosine distance less than {cosine_distance_threshold} and any word match",
     close_cosine_distance_correct_df, qa_distance_columns),
    (f"Rows with cosine distance less than {cosine_distance_threshold} and exact match",
     close_exact_matches, qa_distance_columns),
]

# UTF-8 is set explicitly: the question/answer columns contain non-ASCII characters,
# and the platform default encoding is not guaranteed to handle them.
with open(results_summary_filepath, 'w', encoding="utf-8") as results_file:
    # Headline numbers, one tab-separated "label<TAB>value" pair per line.
    # NOTE(review): the division by 2 presumably reflects two command lines per sample
    # in the test input file - confirm against the file format.
    results_file.write(f"total number of samples\t{len(test_input_lines) / 2}\n")
    results_file.write(f"number_of_test_answers\t{len(response_formatted_df)}\n")
    results_file.write(f"percentage_correct\t{percentage_correct}\n")
    results_file.write(f"percentage_any_word_matches\t{percentage_any_word_matches}\n")
    # NOTE(review): the label says "(removed)", but nothing visible in this notebook
    # removes long answers; the label is kept byte-for-byte so the report format is stable.
    results_file.write(f"number of test answers longer than 20 words (removed)\t{num_long_answers}\n")
    # One markdown table per section, each preceded by its heading line.
    for heading, rows, columns in report_sections:
        results_file.write(f"\n{heading}:\n")
        results_file.write(rows[columns].to_markdown(index=False))
print(f"results written to {tsv_results_filepath} and {results_summary_filepath}")

results written to /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/9/results/test_nyc_results_20251001_074112.tsv and /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/9/results/test_nyc_results_summary_20251001_074112.txt
