# This notebook is used to test the ANNABELL model on the SQuAD dataset.


In [1]:
from dataset_processing import question_and_answer_pairs_from_log_file, any_word_match
import pandas as pd
import os
import platform
import datetime

In [2]:
# Notebook configuration: which experiment to analyse and where its files live.
experiment_number = 8

# Resolve the project root for the operating system the kernel is running on.
operating_system = platform.system()
if operating_system == 'Windows':
    raise Exception("not used on Windows yet")

base_directories_by_os = {
    'Linux': "/home/chris/gdrive/work/annabell",
    'Darwin': "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/",  # macOS
}
if operating_system not in base_directories_by_os:
    raise Exception("unsupported OS")
base_directory = base_directories_by_os[operating_system]

# Suffix that makes the output filenames of this run unique, e.g. "_20250925_062932".
timestamp = f"{datetime.datetime.now():_%Y%m%d_%H%M%S}"

# Directories: per-experiment inputs, logs and results, plus the shared data directory.
experiment_dir = os.path.join(base_directory, "experiments", str(experiment_number))
test_input_dir = os.path.join(experiment_dir, "testing")
test_log_dir = os.path.join(experiment_dir, "logs")
data_dir = os.path.join(base_directory, "experiments/data")
test_results_dir = os.path.join(experiment_dir, "results")

# Input files consumed by this run.
test_log_filename = "test_nyc_samples_log_2025-09-24_05-28-54.txt"
test_input_filename = "nyc_declarative_sentences_testing_20250924_062527.txt"
dataset_filename = "response_formatted_20250924_174653.jsonl"

test_log_filepath = os.path.join(test_log_dir, test_log_filename)
test_input_filepath = os.path.join(test_input_dir, test_input_filename)
dataset_filepath = os.path.join(data_dir, dataset_filename)

# Output file for the per-answer frequency summary.
test_answer_summary_filename = f"test_nyc_answer_summary{timestamp}.tsv"
tsv_results_filepath = os.path.join(test_results_dir, test_answer_summary_filename)

In [26]:
# Load the question/answer pairs from the test log, the test input file and the
# LLM-formatted dataset, then attach each logged answer to its dataset row.
question_and_answer_pairs = question_and_answer_pairs_from_log_file(test_log_filepath)
print("length of log file questions and answers: " + str(len(question_and_answer_pairs)))
# Explicit encoding so the read does not depend on the kernel's locale.
with open(test_input_filepath, 'r', encoding="utf-8") as test_input_file:
    test_input_lines = test_input_file.readlines()
print("length of test file questions: " + str(len(test_input_lines)))
#load the LLM response jsonl file into a dataframe
response_formatted_df = pd.read_json(dataset_filepath, lines=True)
#add the test answers to the dataframe
# A single dict lookup per row replaces a full column scan per logged question.
# When a question is logged more than once, the last logged answer wins.
answer_by_question = {question: answer for question, answer in question_and_answer_pairs}
# Rows whose question never appears in the log get NaN. The column is always
# created, even when nothing matches, so the cells below can rely on it.
response_formatted_df["test_answer"] = response_formatted_df["response_question_formatted"].map(answer_by_question)
# Logged questions with no counterpart in the dataset, in log order.
known_questions = set(response_formatted_df["response_question_formatted"])
questions_not_found = [
    question for question, _ in question_and_answer_pairs if question not in known_questions
]
print(f"number of questions not found in training data: {len(questions_not_found)}")
print("questions not found in training data: " + str(questions_not_found[:5]) + " ...")
response_formatted_df.head()

length of log file questions and answers: 829
length of log file questions and answers: 829
length of test file questions: 1568
number of questions not found in training data: 45
questions not found in training data: ['? when', '? the', '? on', '? New Netherland', '?'] ...


Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,test_answer
0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United States has the high ...,New York,the city in the United States with the high -e...,False,Hill
1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United Nations base -d,New York,the United Nations is base -d in New York,True,
2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,False,Exploitation number of updates >= 4000
3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New York,New York is the American city that welcome -s ...,False,the Bronx
4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New York City,the major gateway for immigrate -ion has been ...,False,Exploitation number of updates >= 4000


In [13]:
# Tally how often each distinct value occurs in the 'test_answer' column,
# label the two resulting columns, and order the tally from most to least frequent.
test_answer_summary = (
    response_formatted_df['test_answer']
    .value_counts()
    .reset_index()
    .set_axis(['test_answer', 'count'], axis="columns")
    .sort_values(by='count', ascending=False)
)
test_answer_summary

Unnamed: 0,test_answer,count
0,,266
1,Exploitation number of updates >= 4000,89
2,.,45
4,NYCTV,12
3,more than 200,12
...,...,...
105,system,1
104,Great Fire of New York,1
103,the full,1
102,the populate,1


In [15]:
#write the answer summary dataframe to a tsv file
# The path is rebuilt here instead of read from `tsv_results_filepath`: the export
# cell rebinds that name to the full-results file, so re-running this cell after
# the export would otherwise overwrite the results TSV with the summary.
os.makedirs(test_results_dir, exist_ok=True)
answer_summary_filepath = os.path.join(test_results_dir, test_answer_summary_filename)
test_answer_summary.to_csv(answer_summary_filepath, sep="\t", index=False)
#count the number of results where the test answer is > 20 words
num_long_answers = response_formatted_df["test_answer"].apply(lambda x: len(x.split()) > 20 if pd.notnull(x) else False).sum()
print(f"number of test answers longer than 20 words: {num_long_answers}")
# Exact string match against the expected answer. Rows without a test answer
# compare unequal, so they count as incorrect in the percentage below.
response_formatted_df["test_answer_correct"] = response_formatted_df["test_answer"] == response_formatted_df["response_answer_formatted"]
percentage_correct =  response_formatted_df["test_answer_correct"].mean() * 100
print(f"percentage correct = {percentage_correct} %")

# Row-wise check from dataset_processing; presumably flags rows where the test
# answer shares at least one word with the expected answer (verify in that module).
response_formatted_df["test_answer_any_matching_word"] = response_formatted_df.apply(any_word_match, axis=1)

percentage_any_word_matches =  response_formatted_df["test_answer_any_matching_word"].mean() * 100
print(f"percentage any word matches = {percentage_any_word_matches} %")
# Rows answered exactly right, displayed as the cell output.
correct_matches = response_formatted_df[response_formatted_df["test_answer_correct"]]
correct_matches

number of test answers longer than 20 words: 0


In [21]:
# Select the rows whose "test_answer_any_matching_word" flag is set and display them.
any_matches = response_formatted_df.loc[response_formatted_df["test_answer_any_matching_word"]]
any_matches

Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,test_answer,test_answer_correct,test_answer_any_matching_word
4,56cf5d41aab44d1400b89131,The most populated city in the United States i...,The most populate –d city in the United States...,New York City,The most populate –d city in the United States...,New York City,? the most populate -d city in the United Stat...,New York City,the most populate -d city in the United States...,False,New York become the,False,True
5,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New York City,five,New York City is comprise -d of five borough -s,False,five,True,True
10,56cf9d81234ae51400d9be1b,How man boroughs does New York City contain?,How many borough –s does New York City contain?,five,New York City contain –s five borough –s.,five,? how many borough -s does New York City contain,five,New York City contain -s five borough -s,False,five,True,True
16,56ce31baaab44d1400b8853b,What nation founded New Amsterdam?,What nation found –ed New Amsterdam?,the Dutch Republic,The Dutch Republic found –ed New Amsterdam.,the Dutch Republic,? what nation found -ed New Amsterdam,the Dutch Republic,the Dutch Republic found -ed New Amsterdam,False,the Dutch,False,True
18,56ce31baaab44d1400b8853d,When did the English take over the area from t...,When did the English take over the area from t...,1664,The English took over the area from the Dutch ...,1664,? when did the English take over the area from...,1664,the English took over the area from the Dutch ...,False,in 1664,False,True
26,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall Street locate -d,Manhattan,Wall Street is locate -d in the borough of Man...,False,Manhattan,True,True
40,56ce34c7aab44d1400b88596,What was the nationality of Estêvão Gomes?,What was the national –ity of Estêvão Gomes?,Portuguese,The national –ity of Estêvão Gomes was Portugu...,Portuguese,? what was the national -ity of Estevao Gomes,Portuguese,the national -ity of Estevao Gomes was Portuguese,False,Portuguese,True,True
42,56ce34c7aab44d1400b88598,What was the name of Estêvão Gomes's ship?,What was the name of Estêvão Gomes's ship?,La Anunciada,The name of Estêvão Gomes's ship was La Anunci...,La Anunciada,? what was the name of Estevao Gomess ship,La Anunciada,the name of Estevao Gomess ship was La Anunciada,False,La,False,True
54,56ce362aaab44d1400b885bd,What did the Dutch call Juan Rodriguez?,What did the Dutch call Juan Rodriguez?,Jan Rodrigues,The Dutch call –ed Juan Rodriguez Jan Rodrigues.,Jan Rodrigues,? what did the Dutch call Juan Rodriguez,Jan Rodrigues,the Dutch call -ed Juan Rodriguez Jan Rodrigues,False,Rodrigues,False,True
60,56cedbb9aab44d1400b88b0f,On what island did the Dutch set up a settleme...,On what island did the Dutch set up a settle –...,Governors Island,The Dutch set up a settle –ment to trade fur –...,Governors Island,? on what island did the Dutch set up a settle...,Governors Island,the Dutch set up a settle -ment to trade fur -...,False,-s on Governors,False,True


In [22]:
#write the results to a file and export the results dataframe to a tsv file
# Make sure the output directory exists (e.g. for a freshly created experiment).
os.makedirs(test_results_dir, exist_ok=True)
# Take the timestamp once so the TSV and the summary always share the same suffix.
export_timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

tsv_results_filename = "test_nyc_results" + export_timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, tsv_results_filename)
response_formatted_df.to_csv(tsv_results_filepath, sep="\t", index=False)

results_summary_filename = "test_nyc_results_summary" + export_timestamp + ".txt"
results_summary_filepath = os.path.join(test_results_dir, results_summary_filename)

# The tables contain non-ASCII characters (e.g. "–", "ê"), so the encoding is
# fixed to UTF-8 rather than left to the platform default.
with open(results_summary_filepath, 'w', encoding="utf-8") as results_file:
    #write the number of samples tested
    # Integer division keeps the count an integer; the division by two assumes
    # the test input file holds two lines per sample (TODO confirm).
    results_file.write(f"total number of samples\t{len(test_input_lines) // 2}\n")
    results_file.write(f"number_of_test_answers\t{len(response_formatted_df)}\n")
    results_file.write(f"percentage_correct\t{percentage_correct}\n")
    results_file.write(f"percentage_any_word_matches\t{percentage_any_word_matches}\n")
    results_file.write(f"number of test answers longer than 20 words (removed)\t{num_long_answers}\n")
    # write the rows that had exact word matches to the file
    results_file.write("\nRows with exact matches:\n")
    # to_markdown() emits no trailing newline; add one so the next heading is
    # separated from the table by a blank line.
    results_file.write(correct_matches[["response_question", "response_answer", "test_answer"]].to_markdown(index=False) + "\n")
    #write the rows in any_matches to the file
    results_file.write("\nRows with any word matches:\n")
    results_file.write(any_matches[["response_question", "response_answer", "test_answer"]].to_markdown(index=False) + "\n")
print(f"results written to {tsv_results_filepath} and {results_summary_filepath}")

results written to /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/8/results/test_nyc_results_20250925_062932.tsv and /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/8/results/test_nyc_results_summary_20250925_062932.txt
