In [1]:
import os
import pandas as pd
from dataset_processing import write_training_file, write_testing_file, format_text, is_pretraining_question
from datetime import datetime

In [5]:
# Root of the project tree. Set the ANNABELL_BASE_DIR environment variable to run the
# notebook on another machine; the default is the original local Google Drive location.
base_dir = os.environ.get(
    "ANNABELL_BASE_DIR",
    "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work",
)
# One timestamp is embedded in every output filename built below, so the files
# produced by a single run of this cell share the same suffix.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Directory (under base_dir) that holds both the input data and the generated files.
data_directory = os.path.join(base_dir, "annabell/experiments/data")

# Input files.
nyc_train_json_l_filename = "nyc_squad_train.jsonl"
nyc_train_json_l_filepath = os.path.join(data_directory, nyc_train_json_l_filename)
response_filename = "responses.jsonl"
response_filepath = os.path.join(data_directory, response_filename)

# Location of the manually curated pretraining samples, relative to base_dir.
pretraining_dir = "annabell/experiments/data"
pretraining_nyc_filename = "pre_training_nyc_samples_manual.txt"

# Timestamped output files.
training_filename_base = "nyc_declarative_sentences_training"
training_filename = f"{training_filename_base}_{timestamp}.txt"
training_filepath = os.path.join(data_directory, training_filename)
testing_filename_base = "nyc_declarative_sentences_testing"
testing_filename = f"{testing_filename_base}_{timestamp}.txt"
testing_filepath = os.path.join(data_directory, testing_filename)
pretraining_testing_filename_base = "nyc_declarative_sentences_pretraining_testing"
pretraining_testing_filename = f"{pretraining_testing_filename_base}_{timestamp}.txt"
pretraining_testing_filepath = os.path.join(data_directory, pretraining_testing_filename)

manual_pretraining_filename = "pre_training_nyc_samples_manual"
# Built from the directory and filename defined above instead of repeating the literals.
manual_pretraining_filepath = os.path.join(base_dir, pretraining_dir, pretraining_nyc_filename)
pretraining_filename_base = "nyc_declarative_sentences_pretraining"
response_formatted_filename = f"response_formatted_{timestamp}.jsonl"
response_formatted_filepath = os.path.join(data_directory, response_formatted_filename)

In [3]:
# Load the LLM responses (JSON Lines: one record per line) into a dataframe.
response_df = pd.read_json(response_filepath, lines=True)
# Add three columns holding the text after the formatting rules have been applied:
# 1. response_question_formatted - the question that was used to generate the declarative sentence
# 2. response_answer_formatted - the answer that was used to generate the declarative sentence
# 3. response_declarative_sentence_formatted - the declarative sentence generated by the LLM
# NOTE(review): the extra True argument presumably tells format_text the input is a
# question (formatted questions carry a leading "?") - confirm against dataset_processing.
response_df["response_question_formatted"] = response_df["response_question"].apply(format_text, args=(True,))
response_df["response_answer_formatted"] = response_df["response_answer"].apply(format_text)
response_df["response_declarative_sentence_formatted"] = response_df["response_declarative_sentence"].apply(format_text)
# Read the pretraining questions: the lines of the pretraining file that begin with "?".
pretraining_questions_filepath = os.path.join(base_dir, pretraining_dir, pretraining_nyc_filename)
with open(pretraining_questions_filepath, "r") as pretraining_file:
	# startswith() is safe on blank lines, where indexing line[0] would raise IndexError.
	pretraining_questions = [line for line in pretraining_file.read().splitlines() if line.startswith("?")]
# Check that every pretraining question in the file is also a question in the dataframe.
# A set gives constant-time lookups instead of rescanning the column for each question.
formatted_questions = set(response_df["response_question_formatted"])
pretraining_not_in_responses = [q for q in pretraining_questions if q not in formatted_questions]
if pretraining_not_in_responses:
	raise ValueError("Some pretraining questions are not in the responses dataframe\nMissing questions:\n" + str(pretraining_not_in_responses))
# Add a Boolean column "is_pretraining" flagging the rows whose formatted question is a pretraining question.
response_df["is_pretraining"] = response_df["response_question_formatted"].apply(is_pretraining_question, args=(pretraining_questions,))
print(response_df["is_pretraining"].value_counts())
# Save the dataframe, including the new columns, as JSON Lines.
response_df.to_json(response_formatted_filepath, orient="records", lines=True)
print("Response dataframe with formatted text written to: " + response_formatted_filepath)
# Must be the last expression of the cell for the notebook to render the preview.
response_df.head()

Pretraining question found: ? in what city is the United Nations base -d
Pretraining question found: ? how many school and university -s are in NYC
Pretraining question found: ? what was the name of the Lenape homeland
Pretraining question found: ? who command -ed the Spanish expedite -ion
Pretraining question found: ? what was the myth that Manhattan was bought for by General Peter Minuit
Pretraining question found: ? when was an African burial ground discover -ed after the build -ing of a new courthouse
Pretraining question found: ? what notable catastrophe took place under the British occupy -ation
Pretraining question found: ? in the ear -ly 1920s what was the second most high -ly populate -d city in the world
Pretraining question found: ? how many lead -er terrorist -s of Al Quada were involve -d with the 911 attack -s direct -ly that day
Pretraining question found: ? in what geography -ical region of the United States is New York City locate -d
Pretraining question found: ? the H

Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining
0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United States has the high ...,New York,the city in the United States with the high -e...,False
1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United Nations base -d,New York,the United Nations is base -d in New York,True
2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,False
3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New York,New York is the American city that welcome -s ...,False
4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New York City,the major gateway for immigrate -ion has been ...,False


## The following files need to be created from the dataframe
1. A training file of declarative sentences, containing only the rows where `is_pretraining` is False
2. A test file of questions for the rows where `is_pretraining` is False
3. A test file of questions for the rows where `is_pretraining` is True

In [4]:
# Rows that are not pretraining questions feed both the training file and the main testing file.
# The element-wise "== False" comparison is kept so the row selection is unchanged.
is_not_pretraining = response_df["is_pretraining"] == False  # noqa: E712
# Create the training file from the formatted declarative sentences.
training_df = response_df[is_not_pretraining]
write_training_file(training_df["response_declarative_sentence_formatted"], training_filepath)
# Create the testing file for non-pretraining questions.
# The timestamp is deliberately not regenerated here: every output path already embeds
# the existing value, so reassigning it would only make `timestamp` disagree with the
# names of the files written by this cell.
testing_df = response_df[is_not_pretraining]
write_testing_file(testing_df["response_question_formatted"].tolist(), testing_filepath)
# Read the manual pretraining file and keep the lines that begin with "?".
with open(manual_pretraining_filepath, "r") as manual_pretraining_file:
	manual_pretraining_lines = manual_pretraining_file.readlines()
# strip() drops the trailing newline that readlines() leaves on each line.
manual_pretraining_questions = [line.strip() for line in manual_pretraining_lines if line.startswith("?")]
print("Number of manual pretraining questions: " + str(len(manual_pretraining_questions)))
# Write the pretraining testing file with the manual pretraining questions.
write_testing_file(manual_pretraining_questions, pretraining_testing_filepath)
print("Pretraining testing file written to: " + pretraining_testing_filepath)

file created: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/data/nyc_declarative_sentences_training_20250924_081927.txt
file created: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/data/nyc_declarative_sentences_testing_20250924_081927.txt
Number of manual pretraining questions: 33
file created: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/data/nyc_declarative_sentences_pretraining_testing_20250924_081927.txt
Pretraining testing file written to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/data/nyc_declarative_sentences_pretraining_testing_20250924_081927.txt
