In [10]:
import os
from dataset_processing import question_and_answer_pairs_from_log_file, remove_quotes_from_file, filter_by_max_words, clean_text
import pandas as pd

In [11]:
# Pre-training sample file; the directory is relative, so it is resolved
# against the notebook's working directory when the file is opened.
pre_training_dir = "training"
pre_training_filename = "pre_training_nyc_samples"
pre_training_filepath = os.path.join(pre_training_dir, pre_training_filename)

# Directory holding the training TSV. The default is the original
# machine-specific location; set the ANNABELL_TRAIN_DIR environment variable
# to point at the data on another machine without editing this cell.
train_dir = os.environ.get(
    "ANNABELL_TRAIN_DIR",
    "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/training",
)
train_filename = "declarative_sentences_train_gemma3:4b_20250617_201822.tsv"
train_filepath = os.path.join(train_dir, train_filename)

In [12]:
# Read every line of the pre-training sample file; readlines() keeps the
# trailing newline on each entry.
# The encoding is stated explicitly so decoding does not depend on the
# platform's locale default.
with open(pre_training_filepath, "r", encoding="utf-8") as pre_training_file:
    pre_training_lines = pre_training_file.readlines()
pre_training_lines

['# Group 1: Active Voice Sentences\n',
 '# These are standard declarative sentences where the subject performs the action of the verb (Subject-Verb-Object).\n',
 'new york city has the highest population in the united states\n',
 'new york welcomes the largest number of legal immigrants\n',
 'New York City comprise -s five borough -s\n',
 '#\n',
 '# Group 2: Sentences with a Linking Verb ("to be")\n',
 '#These sentences use a form of the verb "to be" (is, am, are, was, were) to describe or identify the subject (Subject-Linking Verb-Complement).\n',
 'the united nations is based in New York\n',
 'the major gateway for immigration has been in new york city\n',
 'the most populated city in the united states is new york city\n',
 '#\n',
 '# Group 3: Sentences in the Passive Voice\n',
 '#In these sentences, the subject receives the action of the verb. The structure often involves a form of "to be" plus a past participle.\n',
 'new york has been called the cultural capital of the world\n',


In [13]:
# Drop comment lines, i.e. every entry whose first character is "#".
pre_training_lines = list(
    filter(lambda entry: not entry.startswith("#"), pre_training_lines)
)
pre_training_lines

['new york city has the highest population in the united states\n',
 'new york welcomes the largest number of legal immigrants\n',
 'New York City comprise -s five borough -s\n',
 'the united nations is based in New York\n',
 'the major gateway for immigration has been in new york city\n',
 'the most populated city in the united states is new york city\n',
 'new york has been called the cultural capital of the world\n',
 'the five borough -s were combined into one city in 1898\n',
 'the trading post that preceded New York City was called New Amsterdam\n',
 '? What city in the United States has the highest population\n',
 '? what city has been called the cultural capital of the world\n',
 '? What American city welcomes the largest number of legal immigrants\n',
 '? How many boroughs comprise New York City\n',
 '? how many languages are spoken by the people of New York City\n',
 '? How man boroughs does New York City contain\n',
 '? in what city is the united nations based\n',
 '? in wha

In [14]:
# Statements are the remaining lines that do not begin with the "?" marker.
pre_training_statements = [
    entry
    for entry in pre_training_lines
    if entry[:1] != "?"
]
pre_training_statements

['new york city has the highest population in the united states\n',
 'new york welcomes the largest number of legal immigrants\n',
 'New York City comprise -s five borough -s\n',
 'the united nations is based in New York\n',
 'the major gateway for immigration has been in new york city\n',
 'the most populated city in the united states is new york city\n',
 'new york has been called the cultural capital of the world\n',
 'the five borough -s were combined into one city in 1898\n',
 'the trading post that preceded New York City was called New Amsterdam\n']

In [15]:
# Questions are the lines that begin with the "?" marker.
pre_training_questions = list(
    filter(lambda entry: entry.startswith("?"), pre_training_lines)
)
pre_training_questions

['? What city in the United States has the highest population\n',
 '? what city has been called the cultural capital of the world\n',
 '? What American city welcomes the largest number of legal immigrants\n',
 '? How many boroughs comprise New York City\n',
 '? how many languages are spoken by the people of New York City\n',
 '? How man boroughs does New York City contain\n',
 '? in what city is the united nations based\n',
 '? in what year were the five borough -s combined into one city (appears multiple times)\n',
 '? in what borough is Wall Street located\n',
 '? When was New York City established\n',
 '? when did the English take over the area from the Dutch\n',
 '? when was the first European to visit the area of nyc\n',
 '? who commanded the Spanish expedition\n',
 '? who was henry hudson working for\n',
 '? who was the first non-Indian person to live in what is now nyc\n',
 '? where was Juan Rodriguez born\n',
 '? where did the stonewall riots happen\n',
 '? where in Central Par

In [16]:
# Load and clean the training data derived from the SQuAD dataset.
#
# The raw path is rebuilt from train_dir/train_filename on every run so that
# re-executing this cell never feeds an already-cleaned path back into
# remove_quotes_from_file.
raw_train_filepath = os.path.join(train_dir, train_filename)
# NOTE(review): remove_quotes_from_file appears to return the path of the
# cleaned copy it writes (its result is read by pd.read_csv below) — confirm
# in dataset_processing.
train_filepath = remove_quotes_from_file(raw_train_filepath)
train_df = pd.read_csv(train_filepath, sep="\t").dropna()

# Keep only the rows titled "New_York_City", then apply filter_by_max_words
# with max_words=20.
nyc_train_df = train_df[train_df["title"] == "New_York_City"]
nyc_train_df = filter_by_max_words(nyc_train_df, max_words=20)

# assign() returns a new frame, so the cleaned columns are never written onto
# a slice of train_df (which would risk SettingWithCopyWarning), and
# reset_index is used without inplace so the cell stays re-runnable.
filtered_train_df = nyc_train_df.assign(
    response_question=clean_text(nyc_train_df["response_question"], True),
    response_answer=clean_text(nyc_train_df["response_answer"], False),
    statement=clean_text(nyc_train_df["statement"], False),
).reset_index(drop=True)
filtered_train_df

Cleaned data saved to /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/training/declarative_sentences_train_gemma3:4b_20250617_201822_cleaned.tsv


Unnamed: 0,id,title,question,answer,response_question,response_answer,statement
0,56ce304daab44d1400b8850e,New_York_City,What city in the United States has the highest...,New York,? What city in the United States has the highe...,new york,new york city has the highest population in th...
1,56ce304daab44d1400b8850f,New_York_City,In what city is the United Nations based?,New York,? in what city is the united nations based,new york,the united nations is based in New York
2,56ce304daab44d1400b88510,New_York_City,What city has been called the cultural capital...,New York,? what city has been called the cultural capit...,new york,new york has been called the cultural capital ...
3,56ce304daab44d1400b88511,New_York_City,What American city welcomes the largest number...,New York,? What American city welcomes the largest numb...,new york,new york welcomes the largest number of legal ...
4,56cf5d41aab44d1400b89130,New_York_City,The major gateway for immigration has been whi...,New York City,? the major gateway for immigration has been w...,new york city,the major gateway for immigration has been in ...
...,...,...,...,...,...,...,...
801,56d1218c17492d1400aaba1e,New_York_City,What ZIP code was responsible for the greatest...,10021,? what ZIP code was responsible for the greate...,10021,the ZIP code 10021 was responsible for the gre...
802,56d1218c17492d1400aaba1f,New_York_City,How much money in cents does New York City rec...,83,? how much money in cents does New York City r...,83,New York City receives 83 cents for every doll...
803,56d1218c17492d1400aaba20,New_York_City,How much more money does the city give to the ...,$11 billion,? how much more money does the city give to th...,11 billion,the city gives 11 billion to the state of new ...
804,56d1218c17492d1400aaba21,New_York_City,"Each year, how much more money does New York C...",$11.4 billion,? each year how much more money does New York ...,11 point 4 billion,new york city gives 11 point 4 billion more to...
