# This notebooks assigns categories to each row in the dataset based on its grammatical structure

In [27]:
import os
import pandas as pd
from ollama import generate
from pydantic import BaseModel
import platform

# Set up directories based on the operating system
operating_system = platform.system()
if operating_system == 'Windows':
    raise Exception("not used on Windows yet")
elif operating_system == 'Linux':
    base_directory = "/home/chris/gdrive/work/annabell/"
    #pycharm_projects_directory = "/home/chris/PycharmProjects/dataset"
elif operating_system == 'Darwin':  #macOS
    base_directory = "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/"
    #pycharm_projects_directory = "/Users/chris/PycharmProjects/dataset"
else:
    raise Exception("unsupported OS")

In [33]:
# Load the dataset
dataset_filename = "response_formatted_20250924_174653.jsonl"
data_directory = os.path.join(base_directory, "experiments/data")
dataset_filepath = os.path.join(data_directory, dataset_filename)
nyc_squad_df = pd.read_json(dataset_filepath, lines=True)
questions_prompt_input_dataset_filename = "questions_prompt_" + dataset_filename
questions_prompt_input_dataset_filepath = os.path.join(data_directory, questions_prompt_input_dataset_filename)
declarations_prompt_input_dataset_filename = "prompt_" + dataset_filename
declarations_prompt_input_dataset_filepath = os.path.join(data_directory, declarations_prompt_input_dataset_filename)
#drop the columns that are not needed for the prompt
declarations_prompt_df = nyc_squad_df[
    ["id", "response_declarative_sentence"]]
declarations_prompt_df["generated_category"] = "<INSERT GENERATED CATEGORY HERE>"
questions_prompt_df = nyc_squad_df[
    ["id", "response_question"]]
questions_prompt_df["generated_category"] = "<INSERT GENERATED CATEGORY HERE>"
#write the trimmed dataframes to a jsonl file
declarations_prompt_input_dataset = declarations_prompt_df.to_json(declarations_prompt_input_dataset_filepath,
                                                                   orient="records", lines=True)
print("Prompt input dataset written to: " + declarations_prompt_input_dataset_filepath)
#check the file has been written correctly
with open(declarations_prompt_input_dataset_filepath, "r") as declarations_prompt_input_file:
    declarations_prompt_input_lines = declarations_prompt_input_file.readlines()
print(f"Number of lines in prompt input file: {len(declarations_prompt_input_lines)}")
print("First 3 lines of prompt input file:")
for line in declarations_prompt_input_lines[:3]:
    print(line)

questions_prompt_input_dataset = questions_prompt_df.to_json(questions_prompt_input_dataset_filepath,
                                                             orient="records", lines=True)
print("Prompt input dataset written to: " + questions_prompt_input_dataset_filepath)
#check the file has been written correctly
with open(questions_prompt_input_dataset_filepath, "r") as questions_prompt_input_file:
    questions_prompt_input_lines = questions_prompt_input_file.readlines()
print(f"Number of lines in prompt input file: {len(questions_prompt_input_lines)}")
print("First 3 lines of prompt input file:")
for line in questions_prompt_input_lines[:3]:
    print(line)


Prompt input dataset written to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/data/prompt_response_formatted_20250924_174653.jsonl
Number of lines in prompt input file: 817
First 3 lines of prompt input file:
{"id":"56ce304daab44d1400b8850e","response_declarative_sentence":"The city in the United States with the high \u2013est populate \u2013ion is New York.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

{"id":"56ce304daab44d1400b8850f","response_declarative_sentence":"The United Nations is base \u2013d in New York.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

{"id":"56ce304daab44d1400b88510","response_declarative_sentence":"New York has been call \u2013ed the culture \u2013al capital of the world.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

Prompt input dataset written to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  declarations_prompt_df["generated_category"] = "<INSERT GENERATED CATEGORY HERE>"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions_prompt_df["generated_category"] = "<INSERT GENERATED CATEGORY HERE>"


In [5]:
class CategorisedRow(BaseModel):
    id: str
    category: str


def generated_text_from_prompt(the_model_string, the_prompt, the_format=None, the_options=None):
    generated_text = generate(model=the_model_string, prompt=the_prompt, format=the_format, options=the_options)
    return generated_text.response


def generated_json_from_prompt(the_model_string, the_prompt, base_model_class, the_options=None):
    the_response = generated_text_from_prompt(
        the_model_string=the_model_string,
        the_prompt=the_prompt,
        the_format=base_model_class.model_json_schema(),
        the_options=the_options)
    return CategorisedRow.model_validate_json(the_response)

In [8]:
model_string = "mistral-nemo"
base_prompt_filename = "categorisation_declarations_base_prompt.txt"
base_prompt_directory = "."
base_prompt_filepath = os.path.join(base_prompt_directory, base_prompt_filename)
with open(base_prompt_filepath, "r") as base_prompt_file:
    base_prompt = base_prompt_file.read()
with open(declarations_prompt_input_dataset_filepath, "r") as prompt_input_file:
    prompt_input_lines = prompt_input_file.readlines()
prompt_inputs = [line for line in prompt_input_lines if line.strip()]

all_samples_string = "".join(prompt_inputs)
print(f"Number of lines in prompt input file: {len(prompt_inputs)}")
print("First 3 lines of prompt input file:")
for line in prompt_inputs[:3]:
    print(line)

# take the first 3 input_lines and join them together in a string for testing
testing_sample_string = "".join(prompt_inputs[:3])
print("Testing sample string:")
print(testing_sample_string)

Number of lines in prompt input file: 817
First 3 lines of prompt input file:
{"id":"56ce304daab44d1400b8850e","response_declarative_sentence":"The city in the United States with the high \u2013est populate \u2013ion is New York.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

{"id":"56ce304daab44d1400b8850f","response_declarative_sentence":"The United Nations is base \u2013d in New York.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

{"id":"56ce304daab44d1400b88510","response_declarative_sentence":"New York has been call \u2013ed the culture \u2013al capital of the world.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

Testing sample string:
{"id":"56ce304daab44d1400b8850e","response_declarative_sentence":"The city in the United States with the high \u2013est populate \u2013ion is New York.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}
{"id":"56ce304daab44d1400b8850f","response_declarative_sentence":"The United Nations is base \u2013d in N

In [9]:
#prompt_input_samples = testing_sample_string
prompt_input_samples = all_samples_string
prompt = base_prompt + prompt_input_samples
result = generated_text_from_prompt(model_string, prompt)
result

"Here are some categories and the corresponding statements from your list:\n\n1. **Geography**\n   - The borough of Brooklyn is locate'd on Long Island.\n   - Queens is locate'd on the west end of Long Island.\n\n2. **Infrastructure**\n   - There are 20 rail line's on New York City's commuter rail network.\n   - The AirTrain has a station at JFK International Airport.\n\n3. **Culture**\n   - Broadway is associate'd with the theater industry.\n   - Wall Street is associate'd with the finance industry.\n\n4. **Economy**\n   - Four-fifths of the ZIP code's that provide the high'est amount of political contribution's in the United States are locate'd in the borough of Manhattan.\n   - New York City receive's 83 cent's for every dollar paid in federal tax'es.\n\n5. **Governance**\n   - The administrative court's are a part of the executive branch of government.\n   - New York City adopt'd the mayor-council form of government in 1898.\n\n6. **Education**\n   - The acronym PATH stand's for Po

## combine the base prompt and the sentences to create a categorisation of the declarations
Then use a LLM to categorise the sentences

In [29]:
# move the results form the LLM classification into a dataframe
llm_response_filename = "llm_categorisation_results.jsonl"
llm_response_filepath = os.path.join(data_directory, "prompts", llm_response_filename)
categorised_sentences_df = pd.read_json(llm_response_filepath, lines=True)
categorised_sentences_df

Unnamed: 0,id,category
0,56ce304daab44d1400b8850e,Subject-Verb-Complement (SVC)
1,56ce304daab44d1400b8850f,Passive Construction
2,56ce304daab44d1400b88510,Passive Construction
3,56ce304daab44d1400b88511,Subject-Verb-Object (SVO/SVOA)
4,56cf5d41aab44d1400b89130,Subject-Verb-Complement (SVC)
...,...,...
812,56d1218c17492d1400aaba1f,Subject-Verb-Object (SVO/SVOA)
813,56d1218c17492d1400aaba20,Subject-Verb-Object (SVO/SVOA)
814,56d1218c17492d1400aaba21,Subject-Verb-Object (SVO/SVOA)
815,56d121d817492d1400aaba2d,Subject-Verb-Complement (SVC)


In [None]:
#join the categories to the original dataframe nyc_squad_df

In [37]:
#repeat for the questions
model_string = "mistral-nemo"
base_prompt_filename = "categorisation_questions_base_prompt.txt"
base_prompt_directory = "."
base_prompt_filepath = os.path.join(base_prompt_directory, base_prompt_filename)
with open(base_prompt_filepath, "r") as base_prompt_file:
    base_prompt = base_prompt_file.read()
with open(questions_prompt_input_dataset_filepath, "r") as prompt_input_file:
    prompt_input_lines = prompt_input_file.readlines()
prompt_inputs = [line for line in prompt_input_lines if line.strip()]
all_samples_string = "".join(prompt_inputs)
print(f"Number of lines in prompt input file: {len(prompt_inputs)}")
print("First 3 lines of prompt input file:")
for line in prompt_inputs[:3]:
    print(line)
# take the first 3 input_lines and join them together in a string for testing
testing_sample_string = "".join(prompt_inputs[:3])
print("Testing sample string:")
print(testing_sample_string)

Number of lines in prompt input file: 817
First 3 lines of prompt input file:
{"id":"56ce304daab44d1400b8850e","response_question":"What city in the United States has the high \u2013est populate \u2013ion?","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

{"id":"56ce304daab44d1400b8850f","response_question":"In what city is the United Nations base \u2013d?","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

{"id":"56ce304daab44d1400b88510","response_question":"What city has been call \u2013ed the culture \u2013al capital of the world?","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

Testing sample string:
{"id":"56ce304daab44d1400b8850e","response_question":"What city in the United States has the high \u2013est populate \u2013ion?","generated_category":"<INSERT GENERATED CATEGORY HERE>"}
{"id":"56ce304daab44d1400b8850f","response_question":"In what city is the United Nations base \u2013d?","generated_category":"<INSERT GENERATED CATEGORY HERE>"}
{"id":"56ce304

In [38]:
prompt_input_samples = testing_sample_string
#prompt_input_samples = all_samples_string
prompt = base_prompt + prompt_input_samples
result = generated_text_from_prompt(model_string, prompt)
result

'{"id":"56ce304daab44d1400b8850e","category":"Yes/No Question"}\n{"id":"56ce304daab44d1400b8850f","category":"Where Question"}\n{"id":"56ce304daab44d1400b88510","category":"What City"}\n\nCategories:\n1. Yes/No Question: Questions that can be answered with a simple yes or no.\n2. Where Question: Questions that ask for the location of something.\n3. What City: Questions that ask about a city\'s name or feature.'