# This notebooks assigns categories to each row in the dataset based on its grammatical structure

In [21]:
import os
import pandas as pd
from ollama import generate
from pydantic import BaseModel

# Load the dataset
dataset_filename = "response_formatted_20250924_174653.jsonl"
base_directory = "/home/chris/gdrive/work/annabell"
data_directory = os.path.join(base_directory, "experiments/data")
dataset_filepath = os.path.join(data_directory, dataset_filename)
nyc_squad_df = pd.read_json(dataset_filepath, lines=True)
declarations_prompt_input_dataset_filename = "prompt_" + dataset_filename
declarations_prompt_input_dataset_filepath = os.path.join(data_directory, declarations_prompt_input_dataset_filename)
#drop the columns that are not needed for the prompt
declarations_prompt_df = nyc_squad_df[
    ["id", "response_declarative_sentence"]]
declarations_prompt_df["generated_category"] = "<INSERT GENERATED CATEGORY HERE>"
#write the trimmed dataframe to a jsonl file
declarations_prompt_input_dataset = declarations_prompt_df.to_json(declarations_prompt_input_dataset_filepath,
                                                                   orient="records", lines=True)
print("Prompt input dataset written to: " + declarations_prompt_input_dataset_filepath)
#check the file has been written correctly
with open(declarations_prompt_input_dataset_filepath, "r") as declarations_prompt_input_file:
    declarations_prompt_input_lines = declarations_prompt_input_file.readlines()
print(f"Number of lines in prompt input file: {len(declarations_prompt_input_lines)}")
print("First 3 lines of prompt input file:")
for line in declarations_prompt_input_lines[:3]:
    print(line)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  declarations_prompt_df["generated_category"] = "<INSERT GENERATED CATEGORY HERE>"


Prompt input dataset written to: /home/chris/gdrive/work/annabell/experiments/data/prompt_response_formatted_20250924_174653.jsonl
Number of lines in prompt input file: 817
First 3 lines of prompt input file:
{"id":"56ce304daab44d1400b8850e","response_declarative_sentence":"The city in the United States with the high \u2013est populate \u2013ion is New York.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

{"id":"56ce304daab44d1400b8850f","response_declarative_sentence":"The United Nations is base \u2013d in New York.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}

{"id":"56ce304daab44d1400b88510","response_declarative_sentence":"New York has been call \u2013ed the culture \u2013al capital of the world.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}



In [19]:
class CategorisedRow(BaseModel):
    id: str
    category: str


def generated_text_from_prompt(the_model_string, the_prompt, the_format=None, the_options=None):
    generated_text = generate(model=the_model_string, prompt=the_prompt, format=the_format, options=the_options)
    return generated_text.response


def generated_json_from_prompt(the_model_string, the_prompt, base_model_class, the_options=None):
    the_response = generated_text_from_prompt(
        the_model_string=the_model_string,
        the_prompt=the_prompt,
        the_format=base_model_class.model_json_schema(),
        the_options=the_options)
    return CategorisedRow.model_validate_json(the_response)

In [22]:
model_string = "mistral-nemo"
base_prompt_filename = "categorisation_declarations_base_prompt.txt"
base_prompt_directory = "."
base_prompt_filepath = os.path.join(base_prompt_directory, base_prompt_filename)
with open(base_prompt_filepath, "r") as base_prompt_file:
    base_prompt = base_prompt_file.read()
with open(prompt_input_dataset_filepath, "r") as prompt_input_file:
    prompt_input_lines = prompt_input_file.readlines()
prompt_inputs = [line for line in prompt_input_lines if line.strip()]

# take the first 3 input_lines and join them together in a string for testing
testing_sample_string = "".join(prompt_inputs[:3])
print("Testing sample string:")
print(testing_sample_string)

Testing sample string:
{"id":"56ce304daab44d1400b8850e","response_declarative_sentence":"The city in the United States with the high \u2013est populate \u2013ion is New York.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}
{"id":"56ce304daab44d1400b8850f","response_declarative_sentence":"The United Nations is base \u2013d in New York.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}
{"id":"56ce304daab44d1400b88510","response_declarative_sentence":"New York has been call \u2013ed the culture \u2013al capital of the world.","generated_category":"<INSERT GENERATED CATEGORY HERE>"}



In [23]:
prompt_input_samples = testing_sample_string
prompt = base_prompt + prompt_input_samples
result = generated_text_from_prompt(model_string, prompt)
result

'{"id":"56ce304daab44d1400b8850e","category":"Declarative Sentence"}\n{"id":"56ce304daab44d1400b8850f","category":"Declarative Sentence"}\n{"id":"56ce304daab44d1400b88510","category":"Declarative Sentence"}'