In [1]:
from generate_declarative_sentences import generate_declarative_statements
from categorise_sentences import QuestionCategoryAssigner, StatementCategoryAssigner
from dataset_processing import DatasetPreProcessor
from config.global_config import GlobalConfig
from training import AnnabellPreTrainingRunner

# Prepare the declarative-sentence dataset: load (or generate) the categorised
# sample, apply the formatting rules, select pretraining rows, create the
# pretraining commands and save the result to the configured output file.
from pathlib import Path

import pandas as pd

global_config = GlobalConfig()

model_name = "qwen3:4b"
# On-disk cache of the generated (and categorised) sample, relative to the
# notebook's working directory.
sample_dataset_path = Path("declarative_sentences_dataset_sample.jsonl")
# Set to True to ignore the cached sample and regenerate it with the model.
force_regenerate = False

# Guard on the cache file rather than on kernel state (`locals()`), so that the
# cell behaves the same on a fresh kernel and on a re-run, and so that the
# generation branch is reachable when the cache file does not exist yet.
if sample_dataset_path.exists() and not force_regenerate:
    declarative_sentences_dataset = pd.read_json(sample_dataset_path, lines=True)
else:
    #generate a small sample of declarative sentences from the NYC SQuAD dataset
    declarative_sentences_dataset = generate_declarative_statements(5, model_name)
    #use this dataset to assign categories to the questions and statements
    question_assigner = QuestionCategoryAssigner(declarative_sentences_dataset)
    # NOTE(review): the question assigner is driven through
    # `generate_statement_categories`, the same method name used for the
    # statement assigner below -- confirm this is the intended entry point.
    question_assigner.generate_statement_categories(model_name)
    statement_assigner = StatementCategoryAssigner(declarative_sentences_dataset)
    statement_assigner.generate_statement_categories(model_name)
    # Persist the sample so later runs can skip the generation step.
    declarative_sentences_dataset.to_json(sample_dataset_path, orient="records", lines=True)

#apply the formatting rules to the dataset
datasetPreProcessor = DatasetPreProcessor(declarative_sentences_dataset)
datasetPreProcessor.preprocess_data()
#select the pretraining data
datasetPreProcessor.select_pretraining_data(global_config.percentage_of_pre_training_samples())
datasetPreProcessor.create_commands_for_pretraining()

# Resolve the output path once so the file written and the path reported match.
prepared_dataset_filepath = global_config.prepared_dataset_with_commands_filepath()
declarative_sentences_dataset.to_json(prepared_dataset_filepath, orient="records", lines=True)
print(f"dataset saved to file: {prepared_dataset_filepath}")
declarative_sentences_dataset

2025-11-15 11:46:33,852 - root - INFO - Logging initialized. Log file: /Users/chris/logs/cognitive_language_model_logs/run_20251115_114633.log
2025-11-15 11:46:36,592 - dataset_processing - INFO - Number of pretraining samples: 2
2025-11-15 11:46:36,594 - dataset_processing - INFO - Samples per category: 1
2025-11-15 11:46:36,597 - dataset_processing - INFO - Pretraining samples by question category:
2025-11-15 11:46:36,598 - dataset_processing - INFO - question_category
Subject-Verb-Object    1
Name: count, dtype: int64
2025-11-15 11:46:36,600 - dataset_processing - INFO - Pretraining samples by sentence category:
2025-11-15 11:46:36,602 - dataset_processing - INFO - statement_category
Subject-Verb-Object    1
Name: count, dtype: int64
2025-11-15 11:46:36,603 - dataset_processing - INFO - Total number of samples selected for pretraining: 1
dataset saved to file: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/

Unnamed: 0,id,title,context,question,declarative_statement,question_category,statement_category,answer,declarative_statement_formatted,question_formatted,answer_formatted,is_pretraining,created_commands
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,the Virgin Mary allegedly appeared to Saint Be...,Subject-Verb-Object,Subject-Verb-Object,Saint Bernadette Soubirous,the_Virgin_Mary allegedly appear to Saint_Bern...,? to whom do the_Virgin_Mary allegedly appear ...,Saint_Bernadette_Soubirous,False,[# This is a non-pretraining sample. No comman...
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ is in front of the N...,Subject-Verb-Object,Subject-Verb-Object,a copper statue of Christ,a copper statue of Christ be in front of the_N...,? what be in front of the_Notre_Dame Main Buil...,a copper statue of Christ,True,"[#id: 5733be284776f4190066117f, a copper statu..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,The Basilica of the Sacred Heart at Notre Dame...,Subject-Verb-Object,Subject-Verb-Object,the Main Building,the Basilica of the Sacred Heart at Notre_Dame...,? the Basilica of the sacred heart at Notre_Da...,the_Main_Building,False,[# This is a non-pretraining sample. No comman...
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,The Grotto at Notre Dame is a Marian place of ...,Subject-Verb-Object,Subject-Verb-Object,a Marian place of prayer and reflection,the Grotto at Notre_Dame be a marian place of ...,? what be the Grotto at Notre_Dame,a marian place of prayer and reflection,False,[# This is a non-pretraining sample. No comman...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary sits on top...,Subject-Verb-Object,Subject-Verb-Object,a golden statue of the Virgin Mary,a golden statue of the_Virgin_Mary sit on top ...,? what sit on top of the_Main_Building at Notr...,a golden statue of the_Virgin_Mary,False,[# This is a non-pretraining sample. No comman...


In [3]:
# TODO: add POS tagging to the dataset columns
# TODO: fix the pretraining selection algorithm
# TODO: use the POS tags to identify goals as part of command generation

In [2]:
#run the pretraining using the generated dataset

from dataset_processing import DatasetPreProcessor
from config.global_config import GlobalConfig
from training import AnnabellPreTrainingRunner
import pandas as pd

global_config = GlobalConfig()

# Read the prepared dataset from the configured location (the same accessor the
# preparation cell writes to) instead of a hard-coded, machine-specific
# absolute path, so the notebook runs on other machines and the two cells
# cannot drift apart.
ds_filepath = global_config.prepared_dataset_with_commands_filepath()

declarative_sentences_dataset = pd.read_json(ds_filepath, lines=True)
datasetPreProcessor = DatasetPreProcessor(declarative_sentences_dataset)

# Run pretraining on the loaded dataset via the project's runner.
runner = AnnabellPreTrainingRunner(datasetPreProcessor)
runner.run()

2025-11-15 11:33:33,874 - dataset_processing - INFO - Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/sandbox/pre_training/nyc_squad_pretraining_commands.txt
2025-11-15 11:33:33,875 - dataset_processing - INFO - Number of reward lines: 1
2025-11-15 11:33:33,876 - dataset_processing - INFO - Number of commands: 22
2025-11-15 11:33:33,878 - training - INFO - copied: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/sandbox/pre_training/nyc_squad_pretraining_commands.txt to: /Users/chris/PycharmProjects/Training-and-evaluating-cognitive-language-models/docker/shared_data/pre_training
2025-11-15 11:33:33,880 - dataset_processing - INFO - Wrote /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/ann

KeyboardInterrupt: 