In [1]:
import importlib
import json
import pathlib
import logging
import pandas as pd

In [2]:
# Load the model registry config. The parsed JSON is iterated with .items()
# further down, and each value is handed to load_class_from_path, so it is
# expected to be an object of {model key: dotted class path}.
# The encoding is given explicitly so the read does not depend on the
# platform's locale default.
with open("./static/config/modelRegistry.json", "r", encoding="utf-8") as f:
    model_classes = json.load(f)
    
def load_class_from_path(class_path: str):
    """Import and return the attribute named by a dotted path.

    Args:
        class_path: Dotted path such as ``"package.module.ClassName"``.
            Everything before the last dot is imported as a module; the
            final component is looked up on that module.

    Returns:
        The attribute found on the imported module (a class, in the way this
        notebook uses it).

    Raises:
        ValueError: If ``class_path`` contains no dot or has an empty module
            part.
        ImportError: If the module part cannot be imported.
        AttributeError: If the imported module has no attribute with the
            requested name.
    """
    module_path, sep, class_name = class_path.rpartition(".")
    # Reject malformed paths up front with a message that names the bad value,
    # instead of failing later with an unpacking / empty-module error.
    if not sep or not module_path:
        raise ValueError(
            f"Invalid class path {class_path!r}: "
            "expected the form 'package.module.ClassName'"
        )
    module = importlib.import_module(module_path)
    try:
        return getattr(module, class_name)
    except AttributeError as exc:
        # Same exception type as before, but the message now points at the
        # offending config entry.
        raise AttributeError(
            f"Module {module_path!r} has no attribute {class_name!r} "
            f"(from class path {class_path!r})"
        ) from exc


# Resolve every dotted path from the registry config into the object it names,
# keeping the config's keys as the registry keys.
MODEL_REGISTRY = dict(
    (model_key, load_class_from_path(dotted_path))
    for model_key, dotted_path in model_classes.items()
)

# Bare expression: the notebook displays the resolved registry as cell output.
MODEL_REGISTRY

INFO 12-13 21:27:30 [__init__.py:216] Automatically detected platform cpu.


{'tomotopyLDA': tova.topic_models.models.traditional.tomotopy_lda_tm_model.TomotopyLDATMmodel,
 'CTM': tova.topic_models.models.traditional.ctmtm_model.CTMTMmodel,
 'topicGPT': tova.topic_models.models.llm_based.topicgpt.topicgpt_tm_model.TopicGPTTMmodel}

In [3]:
# Run settings consumed by the model-construction cell.
# NOTE(review): `id` shadows the built-in id() for the rest of the kernel
# session; the name is kept because the construction cell passes it as `id=id`.
model_name = "test"  # used to build the model path and the logger name
model = "topicGPT"  # key looked up in MODEL_REGISTRY
id = "xxx"

In [4]:
# Build the training records in one chain: read the CSV, draw a fixed-seed
# sample of 10 rows, rename "summary" to "raw_text", keep only the two columns
# passed on to training, and convert to a list of per-row dicts.
data_file = "data_test/bills_sample_100.csv"
train_data = (
    pd.read_csv(data_file)
    .sample(10, random_state=42)
    .rename(columns={"summary": "raw_text"})[["id", "raw_text"]]
    .to_dict(orient="records")
)

In [5]:
# Look up the requested model class; stop immediately if the key is not registered.
if (model_cls := MODEL_REGISTRY.get(model)) is None:
    raise ValueError(f"Unknown model: {model}")

# Extra keyword arguments forwarded to the model constructor. Left empty here;
# entries previously toggled in this cell were "num_topics": 50 and
# "preprocess_text": False.
tr_params = dict()

# Instantiate a fresh (not loaded-from-disk) model; its path and logger name
# are both derived from model_name.
tm_model = model_cls(
    model_name=model_name,
    corpus_id="c_4e3634ace8f94d8e899142ef637348c0",
    id=id,
    model_path=pathlib.Path(f"data/tests/test_{model_name}"),
    load_model=False,
    logger=logging.getLogger(f"test_logger_{model_name}"),
    **tr_params,
)

# Train on the sampled records; the return value is kept in `mo` for inspection.
mo = tm_model.train_model(train_data)

Loaded config file static/config/config.yaml and section logger.
Logs will be saved in data/logs
Loaded config file static/config/config.yaml and section llm.
-------------------
Initializing topic generation...
Model: gemma3:4b
Data file: data/tests/test_test/modelFiles/sample_1.jsonl
Prompt file: /Users/lbartolome/TOVA/src/tova/topic_models/models/llm_based/topicGPT/prompt/generation_1.txt
Seed file: /Users/lbartolome/TOVA/src/tova/topic_models/models/llm_based/topicGPT/prompt/seed_1.md
Output file: data/tests/test_test/modelFiles/generation_1.jsonl
Topic file: data/tests/test_test/modelFiles/generation_1.md
-------------------


100%|██████████| 1/1 [00:06<00:00,  6.26s/it]

Prompt token usage: 653 ~$0.003265
Response token usage: 18 ~$0.00027
Topics: [1] Human Rights: Mentions the fundamental right to life and personhood.
--------------------
Loaded config file static/config/config.yaml and section logger.
Logs will be saved in data/logs
Loaded config file static/config/config.yaml and section llm.
-------------------
Initializing topic refinement...
Model: gemma3:4b
Input data file: data/tests/test_test/modelFiles/generation_1.jsonl
Prompt file: /Users/lbartolome/TOVA/src/tova/topic_models/models/llm_based/topicGPT/prompt/refinement.txt
Output file: data/tests/test_test/modelFiles/refinement.md
Topic file: data/tests/test_test/modelFiles/generation_1.md
-------------------





No topic pairs to be merged.
Node('/Topics', count=1, desc='Root topic', lvl=0)
└── Node('/Topics/Human Rights', count=1, desc='Mentions the fundamental right to life and personhood.', lvl=1)
Loaded config file static/config/config.yaml and section logger.
Logs will be saved in data/logs
Loaded config file static/config/config.yaml and section llm.
-------------------
Initializing topic assignment...
Model: gemma3:4b
Data file: data/tests/test_test/modelFiles/full.jsonl
Prompt file: /Users/lbartolome/TOVA/src/tova/topic_models/models/llm_based/topicGPT/prompt/assignment.txt
Output file: data/tests/test_test/modelFiles/assignment.jsonl
Topic file: data/tests/test_test/modelFiles/generation_1.md
-------------------


 10%|█         | 1/10 [00:00<00:08,  1.02it/s]

Prompt token usage: 818 ~$0.00409
Response token usage: 68 ~$0.00102
Response: [1] Human Rights: Assignment reasoning (The document focuses on measuring the effectiveness of border security, which inherently relates to the protection of individuals and the prevention of unlawful cross-border activity, aligning with the fundamental right to life and personhood.) (Supporting quote) “...detecting and apprehending subjects and in seizing illicit drugs.”
--------------------


 20%|██        | 2/10 [00:01<00:06,  1.32it/s]

Prompt token usage: 514 ~$0.0025700000000000002
Response token usage: 42 ~$0.0006299999999999999
Response: [1] Human Rights: Assignment reasoning (Declares that: (1) the right to life guaranteed by the Constitution is vested in each human and is the person's paramount and most fundamental right;)
--------------------


 30%|███       | 3/10 [00:02<00:07,  1.04s/it]

Prompt token usage: 848 ~$0.00424
Response token usage: 137 ~$0.002055
Response: [1] Human Rights: Assignment reasoning (This bill focuses on providing water resources to rural communities, addressing a fundamental need for human well-being and access to essential resources.) (Supporting quote: “This bill authorizes the Department of the Interior to carry out the projects entitled: (1) the “Dry-Redwater Regional Water Authority System,” in accordance with the Dry-Redwater Regional Water System Feasibility Study, which received funding from the Bureau of Reclamation on September 1, 2010; and (2) the “Musselshell-Judith Rural Water System,” in accordance with the Musselshell-Judith Rural Water System Feasibility Report.”)
--------------------


 40%|████      | 4/10 [00:03<00:05,  1.10it/s]

Prompt token usage: 598 ~$0.00299
Response token usage: 51 ~$0.000765
Response: [1] Human Rights: Assignment reasoning (The act extends coverage for kidney transplant patients, implicitly recognizing their right to healthcare and well-being.) (Supporting quote: “...to extend the months of coverage of immunosuppressive drugs for kidney transplant patients.”)
--------------------


 50%|█████     | 5/10 [00:04<00:04,  1.15it/s]

Prompt token usage: 444 ~$0.00222
Response token usage: 70 ~$0.00105
Response: [1] Human Rights: Assignment reasoning (The document references the designation of a facility, which is a fundamental aspect of human rights and societal organization.) (Supporting quote) “Designates the Logistics Automation Training Facility of the Army Quartermaster Center and School at Fort Lee, Virginia, as the “General Richard H. Thompson Logistics Automation Training Facility.”
--------------------


 60%|██████    | 6/10 [00:05<00:03,  1.25it/s]

Prompt token usage: 584 ~$0.00292
Response token usage: 51 ~$0.000765
Response: [1] Human Rights: Assignment reasoning (The document focuses on public preparedness and overcoming indifference as to individual preparedness, relating to the right to life and safety.) (Supporting quote: "...to educate the public and overcome public indifference as to individual preparedness")
--------------------


 70%|███████   | 7/10 [00:05<00:02,  1.36it/s]

Prompt token usage: 430 ~$0.00215
Response token usage: 51 ~$0.000765
Response: [1] Human Rights: Assignment reasoning (The document discusses the use of funds for education, which is a fundamental right.) (Supporting quote) “Amends the Internal Revenue Code to allow payment of home school expenses from Coverdell education savings accounts.”
--------------------


 80%|████████  | 8/10 [00:06<00:01,  1.56it/s]

Prompt token usage: 521 ~$0.0026049999999999997
Response token usage: 24 ~$0.00036
Response: [1] Human Rights: Assignment reasoning (Mentions coverage amount for veterans insured under Veterans’ Group Life Insurance.)
--------------------


 90%|█████████ | 9/10 [00:07<00:00,  1.39it/s]

Prompt token usage: 467 ~$0.0023350000000000003
Response token usage: 76 ~$0.00114
Response: [1] Human Rights: Assignment reasoning (The document focuses on transparency and accessibility of research, aligning with the fundamental right to access information and understand research findings.) (Supporting quote: “to require the Consumer Financial Protection Bureau, any time it issues a research paper available to the public, to accompany it with all studies, data, and other analyses on which it was based.”)
--------------------


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]
INFO:TMmodel:-- -- -- Topic model object (TMmodel) successfully created
INFO:TMmodel:-- -- Sorted
INFO:TMmodel:-- -- betas ds
INFO:TMmodel:-- -- entropy
INFO:TMmodel:-- -- active
INFO:TMmodel:-- -- descriptions
INFO:gensim.corpora.dictionary:adding document #0 to Dictionary<0 unique tokens: []>
INFO:gensim.corpora.dictionary:built Dictionary<685 unique tokens: ['"State', '(1)', '(2)', '(3)', '(4)']...> from 10 documents (total 1505 corpus positions)
INFO:gensim.utils:Dictionary lifecycle event {'msg': 'built Dictionary<685 unique tokens: [\'"State\', \'(1)\', \'(2)\', \'(3)\', \'(4)\']...> from 10 documents (total 1505 corpus positions)', 'datetime': '2025-12-13T21:27:54.427124', 'gensim': '4.3.3', 'python': '3.12.8 (main, Jan 16 2025, 15:52:49) [Clang 16.0.0 (clang-1600.0.26.4)]', 'platform': 'macOS-15.6.1-arm64-arm-64bit', 'event': 'created'}
INFO:TMmodel:Calculating just coherence c_npmi.
INFO:gensim.topic_coherence.probability_estimat

Prompt token usage: 839 ~$0.004195
Response token usage: 66 ~$0.00099
Response: [1] Human Rights: Assignment reasoning (The Act conditions each state’s receipt of funds on having a law or regulation to prevent and treat concussions, implying a fundamental right to health and safety.) (The Act conditions each state’s receipt of funds on having a law or regulation to prevent and treat concussions.)
--------------------
Loaded config file static/config/config.yaml and section logger.
Logs will be saved in data/logs
Loaded config file static/config/config.yaml and section llm.
-------------------
Initializing topic correction...
Model: gemma3:4b
Data file: data/tests/test_test/modelFiles/assignment.jsonl
Prompt file: /Users/lbartolome/TOVA/src/tova/topic_models/models/llm_based/topicGPT/prompt/correction.txt
Output file: data/tests/test_test/modelFiles/assignment_corrected.jsonl
Topic file: data/tests/test_test/modelFiles/generation_1.md
-------------------
Number of errors: 0
Number of ha

INFO:gensim.topic_coherence.text_analysis:1 batches submitted to accumulate stats from 64 documents (1415 virtual)
INFO:gensim.topic_coherence.text_analysis:11 accumulators retrieved from output queue
INFO:gensim.topic_coherence.text_analysis:accumulated word occurrence stats for 1415 virtual documents
INFO:TMmodel:Most representative documents for each topic:
INFO:TMmodel:Topic 0 -> Doc IDs: ['113-HR-3532', '114-HR-3131', '111-HR-2379', '112-HR-1767', '112-S-2218', '110-HR-1206', '113-HR-1428', '114-HR-3867', '111-HR-227', '114-S-1864']
INFO:TMmodel:Documents assigned to topic clusters based on max probability:
INFO:TMmodel:Topic 0 -> Doc IDs: ['114-S-1864', '111-HR-227', '114-HR-3867', '113-HR-1428', '110-HR-1206', '112-S-2218', '112-HR-1767', '111-HR-2379', '114-HR-3131', '113-HR-3532']
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
INFO:TMmodel:Most Representative documents saved to data/tests/test_test/TMmodel/most_representative_docs.jsonl

Error in pyLDAvis: 
