In [3]:
from src.commands.train import run
import chromadb

In [4]:
# Open the on-disk Chroma client stored under "database/myDB".
# NOTE(review): the path is relative, so it presumably resolves against the
# kernel's working directory — confirm the notebook is launched from the repo root.
client = chromadb.PersistentClient(path="database/myDB")
# Fetch the "documents" collection, creating it if it does not exist yet.
collection = client.get_or_create_collection(name="documents")

In [5]:
def get_corpus_data(corpus_name, coll, limit=10000):
    """
    Retrieve documents, metadata, and ids for a given corpus name.

    Args:
        corpus_name: Value matched against the ``corpus_name`` metadata field.
        coll: Collection-like object exposing
            ``get(where=..., include=..., limit=...)`` and returning a mapping
            with ``documents``, ``metadatas`` and ``ids`` entries.
        limit: Maximum number of records to request. Defaults to 10000 (the
            previously hard-coded cap) and is forwarded unchanged to ``coll.get``.

    Returns:
        A ``(documents, metadatas, ids)`` tuple. Any entry that is missing
        from the result, or present but ``None``, is returned as an empty list.

    Warns:
        UserWarning: when the number of returned ids reaches ``limit``, since
            the corpus may then have been truncated.
    """
    import warnings

    results = coll.get(
        where={"corpus_name": corpus_name},  # filter by corpus name
        include=["documents", "metadatas"],  # only documents and metadatas are needed
        limit=limit,
    )

    # `or []` covers both a missing key and a key that is present but None.
    documents = results.get("documents") or []
    metadatas = results.get("metadatas") or []
    ids = results.get("ids") or []  # ids are returned even without being listed in `include`

    # Hitting the cap exactly is indistinguishable from truncation, so surface it
    # instead of silently handing back a partial corpus.
    if limit is not None and len(ids) >= limit:
        warnings.warn(
            f"get_corpus_data returned {len(ids)} records for '{corpus_name}', "
            f"which reaches limit={limit}; results may be truncated.",
            stacklevel=2,
        )

    return documents, metadatas, ids



In [6]:
# Fetch every record stored for the corpus and report how many were found.
corpus_name = "billSample"
documents, metadatas, ids = get_corpus_data(corpus_name, collection)

print(f"Found {len(documents)} documents in '{corpus_name}'.")

# Preview only the first few records: printing every document together with its
# full metadata floods the notebook output and bloats the saved file.
preview_count = 5
for doc, meta, id_ in zip(
    documents[:preview_count], metadatas[:preview_count], ids[:preview_count]
):
    print(f"ID: {id_}")
    print(f"Document Preview: {doc[:100]}...")  # first 100 characters only
    print(f"Metadata: {meta}")
    print("-" * 40)

Found 1000 documents in 'billSample'.
ID: dat/bills_sample_100.csv_0
Document Preview: amend dodd frank wall street reform consumer protection direct housingandurbandevelopment hud publis...
Metadata: {'corpus_name': 'billSample', 'file_name': 'dat/bills_sample_100.csv', 'original_content': 'amends dodd frank wall street reform consumer protection act direct secretary housing_and_urban_development hud publish informing mortgagees obligation provide prospective homebuyers specified hud public outreach publications concerning importance obtaining independent home inspections requires persons providing housing counseling hud approved counseling agencies trained certain voluntary home inspection training module distribute explain certain counseling aids requires homeownership counseling program hud administration provided organizations counselors hud certified competent provide voluntary home inspection counseling authorizes hud withhold withdraw suspend housing counseling certifications n

In [7]:
# Show only the first few documents; echoing the whole list dumps every document
# into the notebook output.
documents[:5]

['amend dodd frank wall street reform consumer protection direct housingandurbandevelopment hud publish inform mortgagee obligation prospective homebuyer specify hud outreach publication concern importance obtain independent home inspection person housing counseling hud approve counseling agency train voluntary home inspection training module distribute explain counseling aid homeownership counseling hud administration organization counselor hud certify competent voluntary home inspection counseling authorize hud withhold withdraw suspend housing counseling certification non compliant housing counselor counseling entity hud comprehensive train hud staff contractor individual entity housing counseling specify hud fund counseling consumer voluntary home inspection training include development training module train counselor counsel aid housing counselor suitable distribution consumer consumer ongoing assistance housing counselor hud discretion develop new independent protocol amend exist

In [10]:
# Training configuration consumed by the `run(...)` call in the following cell.
model = "tomotopyLDA"  # model identifier, passed to `run` as `model=`
# NOTE(review): machine-specific absolute path, and not referenced by the visible
# `run(...)` call (which passes `data=documents`) — confirm whether it is still
# needed, and if so make it relative to a configurable data directory.
data_path = "/Users/danielstephens/Desktop/TOVA/data/dat/bills_sample_100.csv"
text_col = "tokenized_text"  # passed to `run` as `text_col=`
output_dir = "models/tomotopy"  # passed to `run` as `output=`

In [11]:
# Train the topic model on the documents fetched from the Chroma collection.
# NOTE(review): this rebinds `model` from the name string set in the config cell
# to whatever `run` returns, so re-running this cell on its own passes that
# return value back in as `model=`. Re-run the config cell first, or consider
# binding the result to a separate name.
# NOTE(review): `documents` is a list of strings while `text_col` names a column —
# presumably `run` accepts both forms; verify against its signature.
model = run(model=model, data=documents, text_col=text_col, output=output_dir)

Running training with model: tomotopyLDA


src.topic_models.base_model - INFO - Model path models/tomotopy already exists. Saving a copy...
src.topic_models.base_model - INFO - TomotopyLDATMmodel initialized with num_topics=50, num_iters=1000, alpha=0.1, eta=0.01.
src.topic_models.base_model - INFO - Loaded processed data
src.topic_models.base_model - INFO - Creating TomotopyLDA object and adding docs...
src.topic_models.base_model - INFO - Training TomotopyLDA model with 50 topics...


Loaded config file static/config/config.yaml and section logger.
Logs will be saved in data/logs
Loaded config file static/config/config.yaml and section topic_modeling.


Training Progress:  38%|███▊      | 380/1000 [00:01<00:02, 266.75it/s]

Iteration: 300, Log-likelihood: -7.394252775908079, Perplexity: 1626.6090279403656


Training Progress:  65%|██████▌   | 650/1000 [00:02<00:01, 278.24it/s]

Iteration: 600, Log-likelihood: -7.359735110904634, Perplexity: 1571.42025573255


KeyboardInterrupt: 

Training Progress:  66%|██████▌   | 660/1000 [00:19<00:01, 278.24it/s]