In [5]:
from util import * 
import chromadb

In [6]:
import chromadb
client = chromadb.PersistentClient(path="database/myDB")
collection = client.get_or_create_collection(name="documents")
registry = client.get_or_create_collection("corpus_model_registry")

In [5]:
collection.delete(where={"corpus_name": "Corpus 3"})

In [7]:
import os

In [7]:
model_dir = os.path.join("models")
    
if not os.path.exists(model_dir):
    print("nothing") # Return empty if no directory

# List all directories/files inside model_dir
models = [
    name for name in os.listdir(model_dir)
    if os.path.isdir(os.path.join(model_dir, name))
]
print(models)


['model1']


In [13]:
import requests

url = "http://127.0.0.1:8989/queries/model-info"
headers = {
    "accept": "application/json",
    "Content-Type": "application/json"
}
payload = {
    "config_path": "static/config/config.yaml",
    "model_path": "models/model1"
}

response = requests.post(url, headers=headers, json=payload)

# Print response
print(response.status_code)
allInfo = response.json()


200


In [162]:

headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
payload = {
    "config_path": "static/config/config.yaml",
    "model_path": "models/myModel"
}

response = requests.post("http://127.0.0.1:8989/queries/model-info", headers=headers, json=payload)
print("Status Code:", response.status_code)

if response.status_code != 200:
    raise Exception(f"Request failed with status code {response.status_code}: {response.text}")

allInfo = response.json()

Status Code: 200


In [174]:
def build_theme_data_dict(allInfo, doc_collection):
    """
    Builds a dictionary of theme data indexed by ID, with text and score for each assigned document.
    """
    theme_dict = {}
    topics_info = allInfo.get("Topics Info", {})
    topic_keys = list(topics_info.keys())

    for idx, topic_key in enumerate(topic_keys):
        topic_data = topics_info[topic_key]
        theme_id = topic_key

        # Extract top_doc text
        top_doc_text = ""
        top_docs = topic_data.get("Top Documents", {})
        if isinstance(top_docs, dict) and top_docs:
            first_doc_key = next(iter(top_docs))
            top_doc_text = top_docs[first_doc_key]

        # Assigned Documents with Scores
        docs_prob = topic_data.get("Assigned Documents", {})
        assigned_results = []
        if docs_prob:
            ids = list(docs_prob.keys())
            results = doc_collection.get(ids=ids, include=["documents"])
            assigned_results = [
                {
                    "id": doc_id,
                    "text": text,
                    "score": docs_prob.get(doc_id, 0.0)
                }
                for doc_id, text in zip(results["ids"], results["documents"])
            ]

        

        # Build theme data
        theme_data = {
            "id": theme_id,
            "label": topic_data.get("Label", f"Topic {idx}"),
            "prevalence": topic_data["Size"],
            "coherence": topic_data["Coherence (NPMI)"],
            "uniqueness": topic_data["Entropy"],
            "keywords": topic_data["Keywords"],
            "summary": topic_data["Summary"],
            "top_doc": top_doc_text,
            "theme_matches": len(docs_prob),
            "similar_themes": topic_data.get("Similar Topics (Coocurring)", []),
            "trend": [],  # Add if you compute this
            "assigned_docs": assigned_results
        }

        theme_dict[theme_id] = theme_data

    return theme_dict

In [175]:
themeDetails = build_theme_data_dict(allInfo, collection)
summary = extract_topic_summaries(allInfo)
themeSummary = sorted(summary, key=lambda t: t["document_count"], reverse=True)


In [178]:
themeDetails["t1"]

{'id': 't1',
 'label': 'Topic 1',
 'prevalence': '11.89%',
 'coherence': 0.015335761657890395,
 'uniqueness': 0.7228546738624573,
 'keywords': 'research, projects, program, project, administrator, study, report, water, requires, small, establish, assistance, management, grants, develop',
 'summary': 'Placeholder for summary from Topic 1',
 'top_doc': 1.0,
 'theme_matches': 208,
 'similar_themes': [{'ID': 9,
   'Label': 'Topic 9',
   'Similarity': 0.01683720387518406},
  {'ID': 4, 'Label': 'Topic 4', 'Similarity': -0.07277941703796387},
  {'ID': 3, 'Label': 'Topic 3', 'Similarity': -0.0780392587184906},
  {'ID': 8, 'Label': 'Topic 8', 'Similarity': -0.08048803359270096},
  {'ID': 7, 'Label': 'Topic 7', 'Similarity': -0.1020778939127922}],
 'trend': [],
 'assigned_docs': [{'id': 'neww.csv_5',
   'text': 'amends water_resources_development_act direct secretary army enter local cooperation agreement non federal interest provide assistance designing constructing project provide continued sa

In [185]:

def fetch_and_process_model_info(model_path: str, endpoint: str = "http://127.0.0.1:8989/queries/model-info"):
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    payload = {
        "config_path": "static/config/config.yaml",
        "model_path": f"models/{model_path}"
    }

    response = requests.post(endpoint, headers=headers, json=payload)
    print("Status Code:", response.status_code)
    
    if response.status_code != 200:
        raise Exception(f"Request failed with status code {response.status_code}: {response.text}")
    
    allInfo = response.json()

    themeDetails = build_theme_data_dict(allInfo, collection)
    summary = extract_topic_summaries(allInfo)
    themeSummary = sorted(summary, key=lambda t: t["document_count"], reverse=True)
    return themeSummary, themeDetails


def call_gateway(url, method="POST", payload=None, headers=None):
    """
    Call a gateway route with GET or POST.

    Args:
        url (str): The gateway endpoint URL.
        method (str): HTTP method ("GET" or "POST").
        params (dict, optional): Query parameters for GET.
        payload (dict, optional): JSON body for POST.
        headers (dict, optional): Headers to include.

    Returns:
        dict: JSON response if successful.
        str: Error message if request fails.
    """
    try:
        method = method.upper()
        if method == "GET":
            response = requests.get(url, params=["GET, POST"], headers=headers)
        elif method == "POST":
            headers = headers or {"Content-Type": "application/json"}
            response = requests.post(url, json=payload, headers=headers)
        else:
            return f"Unsupported method: {method}"

        if response.ok:
            return response.json()
        else:
            return f"Error {response.status_code}: {response.text}"
    except Exception as e:
        return f"Exception occurred: {str(e)}"


def build_theme_data_dict(allInfo, doc_collection):
    """
    Builds a dictionary of theme data indexed by ID, with text and score for each assigned document.
    """
    theme_dict = {}
    topics_info = allInfo.get("Topics Info", {})
    topic_keys = list(topics_info.keys())

    for idx, topic_key in enumerate(topic_keys):
        topic_data = topics_info[topic_key]
        theme_id = topic_key

        # Extract top_doc text
        top_doc_text = ""
        top_docs = topic_data.get("Top Documents", {})
        if isinstance(top_docs, dict) and top_docs:
            first_doc_key = next(iter(top_docs))
            top_doc_text = top_docs[first_doc_key]

        # Assigned Documents with Scores
        docs_prob = topic_data.get("Assigned Documents", {})
        assigned_results = []
        if docs_prob:
            ids = list(docs_prob.keys())
            results = doc_collection.get(ids=ids, include=["documents"])
            assigned_results = [
                {
                    "id": doc_id,
                    "text": text,
                    "score": docs_prob.get(doc_id, 0.0)
                }
                for doc_id, text in zip(results["ids"], results["documents"])
            ]

        

        # Build theme data
        theme_data = {
            "id": theme_id,
            "label": topic_data.get("Label", f"Topic {idx}"),
            "prevalence": topic_data["Size"],
            "coherence": topic_data["Coherence (NPMI)"],
            "uniqueness": topic_data["Entropy"],
            "keywords": topic_data["Keywords"],
            "summary": topic_data["Summary"],
            "top_doc": top_doc_text,
            "theme_matches": len(docs_prob),
            "similar_themes": topic_data.get("Similar Topics (Coocurring)", []),
            "trend": [],  # Add if you compute this
            "assigned_docs": assigned_results
        }

        theme_dict[theme_id] = theme_data

    return theme_dict


def get_assigned_documents_with_scores(doc_collection, docs_prob):
    """
    Given a ChromaDB document collection and a dictionary of Assigned Documents
    (with scores), return a list of {id, text, score} for each document.
    """
    if not docs_prob:
        return []

    doc_ids = list(docs_prob.keys())

    # Fetch document texts from ChromaDB
    results = doc_collection.get(
        ids=doc_ids,
        include=["documents"]
    )

    return [
        {
            "id": doc_id,
            "text": text,
            "score": docs_prob.get(doc_id, 0.0)
        }
        for doc_id, text in zip(results["ids"], results["documents"])
    ]

def extract_topic_summaries(allInfo):
    topic_info = allInfo.get("Topics Info", {})
    topic_summaries = []

    for idx, (topic_key, topic_data) in enumerate(topic_info.items()):
        label = topic_data.get("Label", f"Topic {idx}")
        document_count = len(topic_data.get("Assigned Documents", []))

        topic_summaries.append({
            "id": idx + 1,
            "label": label,
            "document_count": document_count
        })

    return topic_summaries


import json
from pathlib import Path

import json
from pathlib import Path

def load_or_create_dashboard_json(path: str = "static/config/dashboardData.json") -> dict:
    """
    Loads existing dashboard JSON data if available,
    or creates an empty JSON file and returns an empty dict.

    Args:
        path (str): Path to the JSON file.

    Returns:
        dict: The dashboard data.
    """
    file_path = Path(path)
    file_path.parent.mkdir(parents=True, exist_ok=True)

    if file_path.exists():
        # Load existing data
        with file_path.open("r", encoding="utf-8") as f:
            try:
                data = json.load(f)
                print(f"📂 Loaded dashboard data from {file_path.resolve()}")
            except json.JSONDecodeError:
                print(f"⚠️ File was invalid. Resetting to empty JSON.")
                data = {}
    else:
        # Create empty file
        data = {}
        with file_path.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f"🆕 Created empty dashboard JSON at {file_path.resolve()}")

    return data


def add_model_to_dashboard(model_name: str, themeSummary, themeDetails, path: str = "static/config/dashboardData.json") -> dict:
    """
    Adds a new model entry to an existing dashboard JSON file.

    Args:
        model_name (str): The name of the model to add.
        path (str): Path to the JSON file.

    Returns:
        dict: The updated dashboard data.
    """
    file_path = Path(path)
    if file_path.exists():
        with file_path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = {}

    # Add new model entry
    data[model_name] = {
        "Theme Summary": themeSummary,
        "Theme Details": themeDetails
    }

    # Write back to file
    with file_path.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    print(f"✅ Added model '{model_name}' to dashboard JSON")
    return data



In [190]:
fetch_and_process_model_info(model_path="myModel")

Status Code: 400


Exception: Request failed with status code 400: {"detail":"Model path not found or not a directory."}

In [192]:
add_model_to_dashboard("myModel1", themeSummary=themeSummary, themeDetails=themeDetails)

✅ Added model 'myModel1' to dashboard JSON


{'myModel': {'Theme Summary': [{'id': 't0',
    'label': 'Topic 0',
    'document_count': 383},
   {'id': 't2', 'label': 'Topic 2', 'document_count': 253},
   {'id': 't3', 'label': 'Topic 3', 'document_count': 232},
   {'id': 't1', 'label': 'Topic 1', 'document_count': 208},
   {'id': 't5', 'label': 'Topic 5', 'document_count': 201},
   {'id': 't8', 'label': 'Topic 8', 'document_count': 155},
   {'id': 't6', 'label': 'Topic 6', 'document_count': 151},
   {'id': 't4', 'label': 'Topic 4', 'document_count': 148},
   {'id': 't7', 'label': 'Topic 7', 'document_count': 148},
   {'id': 't9', 'label': 'Topic 9', 'document_count': 119}],
  'Theme Details': {'t0': {'id': 't0',
    'label': 'Topic 0',
    'prevalence': '18.44%',
    'coherence': 0.06168143053653866,
    'uniqueness': 0.6808832883834839,
    'keywords': 'tax, credit, income, medicare, code, internal, revenue, health, security, benefits, care, social, individual, coverage, property',
    'summary': 'Placeholder for summary from Top

In [17]:
data = read_dashboard_json()

📂 Successfully read dashboard data from /Users/danielstephens/Desktop/TOVA/static/config/dashboardData.json


In [20]:
data

{'myModel': {'Theme Summary': [{'id': 't0',
    'label': 'Topic 0',
    'document_count': 383},
   {'id': 't2', 'label': 'Topic 2', 'document_count': 253},
   {'id': 't3', 'label': 'Topic 3', 'document_count': 232},
   {'id': 't1', 'label': 'Topic 1', 'document_count': 208},
   {'id': 't5', 'label': 'Topic 5', 'document_count': 201},
   {'id': 't8', 'label': 'Topic 8', 'document_count': 155},
   {'id': 't6', 'label': 'Topic 6', 'document_count': 151},
   {'id': 't4', 'label': 'Topic 4', 'document_count': 148},
   {'id': 't7', 'label': 'Topic 7', 'document_count': 148},
   {'id': 't9', 'label': 'Topic 9', 'document_count': 119}],
  'Theme Details': {'t0': {'id': 't0',
    'label': 'Topic 0',
    'prevalence': '18.44%',
    'coherence': 0.06168143053653866,
    'uniqueness': 0.6808832883834839,
    'keywords': ['tax',
     'credit',
     'income',
     'medicare',
     'code',
     'internal',
     'revenue',
     'health',
     'security',
     'benefits',
     'care',
     'social',
  

In [24]:
allDocuments = [
    doc
    for details in data["myModel"]["Theme Details"].values()
    for doc in details.get("documents", [])
]


In [28]:
allDocuments

[{'id': 'neww.csv_1',
  'text': 'family savings act amends economic growth tax relief reconciliation act extend provisions allowing increased annual contributions coverdell education savings accounts amends internal revenue code allow tax free distributions coverdell education savings account time homebuyer expenses permit rollovers coverdell education savings accounts roth individual retirement accounts roth iras rename coverdell education savings accounts',
  'score': 1.0,
  'theme': 'Topic 0'},
 {'id': 'neww.csv_2',
  'text': 'internal revenue code respect tax credit health insurance expenses small employers expand eligibility credit employers having currently time equivalent employees annual wages exceed currently raise employee threshold triggering phaseout credit time employees eliminate requirement employers contribute percentage cost employee health insurance cap limiting eligible employer contributions average premiums paid state health care exchange credit available employee 

In [30]:
from datetime import datetime
from zoneinfo import ZoneInfo
results = registry.get()
model_name = "model2"

for id_, doc, meta in zip(results["ids"], results["documents"], results["metadatas"]):
    if meta.get("model_name") == model_name:
        model = {
            "model_id": meta.get("model_id", ""),
            "document": doc,
            "model_type": meta.get("model_type", ""),
            "model_name": meta.get("model_name", ""),
            "num_topics": meta.get("num_topic", ""),
            "corpus_names": meta.get("corpus_names", ""),
            "trained_on": datetime.fromisoformat(meta.get("trained_on", ""))
                .replace(tzinfo=ZoneInfo("UTC"))
                .astimezone(ZoneInfo("America/New_York"))
                .strftime("%Y-%m-%d %I:%M %p %Z")
        }

        print(f"\n\n{model}\n\n")



{'model_id': 'tomotopyLDA_Corpus2_Corpus4_billSample1_2025-06-08T23:50:53.343834', 'document': 'Trained tomotopyLDA on Corpus2, Corpus4, billSample1', 'model_type': 'tomotopyLDA', 'model_name': 'model2', 'num_topics': 60, 'corpus_names': 'Corpus2, Corpus4, billSample1', 'trained_on': '2025-06-08 07:50 PM EDT'}




In [33]:
model["model_id"]

'tomotopyLDA_Corpus2_Corpus4_billSample1_2025-06-08T23:50:53.343834'

In [43]:
dashboard_data = read_dashboard_json()
modelData = dashboard_data["model1"]
theme_entries = modelData.get("Theme Details", [])
# Extract ID, label, and coordinates
theme_coords = []
for modelkey, entry in theme_entries.items():
    theme_coords.append({
        "id": entry.get("topic_id") or entry.get("id"),
        "label": entry.get("label") or entry.get("theme"),
        "x": entry.get("Coordinates", [None, None])[0],
        "y": entry.get("Coordinates", [None, None])[1]
    })

# Print the result
for theme in theme_coords:
    print(theme)

📂 Successfully read dashboard data from /Users/danielstephens/Desktop/TOVA/static/config/dashboardData.json


In [45]:
# Adjust based on actual key, e.g., data["theme_details"] or data["themes"]




{'id': 't0', 'label': 'Topic 0', 'x': 0.09405651152169325, 'y': -0.13195343628206405}
{'id': 't1', 'label': 'Topic 1', 'x': -0.06298858324122691, 'y': 0.05257031394046082}
{'id': 't2', 'label': 'Topic 2', 'x': 0.07054789127316195, 'y': 0.1486368592478381}
{'id': 't3', 'label': 'Topic 3', 'x': -0.04874434498314165, 'y': -0.06403453206423088}
{'id': 't4', 'label': 'Topic 4', 'x': 0.1515540126233255, 'y': -0.12582239421260405}
{'id': 't5', 'label': 'Topic 5', 'x': -0.0006423262001792895, 'y': -0.11716255147800211}
{'id': 't6', 'label': 'Topic 6', 'x': -0.03643146649637813, 'y': -0.153154471472813}
{'id': 't7', 'label': 'Topic 7', 'x': 0.016653180415590295, 'y': 0.03316491222661456}
{'id': 't8', 'label': 'Topic 8', 'x': 0.14118671048355294, 'y': -0.0954465515422419}
{'id': 't9', 'label': 'Topic 9', 'x': -0.17496730410897893, 'y': -0.007255434971193988}
{'id': 't10', 'label': 'Topic 10', 'x': -0.07477731560884547, 'y': -0.11747279853639148}
{'id': 't11', 'label': 'Topic 11', 'x': -0.0197111