In [1]:
import pandas as pd

In [None]:
data =[
    {"Document Number": 1, "Content": "This is the first document.", "Category": "General"},
    {"Document Number": 2, "Content": "This is the second document.", "Category": "Finance"},
    {"Document Number": 3, "Content": "This is the third document.", "Category": "Health"}]

In [3]:
import pandas as pd

# Define the data
data = [
    {"Document Number": 1, "Content": "My name is Betty", "Category": "General"},
    {"Document Number": 2, "Content": "I am married to a handsome guy called Fiifi", "Category": "Finance"},
    {"Document Number": 3, "Content": "He will be rich in Jesus' name", "Category": "Health"}
]

# Convert to DataFrame
df = pd.DataFrame(data)

# Save as CSV file
df.to_csv("betty.csv", index=False)

print("CSV file 'converted_data.csv' has been created successfully.")


CSV file 'converted_data.csv' has been created successfully.


In [3]:
results = collection.get(include=["documents", "metadatas"])


In [4]:
results["metadatas"]

[]

In [12]:
class BaseTopicModel:
    """
    Abstract base class for all topic models integrated into the system.
    Every custom model (LDA, BERTopic, Top2Vec, etc.) should inherit from this class
    and implement the required methods to be compatible with the system.
    """

    def fit(self, documents, embeddings=None):
        raise NotImplementedError("fit() must be implemented.")

    def get_topics(self):
        raise NotImplementedError("get_topics() must be implemented.")

    def get_document_topics(self):
        raise NotImplementedError("get_document_topics() must be implemented.")

    def get_topic_embeddings(self):
        return None

    def get_topic_scores(self):
        return None

    def get_model_name(self):
        return self.__class__.__name__

    def transform(self, documents, embeddings=None):
        raise NotImplementedError("transform() is not implemented.")

    @property
    def requires_embeddings(self):
        return False

    def save(self, filepath):
        """
        Save the trained topic model to disk.

        Args:
            filepath (str): File path where the model should be saved.
        """
        raise NotImplementedError("save() must be implemented by subclasses.")

    @classmethod
    def load(cls, filepath):
        """
        Load a saved topic model from disk.

        Args:
            filepath (str): Path to the saved model file.

        Returns:
            BaseTopicModel: An instance of the model.
        """
        raise NotImplementedError("load() must be implemented by subclasses.")


In [14]:
from gensim import corpora, models
# from .base_model import BaseTopicModel


class LDAAdapter(BaseTopicModel):
    def __init__(self, num_topics=10, passes=10, random_state=42):
        self.num_topics = num_topics
        self.passes = passes
        self.random_state = random_state
        self.model = None
        self.dictionary = None
        self.corpus = None
        self.documents = []

    def fit(self, documents, embeddings=None):
        self.documents = [doc.split() for doc in documents]
        self.dictionary = corpora.Dictionary(self.documents)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.documents]
        self.model = models.LdaModel(
            self.corpus,
            num_topics=self.num_topics,
            id2word=self.dictionary,
            passes=self.passes,
            random_state=self.random_state
        )

    def get_topics(self):
        return {
            topic_id: [word for word, _ in self.model.show_topic(topic_id, topn=10)]
            for topic_id in range(self.num_topics)
        }

    def get_document_topics(self):
        doc_topics = {}

        for i, bow in enumerate(self.corpus):
            dist = self.model.get_document_topics(bow, minimum_probability=0.0)
            dominant_topic, score = max(dist, key=lambda x: x[1])
            doc_topics[i] = {
                "topic_id": dominant_topic,
                "topic_score": float(score),
                "topic_distribution": {tid: float(prob) for tid, prob in dist}
            }

        return doc_topics

    def get_topic_scores(self):
        # Optional: Return frequency of each topic
        topic_freq = [0] * self.num_topics
        for bow in self.corpus:
            for tid, prob in self.model.get_document_topics(bow):
                topic_freq[tid] += prob
        return {i: float(score) for i, score in enumerate(topic_freq)}

    def get_topic_embeddings(self):
        # Gensim LDA does not support topic embeddings
        return None

    def get_model_name(self):
        return "LDA"

    def save(self, filepath):
        self.model.save(filepath + ".model")
        self.dictionary.save(filepath + ".dict")

    @classmethod
    def load(cls, filepath):
        instance = cls()
        instance.model = models.LdaModel.load(filepath + ".model")
        instance.dictionary = corpora.Dictionary.load(filepath + ".dict")
        return instance


In [15]:
model = LDAAdapter()

In [16]:
import chromadb
import json
client = chromadb.PersistentClient(path="database/myDB")
collection = client.get_or_create_collection(name="documents")
collect = client.get_or_create_collection(name="doc")

results = collection.get(include=["documents", "metadatas"])

documents = results["documents"]
ids = results["ids"]
metadatas = results["metadatas"]

model.fit(documents)

# print("fit complete")

# # === 4. Get topic assignments ===
# doc_topics = model.get_document_topics()

# # print(doc_topics)
# batch_ids = []
# batch_docs = []
# batch_metadatas = []

# for i, doc in enumerate(documents):
#     doc_id = ids[i]
#     base_metadata = metadatas[i] or {}
#     topic_data = doc_topics.get(i, {})

#     metadata = {
#         "original_content": base_metadata.get("original_content", ""),
#         "file_name": base_metadata.get("file_name", f"doc_{i}"),
#         "label": base_metadata.get("label"),
#         "model": model.get_model_name(),
#         "topic_id": topic_data.get("topic_id"),
#         "topic_score": topic_data.get("topic_score"),
#         "topic_distribution": json.dumps(topic_data.get("topic_distribution", {}))
#     }

#     batch_ids.append(doc_id)
#     batch_docs.append(doc)
#     batch_metadatas.append(metadata)
#     # print(i)

#     # ✅ Batch insert
# collect.add(
#     ids=batch_ids,
#     documents=batch_docs,
#     metadatas=batch_metadatas
# )


In [19]:
import chromadb
import json
client = chromadb.PersistentClient(path="database/myDB")
collection = client.get_or_create_collection(name="documents")
collect = client.get_or_create_collection(name="doc")

results = collection.get(include=["documents", "metadatas"])

documents = results["documents"]
ids = results["ids"]
metadatas = results["metadatas"]

model.fit(documents)
model.save("model/tryLDA")


In [23]:
# Load the model from file (without extension)
lda_model = LDAAdapter.load("/Users/danielstephens/Desktop/TOVA/model/tryLDA")

# Now you can use:
topics = lda_model.get_topics()
doc_topics = lda_model.get_document_topics()

TypeError: 'NoneType' object is not iterable

In [38]:
print(topic_data.get("topic_id"))

8


In [35]:
results = collect.get(
    # where={"topic_id": 2},
    include=["documents", "metadatas"]
)

for i in range(len(results["documents"])):
    print(f"\n📄 Document {i+1}")
    print("ID:", results["ids"][i])
    print("Text:", results["documents"][i])
    print("Metadata:", results["metadatas"][i])



📄 Document 1
ID: dataWithScores.xlsx_0
Text: enable cooperative localization connect semiautonomous sparse laplacian processing cooperative localization receive extensive interest scientific community include robotic optimization signal process wireless expect major aspect number crucial application field connect semi cavs collision avoidancewarne cooperative adaptive cruise safely navigation etc mobile key provide connectivity vx allow cavs share entity datum collect measure typical measurement usually deploy problem absolute position global positioning gps relative distance neighbouring relative angle azimuth angle light range radio range radar provide cooperative estimation perform multi modalfusion interconnect base graph signal processing tool know laplacian graph processing significantly outperform exist term attain computational complexity

📄 Document 2
ID: dataWithScores.xlsx_1
Text: mmwave vehicleroad cooperative drive vehicleroad cooperation large datum need transmit connect

In [23]:
import chromadb
client = chromadb.PersistentClient(path="database/myDB")
collection = client.get_or_create_collection(name="documents")
collect = client.get_or_create_collection(name="doc")

In [28]:
results = collection.get(include=["documents", "metadatas"])
# # print(chroma_data)
documents = results["documents"]
metadatas = results["metadatas"]
ids = results["ids"]

In [29]:
for i in range(min(5, len(results["ids"]))):
    print(f"\n📄 Document {i + 1}")
    print("ID:", results["ids"][i])
    print("Text:", results["documents"][i])
    print("Metadata:", results["metadatas"][i])



📄 Document 1
ID: dataWithScores.xlsx_0
Text: enable cooperative localization connect semiautonomous sparse laplacian processing cooperative localization receive extensive interest scientific community include robotic optimization signal process wireless expect major aspect number crucial application field connect semi cavs collision avoidancewarne cooperative adaptive cruise safely navigation etc mobile key provide connectivity vx allow cavs share entity datum collect measure typical measurement usually deploy problem absolute position global positioning gps relative distance neighbouring relative angle azimuth angle light range radio range radar provide cooperative estimation perform multi modalfusion interconnect base graph signal processing tool know laplacian graph processing significantly outperform exist term attain computational complexity

📄 Document 2
ID: dataWithScores.xlsx_1
Text: mmwave vehicleroad cooperative drive vehicleroad cooperation large datum need transmit connect

In [10]:
model.fit(documents)

print("fit complete")

# === 4. Get topic assignments ===
doc_topics = model.get_document_topics()

print(doc_topics)

fit complete
{0: {'topic_id': 8, 'topic_score': 0.9907179474830627, 'topic_distribution': {0: 0.0010313191451132298, 1: 0.0010315021499991417, 2: 0.001031360705383122, 3: 0.0010312909726053476, 4: 0.0010312637314200401, 5: 0.001031483174301684, 6: 0.0010313271777704358, 7: 0.0010312424274161458, 8: 0.9907179474830627, 9: 0.0010312877129763365}}, 1: {'topic_id': 9, 'topic_score': 0.9826870560646057, 'topic_distribution': {0: 0.0019237818196415901, 1: 0.001923600328154862, 2: 0.0019236025400459766, 3: 0.0019237394444644451, 4: 0.0019236064981669188, 5: 0.0019236212829127908, 6: 0.001923627918586135, 7: 0.0019236509688198566, 8: 0.001923736766912043, 9: 0.9826870560646057}}, 2: {'topic_id': 2, 'topic_score': 0.9879940748214722, 'topic_distribution': {0: 0.0013339875731617212, 1: 0.0013339832657948136, 2: 0.9879940748214722, 3: 0.0013339344877749681, 4: 0.0013340129517018795, 5: 0.0013340390287339687, 6: 0.001333985012024641, 7: 0.0013340342557057738, 8: 0.0013339522993192077, 9: 0.0013339

In [5]:
model.get_model_name()

NameError: name 'model' is not defined

In [12]:
import json

In [24]:
len(documents)

383

In [54]:
import pandas as pd
import plotly.express as px

# Assuming you've already fetched everything
results = collect.get(include=["documents", "metadatas"])

# Create a DataFrame
df = pd.DataFrame({
    "id": results["ids"],
    "document": results["documents"],
    "topic_id": [meta.get("topic_id", -1) for meta in results["metadatas"]]
})

# Group by topic
topic_counts = df["topic_id"].value_counts().reset_index()
topic_counts.columns = ["topic_id", "document_count"]




In [4]:
model.get_topic_embeddings()

NameError: name 'model' is not defined

In [None]:
# Load the model from file (without extension)
lda_model = LDAAdapter.load("/Users/danielstephens/Desktop/TOVA/model/tryLDA")

# Now you can use:
topics = lda_model.get_topics()
doc_topics = lda_model.get_document_topics()

In [25]:
# You must re-tokenize and recreate the corpus
tokenized_docs = [doc.split() for doc in documents]
bows = [lda_model.dictionary.doc2bow(doc) for doc in tokenized_docs]

# Optional: set corpus if you want to use .get_document_topics()
lda_model.documents = tokenized_docs
lda_model.corpus = bows

# Now call get_document_topics
doc_topics = lda_model.get_document_topics()

In [26]:
doc_topics

{0: {'topic_id': 8,
  'topic_score': 0.9907179474830627,
  'topic_distribution': {0: 0.0010313191451132298,
   1: 0.0010315021499991417,
   2: 0.001031360705383122,
   3: 0.0010312909726053476,
   4: 0.0010312637314200401,
   5: 0.001031483174301684,
   6: 0.0010313271777704358,
   7: 0.0010312424274161458,
   8: 0.9907179474830627,
   9: 0.0010312877129763365}},
 1: {'topic_id': 9,
  'topic_score': 0.9826870560646057,
  'topic_distribution': {0: 0.0019237818196415901,
   1: 0.001923600328154862,
   2: 0.0019236025400459766,
   3: 0.0019237394444644451,
   4: 0.0019236064981669188,
   5: 0.0019236212829127908,
   6: 0.001923627918586135,
   7: 0.0019236509688198566,
   8: 0.001923736766912043,
   9: 0.9826870560646057}},
 2: {'topic_id': 2,
  'topic_score': 0.9879940748214722,
  'topic_distribution': {0: 0.0013339875731617212,
   1: 0.0013339832657948136,
   2: 0.9879940748214722,
   3: 0.0013339344877749681,
   4: 0.0013340129517018795,
   5: 0.0013340390287339687,
   6: 0.00133398501