BERTopic Demo
https://maartengr.github.io/BERTopic/

In [None]:
%pip install bertopic
%pip install bertopic[flair, gensim, spacy, use]
%pip install openai
%pip install matplotlib

First, we create a SQLite database for the ChatGPT conversations.

In [None]:
from bertopic import BERTopic
import chatgpt_db_manager as cm

# Parse the ChatGPT JSON export and create the database.
# `db_file` is read again by the later cells that call `cm.connect_db(db_file)`.
db_file = 'chat.db' # Path of the database file
chatgpt_export = 'conversations.json' #Path of the ChatGPT export file
cm.create_database(db_file)
# parse_json returns two values: the conversation records and the chat records
# (exact structure is defined in chatgpt_db_manager — not visible here).
conversation_infos, all_chats = cm.parse_json(chatgpt_export)

# Insert conversations and chats with updated UUIDs
# NOTE(review): this call takes the file path rather than a connection; re-running
# the cell may insert the same rows again — confirm against chatgpt_db_manager.
cm.insert_conversations_and_chats(db_file, conversation_infos, all_chats)

Now we will calculate topics for chat messages

In [None]:
from bertopic import BERTopic
# Fetch messages and uuids; the connection is released even if the fetch raises
conn = cm.connect_db(db_file)
try:
    chats = cm.fetch_all_chats(conn)
finally:
    conn.close()
messages = chats["message"]
ids = chats["id"]

# Fit a single topic model over every chat message
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(messages)
topic_info = topic_model.get_topic_info()
# Bare last expression: the notebook renders the table richly instead of as plain text
topic_info

In [None]:
# Topic name assigned to each message; relies on `topic_model`, `messages` and `ids`
# still being the objects produced by the chat-message topic cell.
topic_names = topic_model.get_document_info(messages)['Name'].tolist()

# Store the (id, topic name) pairs; always release the connection, even on failure
conn = cm.connect_db(db_file)
try:
    cm.insert_chat_topics(conn,ids,topic_names)
finally:
    conn.close()

Now we will calculate topics for chat pairs

In [None]:
# Fetch the message pairs; the connection is released even if the fetch raises
conn = cm.connect_db(db_file)
try:
    chats = cm.fetch_message_pairs(conn)
finally:
    conn.close()
messages = chats["message"]
parent_ids = chats["parent_id"]
child_ids = chats["child_id"]

# Fit a fresh model on the pair texts.
# NOTE: this rebinds `topic_model`, `messages`, `topics` and `probs` from the previous section.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(messages)
topic_names = topic_model.get_document_info(messages)['Name'].tolist()

# Store one labelled link per (parent, child) pair
conn = cm.connect_db(db_file)
try:
    cm.insert_chat_links(conn, parent_ids, child_ids,topic_names)
finally:
    conn.close()

Now we will calculate Topics for Conversations. 

# TODO: we need to fix fetch_conversations_with_chats; it is returning a list of lists for ["message"].

In [None]:
conn = cm.connect_db(db_file)
try:
    # Fetch messages and uuids
    conversations = cm.fetch_conversations_with_chats(conn)

    # Debug peek at a single message; guarded so that an export with fewer than
    # three conversations (or an empty third one) does not raise IndexError.
    if len(conversations) > 2 and len(conversations[2]["message"]) > 0:
        print(conversations[2]["message"][0])

    # Fit one model per conversation. `topic_model` is rebound on every pass, so
    # after the loop it names the model of the last conversation only.
    for chats in conversations:
        messages = chats["message"]
        # Named `conversation_id` rather than `id` so the builtin is not shadowed notebook-wide
        conversation_id = chats["conversation_id"][0]
        topic_model = BERTopic()
        topics, probs = topic_model.fit_transform(messages)
        # topic_names = topic_model.get_document_info(messages)['Name'].tolist()
        # cm.insert_chat_topics(conn,conversation_id,topic_names)
finally:
    # Release the connection even when fitting fails part-way through the loop
    conn.close()

In [None]:
# Show the topic table of whichever model `topic_model` currently names. When this
# runs right after the per-conversation loop, that is the model fitted on the last
# conversation only (the loop rebinds `topic_model` on every iteration).
topic_model.get_topic_info()

In [None]:
from bertopic import BERTopic
from hdbscan import HDBSCAN

# Explicit clustering configuration handed to BERTopic in place of its default
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
topic_model_cluster = BERTopic(hdbscan_model=hdbscan_model)

# Load the law entries; fetch_entries yields the texts and their uuids
from embedding import fetch_entries
db_name = "law_database_old.db"
texts, uuids = fetch_entries(db_name)

# Fit the clustered model on the law texts
topics_cluster, probs_cluster = topic_model_cluster.fit_transform(texts)

# Last expression of the cell: rendered as the cell output
topic_model_cluster.get_topic_info()

In [None]:
from embedding import fetch_entries
db_name = "law_database_old.db"
docs, uuids = fetch_entries(db_name)
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Prepare embeddings
# docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic
# NOTE: this rebinds `topic_model`; later cells that pair `topic_model` with the chat
# `messages` will see this law-document model instead.
topic_model = BERTopic().fit(docs, embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
# Report only the shape: printing the whole array floods the cell output
print(reduced_embeddings.shape)

# Run the visualization with the original embeddings.
# It must be the LAST expression of the cell: a notebook only renders the final
# expression, so the figure was silently discarded when this call sat mid-cell.
topic_model.visualize_documents(docs, embeddings=embeddings)


In [None]:
# import json
# print(len(topic_model.get_document_info(docs)))
# print(len(reduced_embeddings))

# document_info = topic_model.get_document_info(docs)
# topics = document_info['Name']
# documents = document_info['Document']

# # Assuming `reduced_embeddings` is your UMAP data and it's a list of lists or a NumPy array
# data = [{"umap_x": float(point[0]), "umap_y": float(point[1]), "topic": topic, "document": document} for point, topic, document in zip(reduced_embeddings, topics, documents)]

# # Save the data as a JSON file
# with open("umap-data.json", "w") as f:
#     json.dump(data, f)

In [None]:
# import json

# # Get the document information from BERTopic
# d = topic_model.get_document_info(texts)
# d_doc = d['Document']
# d_name = d['Name']

# # Convert to list if necessary
# s_doc = d_doc.tolist() if hasattr(d_doc, 'tolist') else d_doc
# s_name = d_name.tolist() if hasattr(d_name, 'tolist') else d_name

# # Create a list of dictionaries for JSON output
# data = []
# for i, doc in enumerate(s_doc):
#     # Use the UUID corresponding to the document's text
#     law_entry_uuid = uuids[i]
#     data.append({
#         "umap_x": float(reduced_embeddings[i][0]),
#         "umap_y": float(reduced_embeddings[i][1]),
#         "item_id": law_entry_uuid  # Add the law_entry_uuid here
#     })

# # Save the data as a JSON file
# with open("umap-data.json", "w") as f:
#     json.dump(data, f)

In [None]:
from scipy.cluster import hierarchy as sch
from bertopic import BERTopic
# topic_model = BERTopic()
# topics, probs = topic_model.fit_transform(messages)

# Hierarchical topics
# NOTE(review): `topic_model` and `messages` are rebound by several earlier cells
# (chat pairs, per-conversation loop, law-document embeddings). Confirm that they
# are the pair that was fitted together before running this cell.
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(messages, linkage_function=linkage_function)

# Every earlier cell closes its connection after use, so `conn` is a closed handle
# by the time this cell runs; open a fresh one for the insert and always release it.
conn = cm.connect_db(db_file)
try:
    cm.insert_hierarchical_topics_as_dag(conn, hierarchical_topics)
finally:
    conn.close()
tree = topic_model.get_topic_tree(hierarchical_topics)
# print(tree)

In [None]:
import networkx as nx
import pandas as pd

# Extract mapping from ID to name.
# The frame is called `hierarchical_topics` (built by topic_model.hierarchical_topics);
# the previous `hier_topics` was never defined at notebook level and raised NameError.
topic_to_name = dict(zip(hierarchical_topics.Child_Left_ID, hierarchical_topics.Child_Left_Name))
topic_to_name.update(dict(zip(hierarchical_topics.Child_Right_ID, hierarchical_topics.Child_Right_Name)))
# Truncate long names to 100 characters
topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()}

# Create tree: parent id -> [left child id, right child id], all as strings
tree = {str(row.Parent_ID): [str(row.Child_Left_ID), str(row.Child_Right_ID)]
        for _, row in hierarchical_topics.iterrows()}


def create_hierarchical_topic_dag(hier_topics: pd.DataFrame) -> nx.DiGraph:
    """
    Build a directed graph of parent -> child topic relations.

    Arguments:
        hier_topics: DataFrame describing the topic hierarchy, one merge per row,
                     with the columns `Parent_ID`, `Parent_Name`, `Child_Left_ID`,
                     `Child_Left_Name`, `Child_Right_ID` and `Child_Right_Name`
                     (the output of `topic_model.hierarchical_topics()`).

    Returns:
        A networkx DiGraph whose nodes are topic IDs carrying a `name` attribute
        and whose edges point from each parent to its left and right child.
    """
    # (id column, name column) for the three topics referenced by every row
    node_columns = (
        ('Parent_ID', 'Parent_Name'),
        ('Child_Left_ID', 'Child_Left_Name'),
        ('Child_Right_ID', 'Child_Right_Name'),
    )
    child_id_columns = ('Child_Left_ID', 'Child_Right_ID')

    graph = nx.DiGraph()
    for _, record in hier_topics.iterrows():
        # Register parent, left child, right child (in that order) with their names
        for id_column, name_column in node_columns:
            graph.add_node(record[id_column], name=record[name_column])

        # Connect the parent to each of its two children
        for child_column in child_id_columns:
            graph.add_edge(record['Parent_ID'], record[child_column])

    return graph

# Build the graph from the notebook's `hierarchical_topics` frame
dag = create_hierarchical_topic_dag(hierarchical_topics)

# Optional: render the graph (needs matplotlib: `pip install matplotlib`)
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 8))
nx.draw(
    dag,
    with_labels=True,
    node_size=20,
    node_color="lightblue",
    font_size=10,
    font_weight="bold",
)
plt.show()

In [None]:
#Build rdf
# Collects space-separated "subject predicate object" lines from the chat database
# and writes them to valid.txt, one triple per line.
import chatgpt_db_manager as cm
db_file = "chat.db"

rdf_triples = []
conn = cm.connect_db(db_file)
try:
    # Two triples per chat link: the link's label, and an author-to-author edge
    chat_links = cm.fetch_chat_links(conn)
    for link in chat_links:
        triple = f"{link['source_chat_id']} {link['label']} {link['target_chat_id']}"
        next_message_triple = f"{link['source_chat_id']} {link['source_author']}_to_{link['target_author']} {link['target_chat_id']}"
        rdf_triples.append(triple)
        rdf_triples.append(next_message_triple)

    # One triple per chat topic (kept under its own name instead of reusing `chat_links`)
    chat_topics = cm.fetch_chat_topics(conn)
    for link in chat_topics:
        triple = f"{link['label']} chat_type {link['chat_id']}"
        rdf_triples.append(triple)

    # Example output of rdf_triples
    for triple in rdf_triples[:5]:  # Print the first 5 triples for demonstration
        print(triple)

    topic_links = cm.fetch_topic_links(conn)
    print(topic_links)
    # NOTE(review): subject is the parent label and object the child label; confirm that
    # "parent is_member_of child" is the intended direction of the relation.
    for link in topic_links:
        triple = f"{link['parent_label']} is_member_of {link['child_label']}"
        rdf_triples.append(triple)
finally:
    # The connection was previously never closed; release it even if a fetch fails
    conn.close()

# Example output of rdf_triples: show the LAST 5 so the newly appended topic-link
# triples are visible (printing the first 5 again only repeated the preview above)
for triple in rdf_triples[-5:]:
    print(triple)

# Explicit UTF-8 so non-ASCII labels do not depend on the platform's default encoding
with open("valid.txt", "w", encoding="utf-8") as file:
    for triple in rdf_triples:
        file.write(triple + "\n")


Future Work:
Dynamic Topic Modeling over time
https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html

Topics per class:
https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html