In [None]:
# Depends on:
#     code/analysis/101_segments_embeddings.py
#     code/analysis/102_museum_objects_embeddings.py
#     code/analysis/121_dim_reduction_test.py

# Local setup
from local_dir_setup import *

# Libraries
import os
import string
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from bertopic import BERTopic
import plotly.express as px

## Load Data

In [None]:
# Setup: locations of the pickled embeddings inside the storage directory.
# Both names point at files (file_trans_mpnet is opened with open() further down).
# Embeddings written by 101_segments_embeddings.py
file_trans_mpnet = f"{dir_storage}---transcripts-embeddings-file---"
# Embeddings written by 102_museum_objects_embeddings.py
file_museum_mpnet = f"{dir_storage}---museum-objects-embeddings-file---"

In [None]:
# Load the transcript-segment embeddings
# see also https://www.sbert.net/examples/applications/computing-embeddings/README.html

# The file handle is only needed for unpickling, so it is closed right away.
# NOTE: pickle.load executes arbitrary code -- only use it on files produced by this project.
with open(file_trans_mpnet, "rb") as embeddings_file:
    data_trans_mpnet = pickle.load(embeddings_file)

# Unpack the individual fields stored under each key of the pickled object.
trans_id_mpnet = data_trans_mpnet["transcript"]
trans_start_id_mpnet = data_trans_mpnet["start_id"]
trans_start_time_mpnet = data_trans_mpnet["start_time"]
trans_end_id_mpnet = data_trans_mpnet["end_id"]
trans_end_time_mpnet = data_trans_mpnet["end_time"]
trans_text_mpnet = data_trans_mpnet["text"]
trans_embeddings_mpnet = data_trans_mpnet["embeddings_sbert_mpnet"]

## BERTopic

In [None]:
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

In [None]:
# Dimensionality reduction step of the topic model.
# UMAP is stochastic: without a fixed random_state every run yields a different
# low-dimensional layout, hence different HDBSCAN clusters and different topics.
# Seeding it makes the notebook reproducible under "Restart & Run All"
# (trade-off: UMAP runs single-threaded when a seed is set).
umap_model = UMAP(
    n_neighbors=16,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

# Density-based clustering applied to the reduced embeddings.
hdbscan_model = HDBSCAN(min_cluster_size=8, metric="euclidean", cluster_selection_method="eom")

# Class-based TF-IDF with BM25 weighting for the topic word representation.
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

topic_model = BERTopic(
    vectorizer_model=CountVectorizer(stop_words="english"),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    # Topic representation
    ctfidf_model=ctfidf_model,
)

# The precomputed sentence embeddings are passed in alongside the raw texts.
topics, probs = topic_model.fit_transform(
    documents=trans_text_mpnet,
    embeddings=trans_embeddings_mpnet,
)

topic_model.get_topic_info()

In [None]:
# 2-D projection of the segment embeddings for the document map; the same
# coordinates are stored later as the UMAP1/UMAP2 columns.
# UMAP is stochastic, so the seed is fixed to keep the plotted (and saved)
# coordinates identical across runs (trade-off: single-threaded UMAP).
reduced_embeddings = UMAP(
    n_neighbors=16,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
).fit_transform(trans_embeddings_mpnet)

topic_model.visualize_documents(trans_text_mpnet, reduced_embeddings=reduced_embeddings)

In [None]:
# Build one table with a row per transcript segment: segment metadata,
# the BERTopic assignment and the 2-D UMAP coordinates.
#
# pd.concat(axis=1) aligns rows on index labels, not on position. The UMAP
# frame below always gets a fresh 0..n-1 index, so every other piece is reset
# to a positional index too; otherwise a non-default index on the pickled
# columns would silently misalign rows and pad them with NaN.
# (Assumes the pickled columns are pandas Series/DataFrames, which pd.concat
# already requires.)
segment_columns = [
    column.reset_index(drop=True)
    for column in (
        trans_id_mpnet,
        trans_start_id_mpnet,
        trans_start_time_mpnet,
        trans_end_id_mpnet,
        trans_end_time_mpnet,
        trans_text_mpnet,
    )
]

# Per-document topic information, without the columns that duplicate the text,
# and with BERTopic's generic column names made explicit.
document_topics = (
    topic_model.get_document_info(trans_text_mpnet)
    .drop(columns=["Document", "Representative_document"])
    .rename(columns={
        "Topic": "bertopic_topic",
        "Name": "bertopic_topic_name",
        "Top_n_words": "bertopic_topic_topwords",
        "Probability": "bertopic_probability"
    })
    .reset_index(drop=True)
)

umap_coordinates = pd.DataFrame(reduced_embeddings, columns=["UMAP1", "UMAP2"])

# Positional concatenation is only meaningful if every piece has the same number of rows.
pieces = [*segment_columns, document_topics, umap_coordinates]
assert len({len(piece) for piece in pieces}) == 1, "segment columns, topics and UMAP coordinates differ in length"

data_with_topics = pd.concat(pieces, axis=1)

data_with_topics

## Join with museum data and save

In [None]:
# Museum-object table read from the storage directory.
# Output from 121_matching_museum_objects.py
# NOTE(review): the dependency list at the top of this notebook names
# 121_dim_reduction_test.py instead -- confirm which script writes this file.
file_top_museum_object = dir_storage + "---museum-objects-topics-file---"
top_museum_object = pd.read_csv(file_top_museum_object)

In [None]:
# Prepare the museum-object table for joining: drop the columns listed below
# and rename the key columns to the names used in data_with_topics.
museum_matches = top_museum_object.drop(
    columns=["trans_start_time", "trans_end_time", "trans_text"]
).rename(columns={
    "trans_id": "transcript",
    "trans_start_id": "start_id",
    "trans_end_id": "end_id",
    "similarity_mpnet": "museum_sim_mpnet"
})

# Inner join (the pandas default): only segments present in both tables are kept.
data_with_topics_and_museumobj = pd.merge(
    data_with_topics,
    museum_matches,
    how="inner",
    on=["transcript", "start_id", "end_id"],
)

data_with_topics_and_museumobj

In [None]:
# Write the combined segment / topic / museum-object table to the storage
# directory as CSV, without the DataFrame index.
file_topics_and_museumobj = dir_storage + "---transcripts-objects-topics-file---"
data_with_topics_and_museumobj.to_csv(file_topics_and_museumobj, index=False)