# Setup
Reference: [BERTopic visualization guide](https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-topics)

In [None]:
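# Uncomment to install the dependencies into the kernel's environment.
# The version specifier is quoted so the shell does not treat ">" as a redirect.
# %pip install bertopic
# %pip install "nbformat>=4.2.0"
# %pip install tf-keras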

In [None]:
import glob
from bertopic import BERTopic
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from miniArkivet import miniArkivet
# Alternative argument (an absolute local path), currently disabled:
#df = miniArkivet(r"C:\Users\frauker\OneDrive - Chalmers\Frauke\Projects\Agnotology of medical AI\A-Media\code\2024-07-07BERTopic_testrun")
# Load the corpus through the project-local `miniArkivet` helper.
# Presumably this returns a pandas DataFrame; later cells read its `text` and
# `date` columns -- TODO confirm against the helper.
df = miniArkivet(r"dummy")
# Bare last expression: show the loaded object as the cell output.
df

# Data clean up

In [None]:
# !pip install amphi-etl 
# TODO: insert the Amphi ETL pipeline here
# import re
# Optional data cleanup here
#df.text = df.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
#df.text = df.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1)
#df.text = df.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1)

# Train

In [None]:
import itertools

def all_options(random_state=42):
    """Build the candidate components for each BERTopic pipeline step.

    Every call constructs new objects (including loading both sentence
    embedding models), so the returned components carry no fitted state from
    earlier calls.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed passed to both UMAP candidates so repeated runs produce the same
        reduced embeddings. Pass ``None`` to leave UMAP unseeded.

    Returns
    -------
    tuple
        ``(options, options_name)``: two parallel lists with one inner list
        per pipeline step, in the order embedding, UMAP, HDBSCAN, vectorizer,
        c-TF-IDF, representation. ``options`` holds the objects and
        ``options_name`` holds the matching labels.
    """
    # Step 1 - Extract embeddings
    # KBLab/sentence-bert-swedish-cased
    # all-MiniLM-L6-v2 : Default
    # paraphrase-multilingual-MiniLM-L12-v2
    embedding_model_0 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    embedding_model_1 = SentenceTransformer("KBLab/sentence-bert-swedish-cased")

    # Step 2 - Reduce dimensionality, n_components default value was 5
    # random_state is set so the stochastic UMAP step is repeatable between runs.
    umap_model_0 = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',
                        random_state=random_state)
    umap_model_1 = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine',
                        random_state=random_state)

    # Step 3 - Cluster reduced embeddings
    hdbscan_model_0 = HDBSCAN(min_cluster_size=15, metric='euclidean',
                              cluster_selection_method='eom', prediction_data=True)
    hdbscan_model_1 = HDBSCAN(min_cluster_size=30, metric='euclidean',
                              cluster_selection_method='eom', prediction_data=True)

    # Step 4 - Tokenize topics
    # Swedish stop words removed from the topic vocabulary. Entries are kept
    # exactly as in the original list (including unusual spellings).
    SEstopwords = [
        "aderton", "adertonde", "adjö", "aldrig", "alla", "allas", "allt", "alltid", "alltså",
        "andra", "andras", "annan", "annat", "artonde", "artonn", "att", "av",
        "bakom", "bara", "behöva", "behövas", "behövde", "behövt", "beslut", "beslutat", "beslutit",
        "bland", "blev", "bli", "blir", "blivit", "bort", "borta", "bra", "bäst", "bättre", "båda", "bådas",
        "dag", "dagar", "dagarna", "dagen", "de", "del", "delen", "dem", "den", "denna", "deras", "dess",
        "dessa", "det", "detta", "dig", "din", "dina", "dit", "ditt", "dock", "dom", "du", "där", "därför", "då",
        "e", "efter", "eftersom", "ej", "elfte", "eller", "elva", "emot", "en", "enkel", "enkelt", "enkla",
        "enligt", "ens", "er", "era", "ers", "ert", "ett", "ettusen",
        "fanns", "fem", "femte", "femtio", "femtionde", "femton", "femtonde", "fick", "fin", "finnas", "finns",
        "fjorton", "fjortonde", "fjärde", "fler", "flera", "flesta", "fram", "framför", "från", "fyra",
        "fyrtio", "fyrtionde", "få", "får", "fått", "följande", "för", "före", "förlåt", "förra", "första",
        "genast", "genom", "gick", "gjorde", "gjort", "god", "goda", "godare", "godast", "gott", "gälla",
        "gäller", "gällt", "gärna", "gå", "går", "gått", "gör", "göra",
        "ha", "hade", "haft", "han", "hans", "har", "heller", "hellre", "helst", "helt", "henne", "hennes",
        "hit", "hon", "honom", "hundra", "hundraen", "hundraett", "hur", "här", "hög", "höger", "högre", "högst",
        "i", "ibland", "icke", "idag", "igen", "igår", "imorgon", "in", "inför", "inga", "ingen", "ingenting",
        "inget", "innan", "inne", "inom", "inte", "inuti",
        "ja", "jag", "jo", "ju", "just", "jämfört",
        "kan", "kanske", "knappast", "kom", "komma", "kommer", "kommit", "kr", "kunde", "kunna", "kunnat", "kvar",
        "legat", "ligga", "ligger", "lika", "likställd", "likställda", "lilla", "lite", "liten", "litet",
        "länge", "längre", "längst", "lätt", "lättare", "lättast", "långsam", "långsammare", "långsammast",
        "långsamt", "långt", "låt",
        "man", "med", "mej", "mellan", "men", "mer", "mera", "mest", "mig", "min", "mina", "mindre", "minst",
        "mitt", "mittemot", "mot", "mycket", "många", "måste", "möjlig", "möjligen", "möjligt", "möjligtvis",
        "ned", "nederst", "nedersta", "nedre", "nej", "ner", "ni", "nio", "nionde", "nittio", "nittionde",
        "nitton", "nittonde", "nog", "noll", "nr", "nu", "nummer", "när", "nästa", "någon", "någonting",
        "något", "några", "nån", "nånting", "nåt", "nödvändig", "nödvändiga", "nödvändigt", "nödvändigtvis",
        "och", "också", "ofta", "oftast", "olika", "olikt", "om", "oss",
        "på",
        "rakt", "redan", "rätt",
        "sa", "sade", "sagt", "samma", "sedan", "senare", "senast", "sent", "sex", "sextio", "sextionde",
        "sexton", "sextonde", "sig", "sin", "sina", "sist", "sista", "siste", "sitt", "sitta", "sju", "sjunde",
        "sjuttio", "sjuttionde", "sjutton", "sjuttonde", "själv", "sjätte", "ska", "skall", "skulle",
        "slutligen", "små", "smått", "snart", "som", "stor", "stora", "stort", "större", "störst", "säga",
        "säger", "sämre", "sämst", "så", "sådan", "sådana", "sådant",
        "ta", "tack", "tar", "tidig", "tidigare", "tidigast", "tidigt", "till", "tills", "tillsammans", "tio",
        "tionde", "tjugo", "tjugoen", "tjugoett", "tjugonde", "tjugotre", "tjugotvå", "tjungo", "tolfte",
        "tolv", "tre", "tredje", "trettio", "trettionde", "tretton", "trettonde", "två", "tvåhundra",
        "under", "upp", "ur", "ursäkt", "ut", "utan", "utanför", "ute",
        "va", "vad", "var", "vara", "varför", "varifrån", "varit", "varje", "varken", "vars", "varsågod",
        "vart", "vem", "vems", "verkligen", "vi", "vid", "vidare", "viktig", "viktigare", "viktigast",
        "viktigt", "vilka", "vilkas", "vilken", "vilket", "vill", "väl", "vänster", "vänstra", "värre",
        "vår", "våra", "vårt",
        "än", "ännu", "är", "även",
        "åt", "åtminstone", "åtta", "åttio", "åttionde", "åttonde",
        "över", "övermorgon", "överst", "övre",
    ]
    vectorizer_model_0 = CountVectorizer(stop_words=SEstopwords)

    # Step 5 - Create topic representation
    ctfidf_model_0 = ClassTfidfTransformer()

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model
    representation_model_0 = KeyBERTInspired()

    # The two structures must stay parallel: options_name[i][j] labels options[i][j].
    options = [
        [embedding_model_0, embedding_model_1],
        [umap_model_0, umap_model_1],
        [hdbscan_model_0, hdbscan_model_1],
        [vectorizer_model_0],
        [ctfidf_model_0],
        [representation_model_0],
    ]
    options_name = [
        ["embedding_model_0", "embedding_model_1"],
        ["umap_model_0", "umap_model_1"],
        ["hdbscan_model_0", "hdbscan_model_1"],
        ["vectorizer_model_0"],
        ["ctfidf_model_0"],
        ["representation_model_0"],
    ]
    return (options, options_name)

# Grid search over every combination of pipeline components: train one BERTopic
# model per combination and save it to "<labels>.pickle" in the working directory.

# Build the options once to learn how many alternatives each step offers.
options, options_name = all_options()
index_grid = itertools.product(*(range(len(names)) for names in options_name))

for choice in index_grid:
    # Rebuild all components and select this combination from the NEW lists.
    # Iterating itertools.product over the first `options` would hand the same
    # UMAP/HDBSCAN/vectorizer instances to several BERTopic models, because the
    # product keeps references to the objects it was created with.
    # Note: this reloads both embedding models on every iteration.
    options, options_name = all_options()
    option = tuple(group[i] for group, i in zip(options, choice))
    option_name = tuple(names[i] for names, i in zip(options_name, choice))
    embedding_model, umap_model, hdbscan_model, vectorizer_model, ctfidf_model, representation_model = option

    # Every label is followed by '-' (including the last one); kept as-is so
    # file names match models saved by earlier runs.
    option_string = ''.join([element + '-' for element in option_name])
    print("Training", option_string, "a model for option:",  option_name)
    topic_model = BERTopic(
      embedding_model=embedding_model,          # Step 1 - Extract embeddings
      umap_model=umap_model,                    # Step 2 - Reduce dimensionality
      hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
      vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
      ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
      representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
    )
    # Pass a plain list of documents, as the topics-over-time cell does.
    topics, probs = topic_model.fit_transform(df.text.to_list())
    topic_model.save(option_string + ".pickle", serialization="pickle")

# Load model from file without training

In [None]:
# Notebook-level placeholders assigned by the picker callback `f` below:
# the loaded BERTopic model and the file path it was loaded from.
# Both stay None until a model file is chosen in the dropdown.
topic_model_load = None
topic_model_name = None
def f(x):
    """Dropdown callback: load the BERTopic model stored at path ``x``.

    An empty string (the blank dropdown entry) is ignored. Otherwise the model
    is loaded into the notebook-level ``topic_model_load`` and the path is
    recorded in ``topic_model_name``.

    NOTE: loading a pickle can execute arbitrary code; only pick files that
    were produced by this notebook.
    """
    global topic_model_load, topic_model_name
    if x == '':
        return
    topic_model_load = BERTopic.load(x)
    print("loaded ", x)
    topic_model_name = x
# Offer every saved model in the working directory, plus a blank "nothing selected" entry.
# glob returns matches in arbitrary order, so sort for a stable dropdown.
trained_models = sorted(glob.glob('./*.pickle'))
interact(f, x=[''] + trained_models);

In [None]:
# Topic overview table of the loaded model; show the first ten rows.
freq = topic_model_load.get_topic_info()
freq.head(10)

In [None]:
# Inspect the representation of topic 4. The id is hard-coded; change it to
# look at another topic of the currently loaded model.
topic_model_load.get_topic(4)

In [None]:
# Keep the topic-map figure in `fig` and display it as the cell output.
fig = topic_model_load.visualize_topics()
fig

In [None]:
# Plot the documents of the loaded model. The documents are passed as a plain
# list (as in the topics-over-time cell) so positional lookups inside the
# plotting code do not depend on the DataFrame's index labels.
topic_model_load.visualize_documents(df.text.to_list())

In [None]:
# Display the topic hierarchy figure for the currently loaded model.
topic_model_load.visualize_hierarchy()

In [None]:
# Display the topic heatmap figure for the currently loaded model.
topic_model_load.visualize_heatmap()

In [None]:
# Display the term-rank figure for the currently loaded model.
topic_model_load.visualize_term_rank()

In [None]:
# Display the per-topic bar chart figure for the currently loaded model.
topic_model_load.visualize_barchart()

In [None]:
# Documents and their dates as plain lists, taken from the same rows of `df`.
texts = df.text.to_list()
timestamps = df.date.to_list()

# Topic representations per time bin (20 bins) for the loaded model.
topics_over_time = topic_model_load.topics_over_time(texts, timestamps, nr_bins=20)

# Plot only topics 1 through 10.
topic_model_load.visualize_topics_over_time(topics_over_time, topics=list(range(1, 11)))

In [None]:
# Table of all documents with the topic assigned to each one.
# Global pandas display settings: show every column and do not wrap wide frames.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)

# Columns left out of the per-document table.
dropped_columns = ['Representation', 'Representative_Docs', 'Top_n_words', 'Representative_document']
result = topic_model_load.get_document_info(df.text).drop(columns=dropped_columns)
result

In [None]:
# Export the per-document table. The file name is the selected model's path
# with 'topic-docs.csv' appended, so a model must have been picked in the
# dropdown first.
csv_path = topic_model_name + 'topic-docs.csv'
result.to_csv(csv_path, index=False)