# Semi-supervised BERTopic Modeling with Probability Calculation
https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html

## Setup

In [None]:
import numba
import warnings
warnings.filterwarnings("ignore", category=numba.NumbaDeprecationWarning)

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

import csv
import re

from nltk.corpus import stopwords

from hdbscan import HDBSCAN

import pandas as pd

import numpy as np

## Import and Clean Data

In [None]:
df = pd.read_csv('data/all_businesses.tsv', sep='\t', dtype={'BusinessShortNumber': str})


# Split text and TagNames
docs = df[["BusinessShortNumber", "all"]]
categories = df["TagNames"]

# Convert categories to numbers
categories = pd.factorize(categories)[0]

In [None]:
# Remove punctuation
#docs = [re.sub(r'[^\w\s]', '', doc) for doc in docs]
docs['all'] = docs['all'].apply(lambda doc: re.sub(r'[^\w\s]', '', doc))


# Lowercase
#docs = [doc.lower() for doc in docs]
docs['all'] = docs['all'].apply(lambda doc: doc.lower())


# Remove stopwords
german_stop_words = stopwords.words('german')

# Import custom stopwords file as list of strings
with open('../../data/custom_stopwords.txt', 'r') as f:
   custom_stopwords = f.readlines()

# remove whitespace characters like `\n` at the end of each line
custom_stopwords = [x.strip() for x in custom_stopwords]

# remove stopwords from docs
#docs = [' '.join(word for word in doc.lower().split() if word not in german_stop_words) for doc in docs]
#docs = [' '.join(word for word in doc.lower().split() if word not in custom_stopwords) for doc in docs]
docs['all'] = docs['all'].apply(lambda doc: ' '.join(word for word in doc.split() if word not in german_stop_words))
docs['all'] = docs['all'].apply(lambda doc: ' '.join(word for word in doc.split() if word not in custom_stopwords))

# remove "na" from docs
#docs = [doc for doc in docs if doc != "na"]
docs = docs[docs['all'] != 'na']


## Modelling

In [None]:
# https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#hdbscan
hdbscan_model = HDBSCAN(
    min_cluster_size = 2,
    min_samples = 1,
    metric = 'euclidean',
    prediction_data = True)

# BERTopic German model
# Parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#bertopic
topic_model = BERTopic(
    language = "multilingual",
    min_topic_size = 2,
    verbose = True,
    top_n_words = 10,
    n_gram_range = (1, 3),
    hdbscan_model = hdbscan_model,
    calculate_probabilities = True,
     # https://www.sbert.net/docs/pretrained_models.html
    embedding_model = "paraphrase-multilingual-mpnet-base-v2").fit(docs, y = categories) # perform supervised topic modeling, we simply use all categories

topics, probs = topic_model.fit_transform(docs)

In [None]:
# Get Model Information
doc_info = topic_model.get_document_info(docs)
topic_info = topic_model.get_topic_info()

In [16]:
# Merge model output with docs in order to preserve BusinessShortNumber
doc_info = pd.merge(doc_info, docs, left_on='Document', right_on='all', how='left')

# Add BusinessShortNumber to probs for export
probs_df = pd.DataFrame(probs)
probs_df["BusinessShortNumber"] = doc_info["BusinessShortNumber"]


In [12]:
# Number of topics
num_topics = topic_info.shape[0]
print(f"There are {num_topics} topics.")

There are 147 topics.


## Save and Load Model

In [17]:
# Save model
topic_model.save("BERT_data/temp/topic_model", serialization="safetensors", save_ctfidf=True)

# Save topics and probs to file
np.save('BERT_data/temp/probs.npy', probs)
np.save('BERT_data/temp/topics.npy', topics)

# Save ndarray as csv file
probs_df.to_csv('BERT_data/temp/probs.csv', index=False)

# Save document level information to csv
doc_info.to_csv('BERT_data/temp/doc_info.csv', index=False)

# Save topic information to csv
topic_info.to_csv('BERT_data/temp/topic_info.csv', index=False)

In [None]:
# Load model
topic_model = BERTopic.load("BERT_data/v230820/topic_model")

# Load topics and probs
probs = np.load('BERT_data/v230820/probs.npy', allow_pickle=True)
topics = np.load('BERT_data/v230820/topics.npy', allow_pickle=True)
topics = topics.tolist()