# Semi-supervised BERTopic Modeling
https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html

## Setup

In [None]:
# Silence numba's deprecation warnings so they do not clutter the notebook output.
# NOTE(review): presumably registered ahead of the bertopic import further down
# so that warnings raised while importing it are suppressed as well - confirm.
import numba
import warnings
warnings.filterwarnings("ignore", category=numba.NumbaDeprecationWarning)

from bertopic import BERTopic
#from sklearn.datasets import fetch_20newsgroups

import csv
import re

from nltk.corpus import stopwords

from hdbscan import HDBSCAN

import pandas as pd

## Import and Clean Data

In [None]:
# Load the tab-separated business export.
df = pd.read_csv('all_businesses_04.tsv', sep='\t')

# Document text lives in the "all" column.
docs = df["all"]

# Labels: encode every distinct "TagNames" value as an integer code.
# factorize returns (codes, uniques); only the codes are kept.
# By pandas' default, missing tags are encoded as -1.
categories = pd.factorize(df["TagNames"])[0]

In [None]:
# Text normalisation: strip punctuation, lowercase, drop stopwords, then
# discard placeholder documents while keeping the labels aligned with them.

# Remove every character that is neither a word character nor whitespace,
# and lowercase in the same pass.
docs = [re.sub(r'[^\w\s]', '', doc).lower() for doc in docs]

# NLTK's German stopword list
german_stop_words = stopwords.words('german')

# Project-specific stopwords, one per line. The encoding is given explicitly
# so that umlauts decode identically regardless of the platform default.
with open('../../data/custom_stopwords.txt', 'r', encoding='utf-8') as f:
    # strip whitespace characters like `\n` at the end of each line
    custom_stopwords = [line.strip() for line in f]

# Remove both stopword lists in a single pass; a set makes each membership
# test O(1) instead of scanning a list for every word.
all_stop_words = set(german_stop_words) | set(custom_stopwords)
docs = [
    ' '.join(word for word in doc.split() if word not in all_stop_words)
    for doc in docs
]

# Drop documents that consist solely of the placeholder "na". The same rows
# must be removed from `categories`, otherwise every label after the first
# dropped document would be paired with the wrong text.
if len(categories) != len(docs):
    raise ValueError(
        f"docs ({len(docs)}) and categories ({len(categories)}) differ in length"
    )
keep = [doc != "na" for doc in docs]
docs = [doc for doc, kept in zip(docs, keep) if kept]
categories = pd.Series(categories)[keep].to_numpy()

In [None]:
# Inspect data

# Preview the first two cleaned documents
print(docs[:2])

# Number of documents remaining after cleaning
print(len(docs))

## Modeling

In [None]:
# Clustering backend.
# https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#hdbscan
hdbscan_model = HDBSCAN(
    min_cluster_size=2,
    min_samples=1,
    metric='euclidean',
    prediction_data=True,
)

# BERTopic German model
# Parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#bertopic
# Embedding models: https://www.sbert.net/docs/pretrained_models.html
#
# Hyperparameter tuning - best results so far. An earlier configuration used
# top_n_words=20, n_gram_range=(1, 2) and the embedding model
# "distiluse-base-multilingual-cased-v1".
# (calculate_probabilities=True is left off for now; turn on later to calc probs.)
topic_model = BERTopic(
    language="multilingual",
    min_topic_size=2,
    verbose=True,
    top_n_words=10,
    n_gram_range=(1, 3),
    hdbscan_model=hdbscan_model,
    embedding_model="paraphrase-multilingual-mpnet-base-v2",
)

# Semi-supervised training: fit exactly once, passing the category codes as y.
# Previously the model was fitted with labels and then fitted a second time
# via fit_transform(docs) without them, which retrained from scratch and threw
# away the label guidance (and doubled the runtime).
topics, probs = topic_model.fit_transform(docs, y=categories)


In [10]:
# Reduce outliers: compute new topic assignments using the "embeddings" strategy.
new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings")

# reduce_outliers only returns the new assignments; write them back into the
# model so that the topic info and the document-level export reflect them
# instead of the original assignments.
topic_model.update_topics(docs, topics=new_topics)

In [None]:
# Number of topics = number of rows in the topic overview table.
# NOTE(review): if the table contains an outlier topic (-1), it is counted too - confirm intent.
topic_info = topic_model.get_topic_info()
num_topics = len(topic_info)
print(f"There are {num_topics} topics.")

In [None]:
# Export the per-document information table to CSV, without the index column.
doc_level_info = topic_model.get_document_info(docs)
doc_level_info.to_csv(path_or_buf='doc_level_info_ss.csv', index=False)