# Semi-supervised BERTopic Modeling
https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html

## Setup

In [1]:
# ignore NumbaDeprecationWarning
import numba
import warnings
warnings.filterwarnings("ignore", category=numba.NumbaDeprecationWarning)

from bertopic import BERTopic
#from sklearn.datasets import fetch_20newsgroups

import csv
import re

from nltk.corpus import stopwords

from hdbscan import HDBSCAN

import pandas as pd

import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Import and Clean Data

In [2]:
df = pd.read_csv('all_businesses_04.tsv', sep='\t')

docs = df["all"]
categories = df["TagNames"]

# Convert categories to numbers
categories = pd.factorize(categories)[0]

In [3]:
# Remove punctuation
docs = [re.sub(r'[^\w\s]', '', doc) for doc in docs]

# Lowercase
docs = [doc.lower() for doc in docs]

# Remove stopwords
german_stop_words = stopwords.words('german')

# Import custom stopwords file as list of strings
with open('../../data/custom_stopwords.txt', 'r') as f:
   custom_stopwords = f.readlines()

# remove whitespace characters like `\n` at the end of each line
custom_stopwords = [x.strip() for x in custom_stopwords]

# remove stopwords from docs
docs = [' '.join(word for word in doc.lower().split() if word not in german_stop_words) for doc in docs]
docs = [' '.join(word for word in doc.lower().split() if word not in custom_stopwords) for doc in docs]

# remove "na" from docs
docs = [doc for doc in docs if doc != "na"]

In [4]:
# Insepct Data

# print head of docs
print(docs[:2])

# print size of docs
print(len(docs)) # 18846

['revision stiftungsrechtes 14 2000 reichte ständerat fritz schiesser rl gl parlamentarische form ausgearbeiteten entwurfs ziel geltende stiftungsrecht namentlich 80ff zivilgesetzbuchs zgb einschlägigen bestimmungen steuerrechts revidieren ziel gesetzentwurfs liberalisierung schweizerischen stiftungsrechts absicht stiftungsfreudigkeit erhöhen ständerat folgte antrag abgaben waks gab 8 20011 schiesser einstimmig revision stiftungsrecht beinhaltet namentlich revisionsbereiche einführung zweckänderungsvorbehalts einführung obligatorischen revisionsstelle erhöhung steuerlichen abzugsfähigkeit einfügung zweckänderungsvorbehalts errichtung stiftung zweckänderung einfacher erreicht interessen stifters lauf jahre verschieben neue bedürfnisse gesellschaft dringender stifter möchte veränderungen rechnung tragen zweckänderungsvorbehalt eröffnet möglichkeit zweckänderung ablauf beträchtlichen zeitspanne 10 jahre möglich bleiben interessen destinatäre gleichwohl geschützt kontrolle stiftungen erhöh

## Modelling

In [5]:
# https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#hdbscan
hdbscan_model = HDBSCAN(
    min_cluster_size = 2,
    min_samples = 1,
    metric = 'euclidean',
    prediction_data = True)


# BERTopic German model
# Parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#bertopic
# topic_model = BERTopic(
#     language = "multilingual",
#     min_topic_size = 2,
#     verbose = True,
#     top_n_words = 20,
#     n_gram_range = (1, 2),
#     #calculate_probabilities = True, # turn on later again to calc probs
#     hdbscan_model=hdbscan_model,
#      # https://www.sbert.net/docs/pretrained_models.html
#     embedding_model = "distiluse-base-multilingual-cased-v1").fit(docs, y = categories) # perform supervised topic modeling, we simply use all categories

# Hyperparameter tuning - best results so far
topic_model = BERTopic(
    language = "multilingual",
    min_topic_size = 2,
    verbose = True,
    top_n_words = 10,
    n_gram_range = (1, 3),
    hdbscan_model = hdbscan_model,
     # https://www.sbert.net/docs/pretrained_models.html
    embedding_model = "paraphrase-multilingual-mpnet-base-v2").fit(docs, y = categories) # perform supervised topic modeling, we simply use all categories

topics, probs = topic_model.fit_transform(docs)

Batches: 100%|██████████| 19/19 [02:55<00:00,  9.23s/it]
2023-08-17 16:44:37,198 - BERTopic - Transformed documents to Embeddings
2023-08-17 16:44:45,547 - BERTopic - Reduced dimensionality
2023-08-17 16:44:45,595 - BERTopic - Clustered reduced embeddings
Batches: 100%|██████████| 19/19 [02:56<00:00,  9.28s/it]
2023-08-17 16:47:44,442 - BERTopic - Transformed documents to Embeddings
2023-08-17 16:47:47,133 - BERTopic - Reduced dimensionality
2023-08-17 16:47:47,181 - BERTopic - Clustered reduced embeddings


In [6]:
# Save model
topic_model.save("data/topic_model", serialization="safetensors", save_ctfidf=True)

# Save topics and probs to file
np.save('data/probs.npy', probs)
np.save('data/topics.npy', topics)

In [None]:
# Load model
BERTopic.load("data/topic_model")

# Import topics and probs
probs = np.load('data/probs.npy', allow_pickle=True)
topics = np.load('data/topics.npy', allow_pickle=True)
topics = topics.tolist()

In [29]:
# Reduce outliers
new_topics = topic_model.reduce_outliers(docs, topics, strategy = "embeddings")
#new_topics2 = topic_model.reduce_outliers(docs, topics, strategy = "embeddings", threshold = 0.5)
#new_topics3 = topic_model.reduce_outliers(docs, topics, strategy = "distributions") 
#new_topics4 = topic_model.reduce_outliers(docs, topics, strategy = "c-tf-idf")

In [30]:
# Evaluate new topics
new_topics == new_topics4

#diff = [(i, j) for i, j in zip(new_topics, new_topics3) if i != j]
#print("Differences:", diff)

False

In [None]:
# Number of topics
topic_info = topic_model.get_topic_info()
num_topics = topic_info.shape[0]
print(f"There are {num_topics} topics.")

In [7]:
# save document level information to csv
doc_level_info = topic_model.get_document_info(docs)
doc_level_info.to_csv('doc_level_info_ss.csv', index=False)