# Plain Vanilla BERTopic
https://maartengr.github.io/BERTopic/getting_started/quickstart/quickstart.html

## Setup

In [None]:
# Ignore NumbaDeprecationWarning.
# The filter is registered before the remaining imports so that it is already
# active while they load.
# NOTE(review): presumably the warnings are triggered by bertopic's
# dependencies at import time -- confirm.
import numba
import warnings
warnings.filterwarnings("ignore", category=numba.NumbaDeprecationWarning)

# Topic model used in the modelling section.
from bertopic import BERTopic
#from sklearn.datasets import fetch_20newsgroups

# csv: reading the input TSV and writing the probability export.
# re: stripping punctuation from the documents.
import csv
import re

# German stopword list used during cleaning.
from nltk.corpus import stopwords

# Clustering model handed to BERTopic.
from hdbscan import HDBSCAN

import pandas as pd

## Import and Clean Data

In [None]:
# Import Businesses TSV as a flat list of strings: every cell of every row
# becomes one document.
# The encoding is pinned so umlauts decode the same way on every platform
# instead of depending on the locale default.
# NOTE(review): assumes the file is UTF-8 -- confirm.
with open('all_businesses.tsv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    docs = [cell for row in reader for cell in row]

# Normalise each document in one pass, in the same order as before:
# non-breaking space -> regular space, strip punctuation, lowercase.
punctuation_pattern = re.compile(r'[^\w\s]')
docs = [punctuation_pattern.sub('', doc.replace('\xa0', ' ')).lower() for doc in docs]

# German stopwords shipped with NLTK
german_stop_words = stopwords.words('german')

# Custom stopwords, one per line. Entries are stripped and lower-cased so they
# can match the lower-cased documents; blank lines are skipped.
with open('../../data/custom_stopwords.txt', 'r', encoding='utf-8') as f:
    custom_stopwords = [line.strip().lower() for line in f if line.strip()]

# Remove stopwords from docs. Both lists are merged into one set so each word
# is checked once with an O(1) lookup; the two list variables are kept as
# lists in case other cells use them.
all_stopwords = set(german_stop_words) | set(custom_stopwords)
docs = [' '.join(word for word in doc.split() if word not in all_stopwords) for doc in docs]

# Drop "na" placeholders and documents that are empty after cleaning
# (empty TSV cells or documents consisting only of stopwords).
docs = [doc for doc in docs if doc and doc != "na"]

In [None]:
# Inspect data

# Peek at the first two cleaned documents
preview = docs[:2]
print(preview)

# Report how many documents are left after cleaning.
# NOTE(review): an earlier inline note quoted 18846 here; that figure may be a
# leftover from a different dataset -- confirm against the actual output.
doc_count = len(docs)
print(doc_count)

## Modelling

In [None]:
# Clustering model passed to BERTopic.
# Tuning notes: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#hdbscan
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', prediction_data=True)

# BERTopic configured with a multilingual sentence-embedding model.
# Parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#bertopic
# Embedding models: https://www.sbert.net/docs/pretrained_models.html
# NOTE(review): min_topic_size and language may have no effect when a custom
# hdbscan_model / embedding_model is supplied -- confirm in the BERTopic docs.
topic_model = BERTopic(
    embedding_model="distiluse-base-multilingual-cased-v1",
    hdbscan_model=hdbscan_model,
    language="multilingual",
    min_topic_size=2,
    n_gram_range=(1, 2),
    top_n_words=20,
    verbose=True,
    # calculate_probabilities=True,  # turn on later again to calc probs
)

# Fit on the cleaned documents; keeps the per-document topic assignments and
# the probabilities returned alongside them.
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Number of topics.
# get_topic_info() returns one row per topic id; per the BERTopic docs, id -1
# is the outlier bucket rather than a real topic, so it is left out of the
# count instead of using the raw row count.
topic_info = topic_model.get_topic_info()
num_topics = int((topic_info["Topic"] != -1).sum())
print(f"There are {num_topics} topics.")

## Print Results

In [None]:
# Label line first, so the table that follows is identifiable in the output
print("topic_model.get_topic_info()")
topic_overview = topic_model.get_topic_info()
print(topic_overview)

In [None]:
# Label line first, then whatever get_topic returns for topic id 0
print("topic_model.get_topic(0)")
first_topic = topic_model.get_topic(0)
print(first_topic)

In [None]:
# Extract information on a document level for the documents the model was
# fitted on.
doc_level_info = topic_model.get_document_info(docs)

# Save the document-level table to CSV, without the index column
doc_level_info.to_csv(path_or_buf='doc_level_info.csv', index=False)


In [None]:
# Select the "Representation" column of the document-level table.
# NOTE: despite the variable name, no de-duplication is applied here; the
# column is taken as-is.
unique_values_representation = doc_level_info["Representation"]

# Last expression of the cell: shows the type of the selected object
type(unique_values_representation)



## Topic Distribution

In [None]:
#topic_distr, _ = topic_model.approximate_distribution(docs)
#print(topic_distr)

# print dimension of probs
#print(probs.shape)

In [None]:
# Export topic_distribution as CSV.
# Per the BERTopic docs, fit_transform returns one probability per document
# (a 1-D array) unless calculate_probabilities=True, in which case it returns
# one row of probabilities per document (2-D). csv.writer.writerows requires
# every row to be iterable, so scalar entries of a 1-D result are wrapped into
# single-cell rows; anything else is written as before.
with open('topic_distribution_test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    if getattr(probs, "ndim", 2) == 1:
        writer.writerows([p] for p in probs)
    else:
        writer.writerows(probs)

## Hierarchical Topic Modeling

In [None]:
# Compute the topic hierarchy from the fitted model and the documents
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Turn the hierarchy into a tree representation and print it
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)