## Setup

In [None]:
# ignore NumbaDeprecationWarning
import numba
import warnings
warnings.filterwarnings("ignore", category=numba.NumbaDeprecationWarning)

from bertopic import BERTopic
#from sklearn.datasets import fetch_20newsgroups

import csv
import re

from nltk.corpus import stopwords

## Import and Clean Data

In [None]:
# Import Businesses TSV as list of strings: every cell of every row becomes one
# document, with non-breaking spaces (\xa0) replaced by regular spaces.
# NOTE(review): neither open() call passes an explicit encoding, so both files
# are decoded with the platform default - confirm they are UTF-8 and pin
# encoding="utf-8" if so.
with open('initial_situation.tsv', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    docs = [item.replace('\xa0', ' ') for sublist in reader for item in sublist]

# Remove punctuation (every character that is neither a word character nor whitespace)
docs = [re.sub(r'[^\w\s]', '', doc) for doc in docs]

# NLTK's German stopword list
german_stop_words = stopwords.words('german')

# Import custom stopwords file as list of strings (one entry per line)
with open('../../data/custom_stopwords.txt', 'r') as f:
    custom_stopwords = f.readlines()

# Remove whitespace characters like `\n` at the end of each line and lower-case
# the entries: documents are lower-cased before the comparison below, so an
# entry containing upper-case letters could otherwise never match.
custom_stopwords = [x.strip().lower() for x in custom_stopwords]

# Remove stopwords from docs in a single pass. A set gives constant-time
# membership tests, and filtering against the union of both lists yields the
# same result as filtering against each list in turn.
all_stop_words = set(german_stop_words) | set(custom_stopwords)
docs = [' '.join(word for word in doc.lower().split() if word not in all_stop_words) for doc in docs]

# Remove "na" placeholders and documents that are empty after cleaning
# (empty TSV cells, or cells that consisted only of punctuation/stopwords).
docs = [doc for doc in docs if doc and doc != "na"]

In [None]:
# Inspect Data

# print head of docs
#print(docs[:2])

# print size of docs
#print(len(docs)) # 600 in the recorded run (the topic counts in the output below sum to 600)

## Modelling

In [9]:
# BERTopic German model: min_topic_size=25 sets the minimum size a topic may
# have; verbose=True makes BERTopic log each pipeline stage while fitting.
topic_model = BERTopic(
    language="german",
    min_topic_size=25,
    verbose=True,
)

# Fit on the cleaned documents and transform them in one step; the call
# returns the per-document topic assignments and the associated probabilities.
topics, probs = topic_model.fit_transform(docs)

Batches: 100%|██████████| 19/19 [00:57<00:00,  3.03s/it]
2023-07-25 13:45:13,033 - BERTopic - Transformed documents to Embeddings
2023-07-25 13:45:15,985 - BERTopic - Reduced dimensionality
2023-07-25 13:45:16,008 - BERTopic - Clustered reduced embeddings


In [None]:
#topic_model.visualize_topics()

In [11]:
# Print outputs

# Topic overview (one row per topic id). Computed once and reused for the
# topic count at the end of the cell.
topic_info = topic_model.get_topic_info()
print("topic_model.get_topic_info()")
print(topic_info)
print()

# Top terms of topic 0
print("topic_model.get_topic(0)")
print(topic_model.get_topic(0))
print()

# extract information on a document level
print("topic_model.get_document_info(docs)")
print(topic_model.get_document_info(docs))
print()

# Number of topics. BERTopic reports unassigned (outlier) documents under
# topic id -1; that row is not a real topic, so it is excluded from the count.
num_topics = int((topic_info["Topic"] != -1).sum())
print(f"There are {num_topics} topics.")

topic_model.get_topic_info()
   Topic  Count                                               Name  \
0     -1    270          -1_schweiz_botschaft_abkommen_bundesrates   
1      0     99       0_bundesrates_entwurf_änderung_nationalrates   
2      1     77  1_versicherten_franken_massnahmen_krankenversi...   
3      2     63              2_schweiz_schweizer_botschaft_franken   
4      3     47         3_landwirtschaft_massnahmen_umwelt_schweiz   
5      4     44                  4_schweiz_eu_botschaft_massnahmen   

                                      Representation  \
0  [schweiz, botschaft, abkommen, bundesrates, wu...   
1  [bundesrates, entwurf, änderung, nationalrates...   
2  [versicherten, franken, massnahmen, krankenver...   
3  [schweiz, schweizer, botschaft, franken, inter...   
4  [landwirtschaft, massnahmen, umwelt, schweiz, ...   
5  [schweiz, eu, botschaft, massnahmen, europäisc...   

                                 Representative_Docs  
0  [bundesrates 28102015 verabsc

## Topic Distribution

In [None]:
# Get Topic Distribution
# NOTE(review): this rebinds `topic_model` to a newly fitted model with
# min_topic_size=10, replacing the min_topic_size=25 model fitted earlier in
# the notebook; every later cell that uses `topic_model` will see this one.
# Confirm that the refit is intended.
topic_model = BERTopic(
    language="german",
    min_topic_size=10,
).fit(docs)

# approximate_distribution returns two values; only the first (the
# per-document topic distribution) is kept, the second is discarded.
topic_distr, _ = topic_model.approximate_distribution(docs)
print(topic_distr)

In [None]:
# Export topic_distribution as CSV: each element of topic_distr is written as
# one row of 'topic_distribution.csv' (no header row is written).
with open('topic_distribution.csv', 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(topic_distr)

## Hierarchical Topic Modeling

In [12]:
# Build the topic hierarchy from the documents using whichever model
# `topic_model` currently refers to.
# NOTE(review): the recorded tree below lists topics 0-4, which matches the
# min_topic_size=25 run - confirm which model this cell is meant to use.
hierarchical_topics = topic_model.hierarchical_topics(docs)
# Render the hierarchy as a text tree and print it
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

100%|██████████| 4/4 [00:00<00:00, 123.69it/s]

.
├─bundesrates_massnahmen_änderung_regelung_entwurf
│    ├─■──versicherten_franken_massnahmen_krankenversicherung_leistungen ── Topic: 1
│    └─■──bundesrates_entwurf_änderung_nationalrates_bundesgericht ── Topic: 0
└─schweiz_botschaft_massnahmen_eu_franken
     ├─schweiz_eu_botschaft_franken_schweizer
     │    ├─■──schweiz_eu_botschaft_massnahmen_europäischen ── Topic: 4
     │    └─■──schweiz_schweizer_botschaft_franken_internationalen ── Topic: 2
     └─■──landwirtschaft_massnahmen_umwelt_schweiz_energie ── Topic: 3




