# BERTopic Sandbox
https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html

## Semi-supervised Topic Modeling
https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html

In [None]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))
docs = data["data"]
categories = data["target"]
category_names = data["target_names"]


## Supervised Topic Modeling
https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Get labeled data
data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))
docs = data['data']
y = data['target']

In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression


In [None]:

# Get labeled data
data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))
docs = data['data']
y = data['target']


In [None]:

# Skip over dimensionality reduction, replace cluster model with classifier,
# and reduce frequent words while we are at it.
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)


In [None]:

# Create a fully supervised BERTopic instance
topic_model= BERTopic(
        umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
        ctfidf_model=ctfidf_model
)
topics, probs = topic_model.fit_transform(docs, y=y)


## Guided Topic Modeling
https://maartengr.github.io/BERTopic/getting_started/guided/guided.html

In [1]:
from bertopic import BERTopic
#from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import csv
import re
from nltk.corpus import stopwords

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:

#docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))["data"]
#docs = fetch_20newsgroups(subset='test', categories=['sci.med', 'sci.space', 'comp.windows.x'], remove=('headers', 'footers', 'quotes'))["data"]

df = pd.read_csv('all_businesses_04.tsv', sep='\t')
docs = df["all"]

# Remove punctuation
docs = [re.sub(r'[^\w\s]', '', doc) for doc in docs]

# Lowercase
docs = [doc.lower() for doc in docs]

# Remove stopwords
german_stop_words = stopwords.words('german')

# Import custom stopwords file as list of strings
with open('../../data/custom_stopwords.txt', 'r') as f:
   custom_stopwords = f.readlines()

# remove whitespace characters like `\n` at the end of each line
custom_stopwords = [x.strip() for x in custom_stopwords]

# remove stopwords from docs
docs = [' '.join(word for word in doc.lower().split() if word not in german_stop_words) for doc in docs]
docs = [' '.join(word for word in doc.lower().split() if word not in custom_stopwords) for doc in docs]

# remove "na" from docs
docs = [doc for doc in docs if doc != "na"]

In [3]:

#seed_topic_list = [["steuer", "ausland", "hilfe"],
#                   ["covid-19", "ausgleich", "franken"]]

seed_topic_list = [["konjunkturpolitik"],
                   ["wettbewerbspolitik"],
                   ["strukturpolitik"],
                   ["preispolitik"],
                   ["konsumentenschutz"],
                   ["gesellschaftsrecht"],
]

topic_model = BERTopic(seed_topic_list = seed_topic_list, verbose = True)
topics, probs = topic_model.fit_transform(docs)


Batches: 100%|██████████| 19/19 [01:23<00:00,  4.41s/it]
2023-08-15 15:40:32,787 - BERTopic - Transformed documents to Embeddings
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.14it/s]
2023-08-15 15:40:43,105 - BERTopic - Reduced dimensionality
2023-08-15 15:40:43,145 - BERTopic - Clustered reduced embeddings


In [4]:
# Number of topics
topic_info = topic_model.get_topic_info()
num_topics = topic_info.shape[0]
print(f"There are {num_topics} topics.")

There are 18 topics.


In [5]:
print("topic_model.get_topic_info()")
print(topic_model.get_topic_info())

topic_model.get_topic_info()
    Topic  Count                                               Name  \
0      -1    265                    -1_schweiz_botschaft_10_kantone   
1       0     48  0_bundesgericht_richter_richterinnen_bundesgesetz   
2       1     32                1_schweiz_europischen_eu_europische   
3       2     25  2_informationsaustausch_doppelbesteuerung_divi...   
4       3     25  3_landwirtschaft_agrarpolitik_umwelt_direktzah...   
5       4     22              4_bibliomedia_banken_schweiz_stiftung   
6       5     20      5_verkehr_ffentlichen_finanzierung_bahnreform   
7       6     20          6_ahvnummer_ministerinnen_minister_urlaub   
8       7     19               7_armee_zellen_organen_mindeststrafe   
9       8     17                 8_pflege_kvg_gesundheit_ambulanten   
10      9     14  9_vertrge_unterdeckung_anlagestrategie_leibrenten   
11     10     14  10_2050_treibhausgasemissionen_co2gesetzes_gle...   
12     11     12          11_daten_datenschutz_e

In [6]:
doc_level_info = topic_model.get_document_info(docs)

In [None]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset='test',  remove=('headers', 'footers', 'quotes'))["data"]

seed_topic_list = [["drug", "cancer", "drugs", "doctor"],
                   ["windows", "drive", "dos", "file"],
                   ["space", "launch", "orbit", "lunar"]]

topic_model = BERTopic(seed_topic_list=seed_topic_list, verbose = True)
topics, probs = topic_model.fit_transform(docs)


In [None]:
import numpy as np
print(np.__version__) # 1.24.4


In [None]:
pip install numpy==1.23.5