In [None]:
import re
import pandas as pd
from datetime import datetime
import pickle
import os

#BERTopic related imports
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from hdbscan import HDBSCAN


#sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

### Setting paths

In [None]:
# Switch to the project root (two levels above the notebook's folder) exactly once.
# The guard keeps this cell idempotent: re-running it in a live kernel would otherwise
# climb another two directories and silently point every path below at the wrong place.
if "project_root" not in globals():
    os.chdir("../..")
    project_root = os.path.abspath(os.curdir)

# Input folder holding the pre-processed files and output folder for BERTopic results.
data_path = os.path.join(project_root, 'corpus', 'preprocessed')
result_path = os.path.join(project_root, 'models', 'BERTopic')

### Loading pre-processed files from disk

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code. Only load files that were
# produced by this project's own preprocessing step.

# Time steps passed as `timestamps` to topics_over_time further down.
# (Presumably one entry per speech, in corpus order -- confirm against preprocessing.)
file_name = os.path.join(data_path,'electoralTerms', 'BERTopic_time_steps.pkl')
with open(file_name, 'rb') as pickle_file:
    time_steps = pickle.load(pickle_file)

# Pre-processed corpus: the documents that are embedded and clustered below.
# The two pickles were previously assigned to each other's variable, so the time
# steps were embedded and the corpus was used as timestamps.
file_name = os.path.join(data_path,'corpus', 'BERTopic_corpus_preprocessed.pkl')
with open(file_name, 'rb') as pickle_file:
    speeches = pickle.load(pickle_file)

# Custom stopword collection; it must support .update() (e.g. a set).
file_name = os.path.join(data_path,'stopwords', 'stopwords_custom.pkl')
with open(file_name, 'rb') as pickle_file:
    stopwords = pickle.load(pickle_file)

# Extra stopwords, one per line. The context manager closes the file handle,
# which the previous bare open() call left to the garbage collector.
file_name = os.path.join(data_path,'stopwords', 'additional_stopwords.txt')
with open(file_name) as stopword_file:
    additional_stopwords = [line.strip() for line in stopword_file]
stopwords.update(additional_stopwords)

### Instantiating the components of BERTopic

#### Instantiate vectorizer model and pass it the complete set of stopwords used for LDA and DTM

In [None]:
# scikit-learn (>= 1.2) validates `stop_words` as 'english', a list or None and rejects
# other containers such as a set, so hand the stopword collection over as a list.
# min_df=10 is CountVectorizer's minimum document frequency for a term to be kept.
vectorizer_model = CountVectorizer(stop_words=list(stopwords), min_df=10)

#### Instantiate SentenceTransformer with pre-trained multilingual model and create embeddings

In [None]:
# Load the multilingual paraphrase model used to embed the speeches.
sentence_model = SentenceTransformer(
    "paraphrase-multilingual-mpnet-base-v2"
)

# Embed the whole corpus once up front; the resulting vectors are passed to
# BERTopic.fit_transform below together with the documents.
embeddings = sentence_model.encode(
    speeches,
    show_progress_bar=True,
)

#### Instantiate UMAP Model with n_neighbors = 100 (default = 15) and random_state=41

In [None]:
# Dimensionality reduction step handed to BERTopic.
# random_state is fixed so repeated runs use the same seed.
umap_model = UMAP(
    n_neighbors=100,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=41,
)

#### Instantiate HDBSCAN model with min_cluster_size = 300 (BERTopic default min_topic_size = 10)

In [None]:
# Clustering step handed to BERTopic; a cluster needs at least 300 points.
hdbscan_model = HDBSCAN(
    min_cluster_size=300,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

### Training BERTopic

In [None]:
# Assemble BERTopic from the components instantiated in the cells above.
topic_model = BERTopic(
    # pipeline components
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # topic settings
    min_topic_size=300,
    nr_topics=31,
    top_n_words=25,
    # output / logging
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the speeches, passing in the embeddings computed earlier.
topics, probs = topic_model.fit_transform(
    documents=speeches,
    embeddings=embeddings,
)

### Modeling topics over time

In [None]:
# Compute the topic representations per time bin (19 bins) from the fitted model,
# with both global and evolutionary tuning switched on.
topics_over_time = topic_model.topics_over_time(
    docs=speeches,
    topics=topics,
    timestamps=time_steps,
    nr_bins=19,
    global_tuning=True,
    evolution_tuning=True,
)

### Saving trained BERTopic and topics over time to disk

In [None]:
# Create the output folder if it is missing, so the long training run above is not
# lost to a FileNotFoundError on a fresh checkout.
output_dir = os.path.join(result_path, 'model_results')
os.makedirs(output_dir, exist_ok=True)

# Persist the trained topic model.
file_name = os.path.join(output_dir, 'BERTopic')
topic_model.save(file_name)

# Persist the topics-over-time result as a pickle next to the model.
file_name = os.path.join(output_dir, 'topics_over_time.pkl')
with open(file_name, 'wb') as handle:
    pickle.dump(topics_over_time, handle)