In [None]:
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
import pickle

#BERTopic related imports
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from hdbscan import HDBSCAN

#gensim imports
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

#sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

#visual imports
import seaborn as sns
import colorcet as cc
import matplotlib.pyplot as plt

### Setting paths

In [None]:
# Resolve the project root (two levels above the directory the kernel started in)
# only once per kernel session. A bare os.chdir("../..") would climb two MORE levels
# every time this cell is re-run and silently change data_path / result_path.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Keep the working directory at the project root, as before.
os.chdir(PROJECT_ROOT)

# Input corpus and model output directories, relative to the project root
data_path = os.path.join(PROJECT_ROOT, 'corpus', 'preprocessed')
result_path = os.path.join(PROJECT_ROOT, 'models', 'BERTopic')

### Loading pre-processed data

In [None]:
def _load_pickle(*path_parts):
    '''
    Load one pickled object from the file at os.path.join(*path_parts).

    NOTE: pickle.load can execute arbitrary code contained in the file; only use
    this on files produced by this project's own preprocessing / training steps.
    '''
    with open(os.path.join(*path_parts), 'rb') as pickle_file:
        return pickle.load(pickle_file)


# The two inputs were previously bound to each other's names: the corpus file was
# stored in `time_steps` and the time-step file in `speeches`, although `speeches`
# is what gets tokenized as documents further down.
speeches = _load_pickle(data_path, 'corpus', 'BERTopic_corpus_preprocessed.pkl')
time_steps = _load_pickle(data_path, 'electoralTerms', 'BERTopic_time_steps.pkl')

# Topic representations per time step, as saved with the model results
topics_over_time = _load_pickle(result_path, 'model_results', 'topics_over_time.pkl')

### Loading trained BERTopic model

In [None]:
# Restore the trained BERTopic model from the model results directory
model_path = os.path.join(result_path, 'model_results', 'BERTopic')
topic_model = BERTopic.load(model_path)

# Topics and probabilities are not stored with the model (to save disk space),
# so both are rebuilt from the HDBSCAN clusterer kept inside the loaded model.
clusterer = topic_model.hdbscan_model
topics = topic_model._map_predictions(clusterer.labels_)

membership_vectors = hdbscan.all_points_membership_vectors(clusterer)
probs = topic_model._map_probabilities(membership_vectors, original_topics=True)

### Calculating topic coherence and topic diversity

In [None]:
# Extract vectorizer and tokenizer from BERTopic
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()

# Vocabulary of the fitted vectorizer. Current scikit-learn releases only offer
# get_feature_names_out(); fall back to the old accessor where it is missing.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# Extract features for topic coherence evaluation
tokens = [tokenizer(doc) for doc in speeches]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

# Top words of every topic except the outlier topic -1. The topic ids are taken
# from the assignments themselves rather than from range(len(set(topics)) - 1),
# which silently skipped the last topic whenever no document was an outlier.
topic_ids = sorted(set(topics) - {-1})
topic_words = [[word for word, _ in topic_model.get_topic(topic_id)] for topic_id in topic_ids]

In [None]:
# Inputs for gensim's coherence model, scored with the 'c_v' measure
coherence_settings = {
    'topics': topic_words,
    'texts': tokens,
    'corpus': corpus,
    'dictionary': dictionary,
    'coherence': 'c_v',
}
coherence_model = CoherenceModel(**coherence_settings)

# Single coherence value for the whole model
coherence = coherence_model.get_coherence()
print(coherence)

In [None]:
def get_list_of_topics (dataframe,time_step):
    '''
    Collect the words of every topic at one time step.

    :param dataframe: dataframe containing all topics for all time steps; the columns
        'Topic', 'Timestamp' and 'Words' are read. The dataframe is NOT modified.
    :param time_step: the time step (value of the 'Timestamp' column) a list of topics is to be returned for
    :return: list of lists containing the words associated with each topic at the given time step;
        rows of the outlier topic (-1) are left out. Repeat for all time steps
    '''
    # Select with boolean masks instead of dropping rows in place, so the caller's
    # dataframe keeps its outlier rows and repeated calls stay independent.
    is_regular_topic = dataframe['Topic'] != -1
    is_requested_step = dataframe['Timestamp'] == time_step
    words_at_time = dataframe.loc[is_regular_topic & is_requested_step, 'Words']

    # NOTE(review): assumes 'Words' holds one string per topic, with words separated by
    # whitespace and optionally ", " (BERTopic's topics_over_time format) - confirm.
    # Separator commas are stripped so "word," and "word" count as the same word.
    list_of_topics = []
    for line in words_at_time:
        cleaned = [word.strip(',') for word in line.split()]
        list_of_topics.append([word for word in cleaned if word])
    return list_of_topics

# Words of every topic at time step 19
topic_list = get_list_of_topics(topics_over_time, 19)

In [None]:
def get_topic_diversity(topics,topn):
    '''
    Compute the topic diversity of a set of topics.

    :param topics: list of list of topics as strings (one inner list of words per topic)
    :param topn: number of top words of each topic used to compute topic diversity
    :return: topic diversity score as proportion of unique words among the top ``topn`` words
        of all topics for a specific time step. Repeat for all time steps and average score.
    :raises ValueError: if ``topn`` is smaller than 1, if ``topics`` is empty, or if any topic
        has fewer than ``topn`` words
    '''
    if topn < 1:
        raise ValueError('topn must be at least 1, got ' + str(topn))
    if len(topics) == 0:
        raise ValueError('no topics given')
    # Every topic must supply topn words; otherwise the denominator below would
    # count words that do not exist and understate the score.
    if any(len(topic) < topn for topic in topics):
        raise ValueError('not enough words for topn ' + str(topn))

    unique_words = set()
    for topic in topics:
        unique_words.update(topic[:topn])
    return len(unique_words) / (topn * len(topics))
# Diversity over the top 25 words of each topic in topic_list
topic_diversity = get_topic_diversity(topic_list, 25)
print('Topic diversity: ', topic_diversity)

### Further analysis of the model results and creation of plots

In [None]:
# Show the static topic representations of the loaded model; as the last
# expression of the cell, the return value of get_topics() is rendered as its output.
topic_model.get_topics()

In [None]:
#Create a dataframe that contains the consolidated topic probabilities along time steps
# One column per topic, consistently named "Topic <n>". The hand-written list contained
# "Topic_15" and "Topic_21", which made lookups such as y="Topic 15" fail.
topic_columns = ["Topic " + str(topic_id) for topic_id in range(np.shape(probs)[1])]
topic_probabilities = pd.DataFrame(data=probs, index=None, columns=topic_columns)

# Row offsets at which each of the 19 electoral terms starts in topic_probabilities;
# the last entry is the end of the final term.
TERM_BOUNDARIES = [0, 4444, 8079, 11335, 16499, 23888, 29859, 37816, 44674, 48617,
                   58073, 67408, 77520, 87945, 98976, 106746, 119131, 135905, 148132, 164869]

# Average topic probability per term, one single-row dataframe each. The divisor is
# derived from the slice bounds so it always matches the slice; the hand-typed value
# for term 13 was 10452 although rows 77520:87945 span 10425 documents.
# NOTE: this rebinds `time_steps`, which the loading cell also assigns.
time_steps = [
    pd.DataFrame(topic_probabilities.iloc[start:end, :].sum() / (end - start)).T
    for start, end in zip(TERM_BOUNDARIES[:-1], TERM_BOUNDARIES[1:])
]

topic_probabilities_over_time_steps = pd.concat(time_steps)

# Label the rows '1. Term' ... '19. Term'
topic_probabilities_over_time_steps.index = [
    str(term) + '. Term' for term in range(1, len(time_steps) + 1)
]

In [None]:
def plot_topic_probability_over_time(data, number_of_plots, result_path=None, title=None, topics=None):
    '''
    Plot the probability of selected topics over all electoral terms as a line plot.

    :param data: dataframe with one row per electoral term and one column per topic
        (e.g. "Topic 15") containing the topic probabilities
    :param number_of_plots: number of colors taken from the glasbey palette; only used when
        ``topics`` is given as a plain sequence of column names
    :param result_path: path for saving the plot to disk (optional). The figure is written into
        the sub-directory 'topic_probability_over_time', which is created if it does not exist
    :param title: file name of the saved plot (optional); defaults to
        'topic_probability_over_time.png' when ``result_path`` is given without a title
    :param topics: columns to plot (optional). Either a dict mapping column name to color or a
        sequence of column names. Defaults to topics 15, 17, 22 and 23 in fixed colors
    :return: A figure that displays the share of topics over all electoral terms as a line plot
    :raises KeyError: if a requested topic column is not present in ``data``
    '''
    if topics is None:
        topic_colors = {
            'Topic 15': 'sienna',
            'Topic 17': 'lime',
            'Topic 22': 'orange',
            'Topic 23': 'lightseagreen',
        }
    elif isinstance(topics, dict):
        topic_colors = dict(topics)
    else:
        topic_names = list(topics)
        #define color palette for multiplot; hex strings are unambiguous single colors
        n_colors = max(number_of_plots, len(topic_names))
        palette = sns.color_palette(cc.glasbey_light, n_colors).as_hex()
        topic_colors = dict(zip(topic_names, palette))

    # Fail early with a readable message instead of deep inside the plotting call
    missing = [name for name in topic_colors if name not in data.columns]
    if missing:
        raise KeyError('topic columns not found in data: ' + ', '.join(str(name) for name in missing))

    # Own figure per call instead of drawing on whatever axes happen to be current
    fig, ax = plt.subplots(figsize=(20, 10))

    # Plot against row positions 0..n-1 so the ticks set below line up with the
    # rows regardless of the index type; the index values are used as tick labels.
    by_position = data.reset_index(drop=True)
    for name, color in topic_colors.items():
        by_position.plot(kind="line", y=name, color=color, linewidth=7.0, ax=ax)

    # Font sizes are passed explicitly rather than through plt.rcParams, which
    # would change the defaults of every later figure in the session.
    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=16)
    ax.set_ylabel('Topic Probability', fontsize=16)
    ax.set_xlabel('Electoral Terms', fontsize=16)

    # One tick per row; positions and labels are set together so they cannot drift apart
    ax.set_xticks(np.arange(len(data)))
    ax.set_xticklabels([str(label) for label in data.index])
    ax.tick_params(axis='both', labelsize=14)

    if result_path:
        output_dir = os.path.join(result_path, 'topic_probability_over_time')
        os.makedirs(output_dir, exist_ok=True)
        file_name = title if title else 'topic_probability_over_time.png'
        fig.savefig(os.path.join(output_dir, file_name))
    fig.show()
    return fig


# Plot topics 15, 17, 22 and 23 and store the figure under the model results
plot_topic_probability_over_time(
    data=topic_probabilities_over_time_steps,
    number_of_plots=31,
    result_path=result_path,
    title='BERTopic_topic_probability_over_time_topic_15_17_22_23.png',
)