In [1]:
import pickle
import os

import re
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore

import seaborn as sns
import colorcet as cc
import matplotlib.pyplot as plt

import logging

# Configure the root logger (getLogger() without a name returns it) to let DEBUG and
# higher records through, so that library log records show up in the notebook output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Smoke test: emit one DEBUG record via the module-level helper to confirm it is shown.
# NOTE(review): the module-level logging.debug() installs a default handler when the
# root logger has none, so keep this call as-is rather than switching to logger.debug().
logging.debug("test")

DEBUG:root:test


### Setting paths

In [2]:
# Move the working directory two levels up and derive the data / result folders from it.
# NOTE: the chdir is relative, so re-running this cell moves up another two levels;
# execute it exactly once per kernel session.
os.chdir("../..")
data_path = os.path.join(os.getcwd(), 'corpus')
result_path = os.path.join(os.getcwd(), 'models', 'LDA')

### Loading preprocessed files

In [6]:
# Preprocessed corpus that is later fed document by document into the LDA model.
# NOTE: read_pickle must only be used on trusted files (unpickling can execute code).
corpus = pd.read_pickle(
    os.path.join(data_path, 'preprocessed', 'corpus', 'corpus_preprocessed.pkl')
)

# Gensim dictionary stored in its text format.
dictionary = corpora.Dictionary.load_from_text(
    os.path.join(data_path, 'preprocessed', 'dictionary', 'dictionary_preprocessed.txt')
)

# Lemmatized texts, used as `texts` input for the coherence computation.
texts = pd.read_pickle(
    os.path.join(data_path, 'preprocessed', 'lemmas', 'lemmatized_preprocessed.pkl')
)

# Speeches table: order by id, discard the first column and renumber the rows from 0.
# NOTE(review): assumes row order after sorting matches the document order in `corpus`
# (the two are combined by position further down) - TODO confirm.
speeches_df = (
    pd.read_csv(os.path.join(data_path, 'prepared', 'corpus.csv'))
    .sort_values(by='id', ascending=True)
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .reset_index(drop=True)
)

DEBUG:smart_open.smart_open_lib:{'uri': '/Users/florianlorisch/PycharmProjects/open-discourse-dynamic-topic-model/corpus/preprocessed/dictionary/dictionary_preprocessed.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'compression': None, 'transport_params': None}


### Loading modeling results

In [None]:
# Result table of the search over the number of topics.
results_num_topics_comparison = pd.read_pickle(os.path.join(
    result_path, 'topic_search_results', 'num_topics_search_results.pkl'
))
# Result table of the search over the alpha / eta priors.
results_alpha_eta_comparison = pd.read_pickle(os.path.join(
    result_path, 'alpha_beta_search_results', 'alpha_beta_search_results.pkl'
))

### Loading best performing LDA model

In [3]:
# Restore the persisted LdaMulticore model from the result folder.
lda = LdaMulticore.load(
    os.path.join(result_path, 'model_results', 'lda.model')
)

INFO:gensim.utils:loading LdaMulticore object from /Users/florianlorisch/PycharmProjects/open-discourse-dynamic-topic-model/models/LDA/model_results/lda.model
DEBUG:smart_open.smart_open_lib:{'uri': '/Users/florianlorisch/PycharmProjects/open-discourse-dynamic-topic-model/models/LDA/model_results/lda.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'compression': None, 'transport_params': None}
INFO:gensim.utils:loading expElogbeta from /Users/florianlorisch/PycharmProjects/open-discourse-dynamic-topic-model/models/LDA/model_results/lda.model.expElogbeta.npy with mmap=None
INFO:gensim.utils:setting ignored attribute id2word to None
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:setting ignored attribute state to None
INFO:gensim.utils:loaded /Users/florianlorisch/PycharmProjects/open-discourse-dynamic-topic-model/models/LDA/model_results/lda.model
INFO:gensim

### Quantitative evaluation of the best performing LDA with topic coherence (c_v)

In [None]:
def get_coherence(model, texts, dictionary, coherence):
    '''
    Compute the topic coherence of a topic model, print it rounded to four decimals
    and return the unrounded value.

    :param model: topic model to evaluate (e.g. the best performing lda model)
    :param texts: tokenized texts
    :param dictionary: Gensim dictionary mapping of id word to create corpus
    :param coherence: name of the coherence metric (e.g. 'c_v', 'c_uci', 'c_npmi')
    :return: coherence score
    '''
    evaluator = CoherenceModel(
        model=model,
        texts=texts,
        dictionary=dictionary,
        coherence=coherence,
    )
    score = evaluator.get_coherence()
    print('Coherence Score: ', round(score, 4))
    return score

# function call: c_v coherence of the loaded LDA model
get_coherence(
    model=lda,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)

### Quantitative evaluation of the best performing LDA with topic diversity

In [None]:
def get_list_of_topics(topicids,model,topn):
    '''
    Collect the top words of several topics.

    :param topicids: iterable with the ids of the topics that should be added to the list
    :param model: the model the topics are to be taken from (must offer show_topic)
    :param topn: number of top words that should be considered per topic
    :return: a list of lists; one inner list with the topn words per selected topic,
             in the order of topicids
    '''
    return [
        # show_topic yields (word, weight) pairs; only the word is kept
        [word for word, _weight in model.show_topic(topicid=topic_id, topn=topn)]
        for topic_id in topicids
    ]

# function call: top 25 words for each of the topics 0..30
topn = 25
model = lda
topicids = list(range(31))
topic_list = get_list_of_topics(topicids=topicids, model=model, topn=topn)

In [None]:
def get_topic_diversity(topics,topn):
    '''
    Compute topic diversity: the share of unique words among the top n words of all
    topics (1.0 = no word is shared between topics).

    :param topics: list of topics, each given as a list of words (strings) ordered by rank
    :param topn: number of top words per topic used to compute topic diversity
    :return: topic diversity score based on the top n words selected
    :raises ValueError: if topn is smaller than 1, if no topic is given, or if any topic
                        contains fewer than topn words
    '''
    if topn < 1:
        raise ValueError('topn must be at least 1, got ' + str(topn))
    if len(topics) == 0:
        raise ValueError('at least one topic is required to compute topic diversity')
    # Every topic has to provide topn words; otherwise the denominator below would
    # count words that do not exist and silently deflate the score.
    if any(len(topic) < topn for topic in topics):
        raise ValueError('not enough words for topn ' + str(topn))

    unique_words = set()
    for topic in topics:
        unique_words.update(topic[:topn])
    return len(unique_words) / (topn * len(topics))


# topic diversity over the top 25 words of every topic
get_topic_diversity(
    topics=topic_list,
    topn=25,
)

### Further analysis of the results

In [None]:
def topic_to_dataframe(topicid, model, topn):
    '''
    Tabulate the top words of a single topic.

    :param topicid: id of the topic that should be tabulated
    :param model: the model the topic is to be taken from (must offer show_topic)
    :param topn: number of top words that should be considered
    :return: a dataframe with one row per top word; column 0 holds the word and
             column 1 the value show_topic reports for it
    '''
    rows = [
        (term, weight)
        for term, weight in model.show_topic(topicid=topicid, topn=topn)
    ]
    return pd.DataFrame(rows)

# top 10 words of topic 5 as a dataframe
topic_df_5 = topic_to_dataframe(
    topicid=5,
    model=lda,
    topn=10,
)

In [None]:
def summary(model, topicids, topn=10):
    '''
    Print a summary of the selected topics: for every topic a 'Topic <id>' header,
    the table of its top words and a blank separator line.

    :param model: the model the topics are to be taken from
    :param topicids: ids of the topics that should be included in the summary
    :param topn: number of top words that should be shown per topic
    :return: None, the summary is written to standard output
    '''
    for current_id in topicids:
        header = 'Topic %d' % current_id
        print(header)
        top_words = topic_to_dataframe(model=model, topicid=current_id, topn=topn)
        print(top_words)
        print()


# function call: print the top 10 words for each of the topics 0..30
topicids = list(range(31))

summary(model=lda, topicids=topicids, topn=10)

In [None]:
def get_topic_probabilities(model,corpus,document_range):
    '''
    Infer the topic distribution of the selected documents as dense, equally long rows.

    The model only reports topics above a minimum probability as (topic id, probability)
    pairs. Each probability is therefore written to the position of its topic id and
    topics that are not reported are set to 0.0, so position i always belongs to topic i.

    :param model: the model the topics are to be taken from (needs num_topics and
                  get_document_topics, as provided by gensim LDA models)
    :param corpus: the corpus the model is trained with (indexable by document position)
    :param document_range: the range of documents within the corpus that should be considered
    :return: a list with one list per document, each holding model.num_topics
             probabilities ordered by topic id
    '''
    topic_documents = []
    for document in document_range:
        # ask for (nearly) all topics; whatever is still filtered out stays at 0.0
        sparse_distribution = model.get_document_topics(
            corpus[document], minimum_probability=0.0
        )
        dense_distribution = [0.0] * model.num_topics
        for topic_id, probability in sparse_distribution:
            dense_distribution[topic_id] = probability
        topic_documents.append(dense_distribution)
    return topic_documents

# Topic distributions for the documents at positions 0..164868, tabulated with one
# row per document.
document_range = range(164869)
topic_probabilities = get_topic_probabilities(
    model=lda,
    corpus=corpus,
    document_range=document_range,
)
topic_probabilities_df = pd.DataFrame(data=topic_probabilities)

In [None]:
def get_dominant_topics(data):
    '''
    Reduce a full topic distribution to the dominant topic per row.

    :param data: dataframe with full topic distribution over all documents or timeslice
                 (one row per document/timeslice, one column per topic)
    :return: dataframe with the columns 'Top Topic' (label of the column with the
             highest value) and 'Topic Share' (that highest value) for each row
    '''
    dominant = pd.concat(
        [data.idxmax(axis='columns'), data.max(axis='columns')],
        axis='columns',
        keys=['Top Topic', 'Topic Share'],
    )
    return dominant

# dominant topic and its share for every document
top_topics_df = get_dominant_topics(
    data=topic_probabilities_df,
)
##%

def get_speeches_top_topics(data, top_topics):
    '''
    Attach the top topic information to the speeches, side by side.

    :param data: dataframe containing the preprocessed speeches
    :param top_topics: dataframe with top topics per speech
    :return: concatenated dataframe holding the original speech columns followed by the
             top topic columns; rows are matched on the index of both dataframes
    '''
    combined = pd.concat(objs=[data, top_topics], axis='columns')
    return combined

# speeches together with their dominant topic and its share
speeches_with_topics = get_speeches_top_topics(
    data=speeches_df,
    top_topics=top_topics_df,
)

### Creating plots

In [None]:
def get_topic_probabilities_over_time_steps(topic_probabilities, boundaries=None):
    '''
    Average the per-document topic probabilities within consecutive time steps.

    Time step k covers the rows boundaries[k-1]:boundaries[k] (end exclusive) and is
    summarised as the column sums of that slice divided by the number of rows the
    slice spans. The divisor is derived from the boundaries, so it cannot drift from
    the slice it belongs to.

    :param topic_probabilities: dataframe containing the topic probabilities for documents
                                in a corpus (one row per document, one column per topic)
    :param boundaries: optional sequence of ascending row positions delimiting the time
                       steps; defaults to the 19 electoral terms of the 164869 documents
    :return: dataframe containing the consolidated topic probabilities along time steps,
             indexed '1. Term', '2. Term', ... with the columns 'Topic 0', 'Topic 1', ...
             (one per column of topic_probabilities, numbered by position)
    '''
    if boundaries is None:
        # row positions at which a new electoral term starts, plus the total row count
        boundaries = (0, 4444, 8079, 11335, 16499, 23888, 29859, 37816, 44674, 48617,
                      58073, 67408, 77520, 87945, 98976, 106746, 119131, 135905,
                      148132, 164869)

    time_steps = []
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        time_steps.append(topic_probabilities.iloc[start:stop, :].sum() / (stop - start))

    # one row per time step, one column per topic
    topic_distribution_total = pd.DataFrame(time_steps)
    topic_distribution_total.index = [
        '%d. Term' % number for number in range(1, len(time_steps) + 1)
    ]
    # label the columns from the data itself so the number of labels always matches
    topic_distribution_total.columns = [
        'Topic %d' % position for position in range(topic_distribution_total.shape[1])
    ]
    return topic_distribution_total

# average topic probabilities per electoral term
topic_probabilities_over_time_steps = get_topic_probabilities_over_time_steps(
    topic_probabilities=topic_probabilities_df,
)

In [None]:
def plot_topic_probability_over_time(data, number_of_plots, result_path=None, title=None):
    '''
    Plot the share of every topic over all electoral terms as a line plot.

    :param data: dataframe containing the consolidated topic probabilities
                 (one row per electoral term, one column per topic)
    :param number_of_plots: number of topics that should be plotted (number of colors
                            drawn from the palette)
    :param result_path: path for saving the plot to disk (optional); the figure is written
                        to the sub folder 'topic_probability_over_time', which is created
                        if it does not exist
    :param title: file name of the saved plot (optional); only used when result_path is
                  given and defaults to 'topic_probability_over_time.png'
    :return: A figure that displays the share of topics over all electoral terms as a line plot
    '''
    #define color palette for multiplot
    colors = sns.color_palette(cc.glasbey_light, number_of_plots)

    #set general fontsize before the axes are requested, so that text created together
    #with new axes already uses it
    plt.rcParams['font.size'] = '16'
    ax = plt.gca()

    data.plot(kind="line", color=colors,linewidth=4.0,figsize=(20, 10),ax=ax)

    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    ax.set_ylabel('Topic Probability')
    ax.set_xlabel('Electoral Terms')
    #one tick per row of data, labelled with the row's index value
    ax.set_xticks(np.arange(len(data)))
    ax.set_xticklabels(data.index)
    #fontsize for ticks; set after plotting so it applies to the final tick labels
    ax.tick_params(axis='both', labelsize=14)

    fig = ax.get_figure()
    if result_path:
        target_dir = os.path.join(result_path, 'topic_probability_over_time')
        os.makedirs(target_dir, exist_ok=True)
        fig.savefig(os.path.join(target_dir, title or 'topic_probability_over_time.png'))
    fig.show()
    return fig


# line plot of all 31 topics over the electoral terms, saved below the result folder
plot_topic_probability_over_time(
    data=topic_probabilities_over_time_steps,
    number_of_plots=31,
    result_path=result_path,
    title='LDA_topic_probability_over_time.png',
)

In [None]:
def plot_coherence_num_topics_random(data,result_path=None,title=None):
    '''
    Plot the coherence reached for every number of topics as a point plot.

    :param data: dataframe with the results from the gridsearch performed to identify the
                 optimal combination of num_topics and random_state; needs the columns
                 'Topics', 'Coherence' and 'Random_State'
    :param result_path: path for saving the plot to disk (optional); the figure is written
                        to the sub folder 'topic_search_results', which is created if it
                        does not exist
    :param title: file name of the saved plot (optional); only used when result_path is
                  given and defaults to 'coherence_num_topics_random.png'
    :return: Figure with Topics on x and Coherence on y axis, while the hue indicates the
             Random_State used at a particular run
    '''
    # catplot returns a seaborn grid object, not an axes; the figure hangs off the grid
    grid = sns.catplot(data=data, x='Topics', y='Coherence', hue='Random_State', kind='point')
    fig = grid.fig
    fig.set_figwidth(16)
    fig.set_figheight(6)

    if result_path:
        target_dir = os.path.join(result_path, 'topic_search_results')
        os.makedirs(target_dir, exist_ok=True)
        fig.savefig(os.path.join(target_dir, title or 'coherence_num_topics_random.png'))
    fig.show()

    return fig

# coherence per number of topics and random state (displayed only, not saved)
plot_coherence_num_topics_random(
    data=results_num_topics_comparison,
)