In [None]:
import os
import gensim
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from gensim.corpora import textcorpus

import numpy as np
import pandas as pd
import pickle
from scipy.stats import linregress

import seaborn as sns
import colorcet as cc
import matplotlib.pyplot as plt

# from DTM import Dtm
import logging

# logging.getLogger() without a name returns the root logger; setting it to DEBUG lets
# records of every level through (presumably to watch gensim's progress output - confirm).
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Emits one DEBUG record with the text "test" as a quick check that logging is active.
logging.debug("test")

### Setting paths

In [None]:
# Move two directory levels up exactly once per kernel session. Without the guard,
# re-running this cell would climb two more levels each time and silently break every
# path derived below.
if "_project_root" not in globals():
    os.chdir("../..")
    _project_root = os.path.abspath(os.curdir)

# Absolute locations of the pre-processed inputs and of the DTM results.
data_path = os.path.join(_project_root, 'corpus', 'preprocessed')
result_path = os.path.join(_project_root, 'models', 'DTM')

# Path to the DTM executable. Set the DTM_PATH environment variable to override the
# machine-specific default, which is kept so existing setups continue to work.
dtm_path = os.environ.get("DTM_PATH", "/Users/florianlorisch/Downloads/dtm-master/dtm/dtm")

### Loading pre-processed data

In [None]:
# Load the pre-processed artefacts from data_path.
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary code -
# only load pickle files from a trusted source.
corpus = pd.read_pickle(os.path.join(data_path, 'corpus', 'corpus_preprocessed.pkl'))
# gensim dictionary stored in text format; passed to CoherenceModel in coherence_dtm_over_time_steps.
dictionary = corpora.Dictionary.load_from_text(os.path.join(data_path, 'dictionary', 'dictionary_preprocessed.txt'))
# Passed as `texts` to CoherenceModel in coherence_dtm_over_time_steps
# (presumably the lemmatized, tokenized documents - confirm against the preprocessing step).
texts = pd.read_pickle(os.path.join(data_path, 'lemmas', 'lemmatized_preprocessed.pkl'))


### Loading trained DTM model

In [None]:
# Location of the saved model below result_path, and the model restored from it.
# `dtm` is the model object every analysis cell in this notebook works on.
model_path = os.path.join(result_path, 'model_results', 'dtm.model')
dtm = DtmModel.load(model_path)

### Calculating topic coherence and topic diversity

In [None]:
def coherence_dtm_over_time_steps(model,time_steps):
    '''
    Average the c_v topic coherence of a DTM model over the given time steps.

    For every time step the topics are obtained via ``model.dtm_coherence(time=...)`` and
    scored with a gensim ``CoherenceModel``. The notebook-level ``texts`` and ``dictionary``
    objects are read as globals, so they must be loaded before this function is called.
    The coherence of each time step and the accumulated sum are printed.

    :param model: trained DTM model
    :param time_steps: iterable of time steps that are considered to calculate an overarching
    topic coherence score
    :return: topic coherence score averaged over time steps
    :raises ValueError: if time_steps is empty
    '''
    # Materialise once so generators and other one-shot iterables are accepted and len() works.
    time_steps = list(time_steps)
    if not time_steps:
        # Without this guard the final division fails with an unhelpful ZeroDivisionError.
        raise ValueError('time_steps must contain at least one time step')

    coherence_sum = 0
    for time in time_steps:
        topics = model.dtm_coherence(time=time)
        coherence_model_dtm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_dtm = coherence_model_dtm.get_coherence()
        print ('DTM coherence (c_v) at time: ',time ,coherence_dtm)
        coherence_sum = coherence_sum + coherence_dtm
    print(coherence_sum)
    return coherence_sum/len(time_steps)

# Evaluate the coherence for the time-step indices 0 through 18.
time_steps = list(range(19))
overall_coherence = coherence_dtm_over_time_steps(model=dtm, time_steps=time_steps)
print('overall coherence: ', overall_coherence)

In [1]:
def get_list_of_topics(topicids,time,model,topn):
    '''
    Collect the top words of several topics at a single time step.

    :param topicids: ids of the topics that should be added to the list
    :param time: time step the words are taken from
    :param model: the model the topics are to be taken from (must provide ``show_topic``)
    :param topn: number of top words requested per topic
    :return: a list with one inner list per topic id, holding the words returned by
    ``model.show_topic`` in their original order. Used as input for the topic diversity.
    '''
    # show_topic yields pairs whose second element is the word; the first element is dropped.
    return [
        [word for _, word in model.show_topic(topicid=current_id, time=time, topn=topn)]
        for current_id in topicids
    ]
# Inputs for the topic diversity: the top 25 words of the topics 0 through 30,
# taken at time-step index 18.
topn = 25
model = dtm
topicids = list(range(31))

topic_list = get_list_of_topics(topicids=topicids, time=18, model=model, topn=topn)

NameError: name 'dtm' is not defined

In [None]:
def get_topic_diversity(topics,topn):
    '''
    Compute the topic diversity: the share of unique words among the top words of all topics.

    :param topics: list of topics, each given as a list of words (strings)
    :param topn: number of top words per topic used to compute the topic diversity
    :return: number of unique words within the first topn words of every topic, divided by
    ``topn * len(topics)``. 1.0 means that no word is shared between topics.
    :raises ValueError: if topn is not positive, if topics is empty, or if any topic holds
    fewer than topn words
    '''
    if topn <= 0:
        # Also protects the division below against topn == 0.
        raise ValueError('topn must be a positive integer, got ' + str(topn))
    if len(topics) == 0:
        raise ValueError('topics must contain at least one topic')
    # Every topic has to supply topn words; otherwise the denominator overstates the
    # number of words considered and the score is silently deflated.
    if any(len(topic) < topn for topic in topics):
        raise ValueError('not enough words for topk ' + str(topn))

    unique_words = set()
    for topic in topics:
        unique_words.update(topic[:topn])
    return len(unique_words) / (topn * len(topics))

# Reuse the notebook-level `topn` that was used to build `topic_list`, so the label and
# the number of evaluated words cannot drift apart from the words that were collected.
print('diversity topk%d: ' % topn, get_topic_diversity(topics=topic_list, topn=topn))

### Further analysis of the model results

In [None]:
def topic_to_dataframe(model, time_steps, topicid, topn):
    '''
    Tabulate the top terms of one topic for several time steps.

    :param model: the model the topic is taken from (must provide ``show_topic``)
    :param time_steps: time steps considered; each one becomes a column
    :param topicid: id of the topic to tabulate
    :param topn: number of top terms requested per time step
    :return: a dataframe with one column per time step holding the terms returned by
    ``model.show_topic`` in their original order
    '''
    # Only the term (second element of each pair) is kept.
    terms_by_step = {
        step: [term for _, term in model.show_topic(topicid=topicid, time=step, topn=topn)]
        for step in time_steps
    }
    return pd.DataFrame(terms_by_step)

# Top 10 terms of topic 5 at the time-step indices 0, 4, 12 and 18.
# NOTE: `time_steps` is a notebook global that several cells rebind with different values.
time_steps = [0, 4, 12, 18]
topic_5_df = topic_to_dataframe(model=dtm, time_steps=time_steps, topicid=5, topn=10)

In [None]:
def topic_with_p_to_dataframe(model, time_steps, topicid, topn):
    '''
    Tabulate the top terms of one topic together with their probabilities.

    :param model: the model the topic is taken from (must provide ``show_topic``)
    :param time_steps: time steps considered; each one becomes a column
    :param topicid: id of the topic to tabulate
    :param topn: number of top terms requested per time step
    :return: a dataframe with one column per time step; every cell is a
    ``(probability, term)`` tuple in the order returned by ``model.show_topic``
    '''
    # Each pair is rebuilt as a tuple, whatever sequence type show_topic hands back.
    pairs_by_step = {
        step: [(weight, term) for weight, term in model.show_topic(topicid=topicid, time=step, topn=topn)]
        for step in time_steps
    }
    return pd.DataFrame(pairs_by_step)

# Top 10 (probability, term) pairs of topic 5 at the time-step indices 0, 4, 12 and 18.
time_steps = [0, 4, 12, 18]
topic_5_df_p = topic_with_p_to_dataframe(model=dtm, time_steps=time_steps, topicid=5, topn=10)

In [None]:
def summary(model, time_steps, topicids, topn):
    '''
    Print the top terms of several topics, one table per topic.

    :param model: the model the topics are to be taken from
    :param time_steps: time steps considered; one table column per time step
    :param topicids: ids of the topics to print
    :param topn: number of top terms considered
    :return: None. For every topic a header line, the dataframe built by
    ``topic_to_dataframe`` and an empty line are printed.
    '''
    for current_id in topicids:
        header = 'Topic %d' % current_id
        print(header)
        top_terms = topic_to_dataframe(model=model, topicid=current_id, time_steps=time_steps, topn=topn)
        print(top_terms)
        print()

# Print the top 10 terms of the topics 0 through 30 at the time-step indices 0, 5, 13 and 17.
topicids = list(range(31))
time_steps = [0, 5, 13, 17]

summary(model=dtm, time_steps=time_steps, topicids=topicids, topn=10)

### Creating plots

In [None]:
def get_topic_term_probability(model,time_steps,topicids):
    '''
    Average the entries of ``model.gamma_`` per topic over the documents of each time step.

    :param model: DTM model with a ``gamma_`` matrix indexed as ``gamma_[doc_number, topicid]``
    :param time_steps: sequence of document-number collections (e.g. ranges), one per
    electoral term, naming the rows of the plenary speeches that belong to that term
    :param topicids: topics that should be considered
    :return: nested dictionary ``{position of time step: {topicid: mean gamma value}}``.
    The per-time-step dictionary is also printed as it is computed.
    '''
    def _mean_gamma(doc_numbers, topicid):
        # Plain left-to-right accumulation, then the arithmetic mean over the documents.
        total = 0
        for doc_number in doc_numbers:
            total += model.gamma_[doc_number, topicid]
        return total / len(doc_numbers)

    averages_by_step = {}
    for position, doc_numbers in enumerate(time_steps):
        step_averages = {topicid: _mean_gamma(doc_numbers, topicid) for topicid in topicids}
        print(step_averages)
        averages_by_step[position] = step_averages
    return averages_by_step

topicids = range(0, 31)

# Document-number boundaries of the 19 consecutive time steps: time step i covers the
# documents term_boundaries[i] up to (but excluding) term_boundaries[i + 1].
term_boundaries = [
    0, 4444, 8079, 11335, 16499,
    23888, 29859, 37816, 44674, 48617,
    58073, 67408, 77520, 87945, 98976,
    106746, 119131, 135905, 148132, 164869,
]
time_steps = [range(start, stop) for start, stop in zip(term_boundaries, term_boundaries[1:])]

gamma_total = get_topic_term_probability(model=dtm, time_steps=time_steps, topicids=topicids)

In [None]:
# Build a dataframe from the nested dictionary populated above:
# one row per time step, one column per topic.
topic_probabilities_over_time_steps = pd.DataFrame.from_dict(gamma_total, orient="index")

# Row labels '1. Term' through '19. Term'.
topic_probabilities_over_time_steps.index = ['%d. Term' % term_number for term_number in range(1, 20)]

# Column labels 'Topic 0' through 'Topic 30'.
topic_probabilities_over_time_steps.columns = ['Topic %d' % topic_number for topic_number in range(31)]

In [None]:
def plot_topic_probability_over_time(data, number_of_plots, result_path, title=None, topic_colors=None):
    '''
    Draw the probability of selected topics over all electoral terms as a line plot.

    :param data: dataframe with topic probabilities over time steps (one row per time step,
    one column per topic)
    :param number_of_plots: currently unused; kept so that existing calls keep working
    :param result_path: base path for saving the plot to disk (optional). The figure is written
    to ``<result_path>/topic_probability_over_time/<title>``; the folder is created if needed.
    :param title: file name of the saved figure (optional). Nothing is saved without it.
    :param topic_colors: mapping ``{column name: line colour}`` of the topics to draw (optional).
    Defaults to 'Topic 0', 'Topic 16' and 'Topic 20'.
    :return: the matplotlib figure containing the plot
    '''
    if topic_colors is None:
        topic_colors = {'Topic 0': 'tab:red', 'Topic 16': 'lightskyblue', 'Topic 20': 'peachpuff'}

    ax = plt.gca()
    # set general fontsize (note: this changes matplotlib's global rcParams)
    plt.rcParams['font.size'] = '16'

    for column, color in topic_colors.items():
        data.plot(kind="line", y=column, color=color, linewidth=7.0, figsize=(20, 10), ax=ax)

    # fontsize for ticks; tick_params also covers ticks that are created later on,
    # unlike setting the size on the label objects that exist before plotting
    ax.tick_params(axis='both', labelsize=14)

    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    ax.set_ylabel('Topic Probability')
    ax.set_xlabel('Electoral Terms')
    # one tick per row of data instead of a hard-coded count
    ax.set_xticks(np.arange(0, len(data), 1))

    fig = ax.get_figure()
    # Saving needs both the base path and a file name; joining a path with title=None
    # would raise a TypeError.
    if result_path and title:
        output_dir = os.path.join(result_path, 'topic_probability_over_time')
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, title))
    fig.show()
    return fig


# Plot the topic probabilities over all electoral terms and save the figure below result_path.
plot_topic_probability_over_time(
    data=topic_probabilities_over_time_steps,
    number_of_plots=31,
    result_path=result_path,
    title='DTM_topic_probability_over_time_topic_0_16_20.png',
)