In [None]:
# ADD CAPTIONS

from pprint import pprint
from gensim.models import CoherenceModel
from wordcloud import WordCloud
from collections import Counter
import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
import pyLDAvis
import pyLDAvis.gensim_models as LDAgensim


In [None]:
# Fit the LDA topic model on `corpus`, using `id2word` as the vocabulary mapping

warnings.filterwarnings('ignore')
model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=6,          # six topics are extracted
    random_state=100,      # fixed seed for the model
    update_every=1,
    chunksize=100,
    passes=14,
    alpha='auto',
    per_word_topics=True,  # later cells unpack model[doc] into three values
)
pprint(model.print_topics())
model_cor = model[corpus]

In [None]:
# Evaluate the fitted model: c_v topic coherence and the log-perplexity value on `corpus`

model_coher = CoherenceModel(model=model, texts=cleaned_data, dictionary=id2word, coherence='c_v')
coher_s = model_coher.get_coherence()
# Fixed: the score is stored in `coher_s`; the previously printed name
# `coherence_lda` is never defined in this notebook section.
print('Coherence Score: ', coher_s)
# NOTE(review): gensim's log_perplexity is documented as a per-word likelihood
# bound on a log scale rather than the perplexity itself -- confirm the label.
print('Perplexity: ', model.log_perplexity(corpus))

To find the most important words for each topic, we first identify the dominant topic of every document from its topic distribution.

In [1]:
# Topic distribution of every document in `corpus`; minimum_probability=0.0
# asks the model to report low-probability topics as well.
# NOTE: needs `model` and `corpus` from the earlier cells -- on a fresh kernel
# this cell fails with a NameError, so run the notebook top to bottom.
topic_dist = [
    model.get_document_topics(bow, minimum_probability=0.0)
    for bow in corpus
]
topic_dist

NameError: name 'corpus' is not defined

In [None]:
# For each document keep only the (topic_id, probability) pair with the
# highest probability, i.e. its dominant topic.
top_cor = [
    sorted(doc_topics, key=lambda pair: pair[1], reverse=True)[0]
    for doc_topics in topic_dist
]
top_cor

In [None]:
# Display the number of topics held by the fitted model
model.num_topics

In [None]:
# Top-20 (term, weight) pairs for every topic, weights rounded to three decimals
topics = [
    [(term, round(wt, 3)) for term, wt in model.show_topic(topic_id, topn=20)]
    for topic_id in range(model.num_topics)
]
topics

Next, we construct a dataframe matrix for the topics and keywords

In [None]:
# Keyword matrix: after the transpose each column is a topic and each row a
# keyword rank (20 ranks, matching topn=20 used when building `topics`).
keyword_rows = [[term for term, _ in topic] for topic in topics]
topics_mat = pd.DataFrame(
    keyword_rows,
    columns=[f'Keyword {rank}' for rank in range(1, 21)],
    index=[f'Topic {num}' for num in range(1, model.num_topics + 1)],
).T
topics_mat.head()

The keywords per topic are now viewable

In [None]:
# Show full cell contents. `None` disables truncation; the former value -1 has
# been deprecated since pandas 1.0 and newer releases reject it.
pd.set_option('display.max_colwidth', None)

# One row per topic holding its keywords as a single comma-separated string.
# NOTE: this rebinds `topics_mat`, replacing the keyword matrix built earlier.
topics_mat = pd.DataFrame(
    [', '.join([term for term, wt in topic]) for topic in topics],
    columns=['Topic Keywords'],
    index=['Topic' + str(t) for t in range(1, model.num_topics + 1)],
)
topics_mat

Wordcloud

We construct a wordcloud for our LDA model from the keywords for each topic

In [None]:
# Render one word cloud per topic from its comma-separated keyword string
cloud = WordCloud(background_color="white", colormap="Dark2",
                  max_font_size=150, random_state=42)
plt.rcParams['figure.figsize'] = [20, 10]

# Size the subplot grid from the topic count (3 columns) instead of assuming
# exactly 2 x 3 panels.
n_cols = 3
n_rows = -(-model.num_topics // n_cols)  # ceiling division

# One subplot per topic
for i in range(model.num_topics):

    # The Series is indexed by topic label, so select by position explicitly
    cloud.generate(text=topics_mat["Topic Keywords"].iloc[i])

    plt.subplot(n_rows, n_cols, i + 1)
    # Fixed: draw the `cloud` object; the previously used name `wc` is undefined
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(topics_mat.index[i])

plt.show()

Word counts and keyword weights (importance) per topic

In [None]:
######

In [None]:
# Build a frame with one row per (topic, keyword) holding the keyword's weight
# in that topic and its total count over all of `cleaned_data`.
tops = model.show_topics(formatted=False)
flat_data = [w for w_list in cleaned_data for w in w_list]
counts = Counter(flat_data)

# Fixed: counts are looked up in `counts` (the old code used the undefined
# name `counter`).
output = [
    [word, i, weight, counts[word]]
    for i, topic in tops
    for word, weight in topic
]

# Fixed: the frame is built from `output` (the old code used the undefined
# name `out`).
dataframe = pd.DataFrame(output, columns=['word', 'topic_id', 'weights', 'word_count'])

In [None]:
# now we plot the word count and the keyword weights
fig, axes = plt.subplots(2, 3, figsize=(16,10), sharey=True, dpi=160)
colors = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    ax.bar(x='word', height="word_count", data=dataframe.loc[dataframe.topic_id==i, :], color=colors[i], width=0.5, alpha=0.3, label='Word Count')
    axtwin = ax.twinx()
    axtwin.bar(x='word', height="weights", data=dataframe.loc[df.topic_id==i, :], color=colors[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=colors[i])
    ax_twin.set_ylim(0, 0.030); ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i), color=colors[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(dataframe.loc[dataframe.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left'); axtwin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Weights of Topic Keywords', fontsize=22, y=1.05)    
plt.show()

Next, we count how many speeches have each topic as their dominant topic.

In [None]:
def speeches_per_topic(model, corpus, start=0, end=1):
    """Find the dominant topic and the topic distribution of each document.

    Parameters
    ----------
    model : object indexable by a document; ``model[doc]`` must yield three
        values, the first being a list of ``(topic_id, probability)`` pairs.
    corpus : sliceable sequence of documents.
    start, end : slice bounds applied to ``corpus`` (``corpus[start:end]``).

    Returns
    -------
    tuple
        ``(dominant_topics, percentage_topic)`` where ``dominant_topics`` is a
        list of ``(position, topic_id)`` pairs (``position`` counts from 0
        within the slice) and ``percentage_topic`` holds the
        ``(topic_id, probability)`` list of every document in the slice.
    """
    # Fixed: the accumulator was created as `domtopics` but appended to and
    # returned as `dominant_topics`, which raised a NameError.
    dominant_topics = []
    percentage_topic = []
    for i, corp in enumerate(corpus[start:end]):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        # Topic with the highest probability for this document
        dominant_topic = max(topic_percs, key=lambda x: x[1])[0]
        dominant_topics.append((i, dominant_topic))
        percentage_topic.append(topic_percs)
    return dominant_topics, percentage_topic

# Fixed: call `speeches_per_topic` with the fitted `model`; the old call used
# the undefined names `topics_per_document` and `lda_model`. The slice end is
# len(corpus) because end=-1 silently left out the last document.
domtopics, percentage_topic = speeches_per_topic(model=model, corpus=corpus, end=len(corpus))

# Dominant Topics per speech
# NOTE: this rebinds `dataframe`, which earlier held the word-count table.
dataframe = pd.DataFrame(domtopics, columns=['Document_Id', 'Dominant_Topic'])
# Fixed: group the frame built on the previous line (the old code used `df`)
speech_dom_top = dataframe.groupby('Dominant_Topic').size()
df_speech_dom_top = speech_dom_top.to_frame(name='count').reset_index()

# Distribution of topics by weight: sum of each topic's probability over all documents
doc_weight = pd.DataFrame([dict(t) for t in percentage_topic])
df_doc_weight = doc_weight.sum().to_frame(name='count').reset_index()

# 3 main keywords per topic (the first three entries returned for each topic)
keywords3 = [(i, word) for i, words in model.show_topics(formatted=False)
                       for word, wt in words[:3]]

stacked_df_keywords3 = pd.DataFrame(keywords3, columns=['topic_id', 'words'])
# reset_index() returns a new frame, so no in-place mutation is needed
df_keywords3 = stacked_df_keywords3.groupby('topic_id').agg(', \n'.join).reset_index()

In [None]:
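# TODO: NOT DONE YET -- this plotting cell is still commented out. Before enabling it, replace the names
# df_dominant_topic_in_each_doc, df_topic_weightage_by_doc and df_top3words with the frames built in the
# previous cell (df_speech_dom_top, df_doc_weight and df_keywords3).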
# Plot speeches per dominant topic
#fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10), dpi=120, sharey=True)

# Topic Distribution by Dominant Topics
#ax1.bar(x='Dominant_Topic', height='count', data=df_dominant_topic_in_each_doc, width=.5, color='firebrick')
#ax1.set_xticks(range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__()))
#tick_formatter = FuncFormatter(lambda x, pos: 'Topic ' + str(x)+ '\n' + df_top3words.loc[df_top3words.topic_id==x, 'words'].values[0])
#ax1.xaxis.set_major_formatter(tick_formatter)
#ax1.set_title('Number of Documents by Dominant Topic', fontdict=dict(size=10))
#ax1.set_ylabel('Number of Documents')
#ax1.set_ylim(0, 1000)

# Topic Distribution by Topic Weights
#ax2.bar(x='index', height='count', data=df_topic_weightage_by_doc, width=.5, color='steelblue')
#ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__()))
#ax2.xaxis.set_major_formatter(tick_formatter)
#ax2.set_title('Number of Documents by Topic Weightage', fontdict=dict(size=10))

#plt.show()