# End-to-End Topic Modeling 

In [4]:
%matplotlib inline
from pathlib import Path
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interact, FloatRangeSlider
from matplotlib.ticker import FuncFormatter
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import LdaModel, LdaMulticore
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.corpora.dictionary import Dictionary
from pprint import pprint
from scipy import sparse
import pyLDAvis
from pyLDAvis.gensim import prepare
from gensim.models import CoherenceModel

In [5]:
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')

In [6]:
percentiles=np.arange(.1, 1, .1).round(1)

## Read Text

In [25]:
text_path = Path('../data/clean_stop')
text_files = text_path.glob('*.txt')
docs = [f.read_text() for f in text_files]
len(docs)

26717

## Create Document-Term Matrix

In [26]:
vectorizer = CountVectorizer(max_df=.8,
                             min_df=.05,
                             stop_words='english',
                             max_features=None,
                             binary=False)
dtm = vectorizer.fit_transform(docs)

### Token per doc

In [27]:
token_per_doc = np.array(dtm.sum(axis=1)).squeeze()

In [28]:
pd.Series(token_per_doc).describe(percentiles=percentiles)

count    26717.000000
mean      1973.682449
std       1279.306551
min          7.000000
10%        919.000000
20%       1213.200000
30%       1422.000000
40%       1590.000000
50%       1749.000000
60%       1931.000000
70%       2150.000000
80%       2504.000000
90%       3191.000000
max      45356.000000
dtype: float64

### Token Frequencies

In [29]:
tokens = vectorizer.get_feature_names()

In [30]:
t_count = pd.Series(np.array(dtm.sum(axis=0)).squeeze(), index=tokens).sort_values(ascending=False)
t_count.head(10).append(t_count.tail(10))

image          529010
network        402964
feature        394381
dataset        288647
training       279551
learn          264218
sample         254497
layer          231629
matrix         227439
point          216743
albeit           1814
remarkably       1796
involved         1782
ghz              1780
noticeable       1770
devote           1765
faculty          1747
seminal          1743
fortunately      1727
terms            1445
dtype: int64

## Run Topic Model

In [31]:
id2word = pd.Series(tokens).to_dict()
corpus = Sparse2Corpus(dtm, documents_columns=False)

In [32]:
n_topics = 10
topic_labels = [f'Topic {i}' for i in range(1, n_topics + 1)]

In [33]:
LdaModel(corpus=corpus,
         num_topics=n_topics,
         id2word=id2word)

<gensim.models.ldamodel.LdaModel at 0x112ff4908>

### Show topics

In [34]:
topic_words = pd.DataFrame()
topic_probs = pd.DataFrame()
for topic in range(n_topics):
    terms = lda_gensim.get_topic_terms(topic)
    top_words = pd.DataFrame(terms, columns=['term', 'probability'])
    topic_words[topic_labels[topic]] = top_words.term.map(id2word)
    topic_probs[topic_labels[topic]] = top_words.probability
topic_words.T

NameError: name 'lda_gensim' is not defined

In [None]:
# can maybe transfer style from probabilities to terms
cm = sns.light_palette("green", as_cmap=True)
topic_probs.T.style.background_gradient(cmap=cm).format('{:,.2%}'.format)

In [None]:
lda_gensim.print_topics()

### Show topic coherence

In [None]:
coherence = lda_gensim.top_topics(corpus=corpus, coherence='u_mass')

In [None]:
topic_coherence  = []
topic_words = pd.DataFrame()
for t in range(len(coherence)):
    label = topic_labels[t]
    topic_coherence.append(coherence[t][1])
    df = pd.DataFrame(coherence[t][0], columns=[(label, 'prob'), (label, 'term')])
    df[(label, 'prob')] = df[(label, 'prob')].apply(lambda x: '{:.2%}'.format(x))
    topic_words = pd.concat([topic_words, df], axis=1)
                      
topic_words.columns = pd.MultiIndex.from_tuples(topic_words.columns)
pd.set_option('expand_frame_repr', False)
# print(topic_words.head())
pd.Series(topic_coherence, index=topic_labels).plot.bar(figsize=(14,6));

## PyLDAVis

In [None]:
dictionary = Dictionary.from_corpus(corpus, id2word)
vis = prepare(lda_gensim, corpus, dictionary)
pyLDAvis.display(vis)

## Topic Sentences

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel= lda_gensim, corpus=corpus, texts=docs)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(20)

In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head(20)