<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Set-cosmetic-parameters." data-toc-modified-id="Set-cosmetic-parameters.-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Set cosmetic parameters.</a></span></li><li><span><a href="#Create-streamed-corpus-object." data-toc-modified-id="Create-streamed-corpus-object.-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Create streamed corpus object.</a></span></li><li><span><a href="#Run-Mallet-LDA" data-toc-modified-id="Run-Mallet-LDA-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Run Mallet LDA</a></span><ul class="toc-item"><li><span><a href="#Get-top-terms-per-topic" data-toc-modified-id="Get-top-terms-per-topic-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Get top terms per topic</a></span><ul class="toc-item"><li><span><a href="#Visualize" data-toc-modified-id="Visualize-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>Visualize</a></span></li></ul></li><li><span><a href="#Get-topic-thetas-and-entropy-for-each-speech" data-toc-modified-id="Get-topic-thetas-and-entropy-for-each-speech-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Get topic thetas and entropy for each speech</a></span></li></ul></li></ul></div>

***This notebook fits a 200-topic Mallet LDA model to the UK parliamentary speech corpus, producing a data frame that contains each speech's 200 topic proportions (thetas) and their entropy, as well as word clouds that visualize selected topics.***

## Set cosmetic parameters.

In [1]:
# Ask the inline backend for 'retina' figure output.
%config InlineBackend.figure_format = 'retina'

In [2]:
# Widen the notebook's main container to 85% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [3]:
### Libraries

# Standard Libraries 
import importlib
import csv
import pandas as pd
import numpy as np
import os
import random
import string
from collections import defaultdict

# Plotting Libraries
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib import pylab
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

# Gensim
from gensim import corpora, models
from gensim.corpora.dictionary import Dictionary
from gensim.models.wrappers import LdaMallet

# NLTK
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Scientific libraries
from numpy import arange,array,ones
from scipy import stats

# Other
import glob
import pickle
from PIL import Image 

In [4]:
# Number of LDA topics; also used below to name the output folder and files.
ktopics = 200

In [5]:
# All locations are derived from the parent of the current working directory,
# so the notebook is expected to be run from a sub-folder of the project root.
DIRECTORY = os.path.dirname(os.getcwd()) + '/'
DVS = DIRECTORY + '00_data/00_dvs/'                     # input documents and CSV outputs
LDA = DIRECTORY + '00_data/99_lda{}/'.format(ktopics)   # model files for this topic count
VISUALS = DIRECTORY + '02_visuals/00_text/'             # exported word clouds

# Echo the resolved locations so a wrong working directory is spotted early.
for location in (DIRECTORY, DVS, LDA, VISUALS):
    print(location)

/Volumes/GoogleDrive/My Drive/02_Stanford/00_Researching/16_SocialScientization/-03_HM/00_replication/
/Volumes/GoogleDrive/My Drive/02_Stanford/00_Researching/16_SocialScientization/-03_HM/00_replication/00_data/00_dvs/
/Volumes/GoogleDrive/My Drive/02_Stanford/00_Researching/16_SocialScientization/-03_HM/00_replication/00_data/99_lda200/
/Volumes/GoogleDrive/My Drive/02_Stanford/00_Researching/16_SocialScientization/-03_HM/00_replication/02_visuals/00_text/


## Create streamed corpus object.

In [6]:
class StreamedCorpus(object):
    """Re-iterable bag-of-words corpus that streams one document per line.

    Every pass re-opens the text file, splits each line on whitespace and
    converts the tokens with ``doc2bow``, so the corpus is never held in memory.

    Parameters
    ----------
    path : str, optional
        Text file with one document per line. Defaults to
        ``DVS + 'docs_bigrams.txt'``, resolved when iteration starts.
    id2word : object with a ``doc2bow(tokens)`` method, optional
        Vocabulary used for the conversion. Defaults to the module-level
        ``dictionary``, looked up when iteration starts, so it may be
        created after this object is instantiated.
    """

    def __init__(self, path=None, id2word=None):
        self.path = path
        self.id2word = id2word

    def __iter__(self):
        # Compare against None explicitly: an empty vocabulary may be falsy.
        path = self.path if self.path is not None else DVS+'docs_bigrams.txt'
        vocab = self.id2word if self.id2word is not None else dictionary
        # The context manager closes the handle after every pass, including
        # passes that are abandoned early.
        with open(path) as docs_file:
            for line in docs_file:
                yield vocab.doc2bow(line.split())

In [7]:
# Nothing is read here; documents are produced only when the object is iterated.
corpus_stream = StreamedCorpus()

In [8]:
# Build the vocabulary in one streaming pass over the documents
# (one whitespace-tokenised document per line). The context manager closes
# the file once the pass is complete instead of leaving the handle open.
with open(DVS+'docs_bigrams.txt') as docs_file:
    dictionary = Dictionary(line.split() for line in docs_file)

In [9]:
# Summary of the vocabulary: number of unique tokens plus a small sample.
print(dictionary)

Dictionary(549369 unique tokens: ['account', 'acquaint', 'attend', 'chair', 'command']...)


## Run Mallet LDA

In [6]:
# Mallet
# MALLET_HOME is meant to name the installation root, not the launcher
# script; the launcher itself lives under <root>/bin/mallet.
mallet_home = r'/Applications/mallet-2.0.8'
os.environ.update({'MALLET_HOME': mallet_home})
mallet_path = mallet_home + '/bin/mallet'

In [48]:
# Settings for the Mallet run, collected in one place so they are easy to audit.
mallet_settings = dict(
    prefix=LDA,             # passed as the wrapper's `prefix` (the LDA folder)
    corpus=corpus_stream,   # streamed bag-of-words documents
    num_topics=ktopics,
    id2word=dictionary,
    iterations=100,
    random_seed=777,        # fixed seed for repeatable runs
)
ldamallet = models.wrappers.LdaMallet(mallet_path, **mallet_settings)

In [49]:
# Persist the trained wrapper under LDA as "lda.<ktopics>" so later cells can reload it.
ldamallet.save(LDA+"lda.{}".format(ktopics))

### Get top terms per topic

In [50]:
# Export the first 25 terms returned by show_topic for every topic,
# one CSV row per topic:  topic<i>, term_1, ..., term_25
n_topics = len(ldamallet.get_topics())
# newline='' is what the csv module requires of files it writes to; without
# it, platforms that translate line endings emit a blank line after each row.
with open(DVS+"topics_terms{}.csv".format(ktopics), 'w', newline='') as f:
    csvwriter = csv.writer(f)
    for i in range(n_topics):
        # show_topic yields (term, weight) pairs; only the term is exported.
        top_terms = [term for term, _weight in ldamallet.show_topic(i, 25)]
        csvwriter.writerow(['topic{}'.format(i)] + top_terms)

#### Visualize

In [6]:
# Reload the model saved above. It was created and saved as an LdaMallet
# wrapper, and later cells call the wrapper-only load_document_topics(), so it
# is loaded through LdaMallet rather than through the unrelated LdaModel class.
ldamallet = LdaMallet.load(LDA+"lda.{}".format(ktopics))



In [7]:
# For every topic, up to 50 (term, weight) pairs; formatted=False keeps them
# as tuples instead of one pre-rendered string per topic.
topics = ldamallet.show_topics(num_topics=ktopics, num_words=50, formatted=False)
# Colour values of matplotlib's Tableau palette, in the palette's own order.
colors = list(mcolors.TABLEAU_COLORS.values())

In [11]:
# Inspect the first entry: a (topic id, [(term, weight), ...]) pair.
topics[0]

(0,
 [('establish', 0.08251800136023274),
  ('maintain', 0.03187356185466231),
  ('secur', 0.025804316984360667),
  ('admit', 0.01326310477301325),
  ('doctrin', 0.01266319868588817),
  ('ground', 0.011331373751518565),
  ('union', 0.010706401950446755),
  ('danger', 0.010706401950446755),
  ('mainten', 0.008651013406814866),
  ('support', 0.008525684837081214),
  ('civil', 0.008288396078385501),
  ('declar', 0.008268343507228116),
  ('privileg', 0.007823844846572765),
  ('free', 0.007760345037907714),
  ('argument', 0.007661753229717242),
  ('contend', 0.007494648470072373),
  ('exclus', 0.007360964662356478),
  ('equal', 0.006649098386269336),
  ('claim', 0.006548835530482414),
  ('institut', 0.006204599725613985),
  ('distinct', 0.00617284982128146),
  ('opinion', 0.006116034203002204),
  ('proposit', 0.006080942203476781),
  ('essenti', 0.006079271155880333),
  ('separ', 0.005958955728936027),
  ('found', 0.005922192681814156),
  ('consist', 0.005669864494750404),
  ('preserv', 0.0

In [8]:
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Colour callback with the signature WordCloud passes to ``color_func``.

    All arguments are accepted for compatibility and ignored. The lightness
    is drawn from the degenerate range [0, 0], so the result is always
    ``"hsl(0, 0%, 0%)"``; widen the range to get shades of grey.
    """
    lightness = random.randint(0, 0)
    return "hsl(0, 0%%, %d%%)" % lightness

In [10]:
def _palette_color(*args, **kwargs):
    """Return the palette entry currently selected by the global ``color_n``.

    ``colors`` and ``color_n`` are looked up at call time, so ``color_n`` must
    be assigned before the cloud is generated.
    """
    return colors[color_n]


# One reusable WordCloud: a 2500x2500 white canvas, horizontal words only.
# To draw every word with the same colour callback instead, pass
# color_func=grey_color_func.
cloud = WordCloud(
    stopwords=None,
    background_color='white',
    width=2500,
    height=2500,
    max_words=100,
    colormap='tab10',
    color_func=_palette_color,
    prefer_horizontal=1,
    max_font_size=500,
)

In [None]:
### Visualize as 10 topics in 1 pdf 
# for j in range(0,ktopics,10):
#     fig, axes = plt.subplots(5, 2, figsize=(30,30*1.2941), sharex=True, sharey=True)
#     for i, ax in enumerate(axes.flatten()):
#         fig.add_subplot(ax)
#         topic_words = dict(topics[i+j][1])
#         cloud.generate_from_frequencies(topic_words, max_font_size=300)
#         plt.gca().imshow(cloud)
#         plt.gca().set_title('Topic ' + str(i+j) + "\n", fontdict=dict(size=50))
#         plt.gca().axis('off')

#     plt.subplots_adjust(wspace=0, hspace=0)
#     plt.axis('off')
#     plt.margins(x=0, y=0)
#     plt.tight_layout()
#     plt.savefig(prefix+"/{}topic{}.pdf".format(ktopics, j))
#     plt.close('all')
    
# Topics to export, one image per topic.
# selection = list(range(ktopics))  # uncomment to render every topic
selection = [106, 122, 45, 95, 96, 102]

for position, selected in enumerate(selection):
    # The cloud's colour callback reads the global color_n while the cloud is
    # generated, so it has to be set first; cycle through the palette.
    color_n = position % len(colors)
    topic_words = dict(topics[selected][1])
    cloud.generate_from_frequencies(topic_words)

    fig, ax = plt.subplots(figsize=(3, 3), dpi=1000)
    ax.imshow(cloud)
    ax.axis('off')
    ax.margins(x=0, y=0)
    fig.tight_layout()

    jpg_path = VISUALS+"topic{}.jpg".format(selected)
    tif_path = VISUALS+"topic{}.tif".format(selected)
    fig.savefig(jpg_path)
    plt.close(fig)  # release the figure before rendering the next topic

    # Convert the temporary JPEG to TIFF. Image.open is lazy, so the JPEG must
    # still exist while it is converted; it is deleted only after the TIFF has
    # been written and the image handle has been closed.
    with Image.open(jpg_path) as jpg_image:
        jpg_image.save(tif_path)
    os.remove(jpg_path)

# ## Visualize each separately as jpgs
# color_n = -1
# for j in range(0,ktopics):
#     if color_n == 9:
#         color_n = -1
#     color_n += 1
#     topic_words = dict(topics[j][1])
#     cloud.generate_from_frequencies(topic_words, max_font_size=300)
#     plt.figure( figsize=(8,6) )
#     plt.gca().imshow(cloud)
#     plt.gca().set_title('Topic ' + str(j) + "\n", fontdict=dict(size=40))
#     plt.gca().axis('off')
#     plt.axis('off')
#     plt.margins(x=0, y=0)
#     plt.tight_layout()
#     plt.savefig(LDA+"01_visuals/"+"topic{}.jpg".format(j))
#     plt.close('all')

### Get topic thetas and entropy for each speech

Get relevant meta-data from raw (parsed) data

In [53]:
# Metadata columns to keep from the parsed speech file. `usecols` only
# selects columns: the resulting frame keeps the file's own column order.
meta_fields = ["date", "speaker", "year", "chamber", 'ndigits', "length"]
uk_coded = pd.read_csv(DVS+"uk_terms.csv", usecols=meta_fields, encoding="utf-8")

In [54]:
# Preview the first rows of the speech metadata.
uk_coded.head()

Unnamed: 0,date,speaker,chamber,year,ndigits,length
0,1803-11-22,The Speaker,lower,1803,1,22
1,1803-11-22,Lord Hawkesbury,upper,1803,0,5
2,1803-11-22,The Lord Chancellor,upper,1803,0,24
3,1803-11-22,Lord Walsingham,upper,1803,0,67
4,1803-11-22,The Earl of Limerick,upper,1803,12,698


In [55]:
# Write one CSV row per speech:
#   metadata columns, topic0 ... topic<ktopics-1>, entropy
meta_columns = ["date", "speaker", "chamber", "year", 'ndigits', "length"]
topic_columns = ["topic" + str(k) for k in range(ktopics)]

speeches = ldamallet.load_document_topics()
# Select the metadata by name so every value lands under its own header,
# whatever the column order of uk_coded happens to be.
meta_rows = uk_coded[meta_columns].itertuples(index=False, name=None)

n_written = 0
# newline='' is what the csv module requires of files it writes to.
with open(DVS+"speech_thetas{}.csv".format(ktopics), 'w', newline='') as f:
    f_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    f_writer.writerow(meta_columns + topic_columns + ["entropy"])
    for speech in speeches:
        meta = next(meta_rows, None)
        if meta is None:
            raise ValueError(
                "the model has more documents than uk_coded has rows ({})".format(len(uk_coded)))
        # Place each weight by its topic id rather than by position, so the
        # columns stay aligned even if a document lists fewer than ktopics
        # topics; absent topics count as 0 and do not change the entropy.
        thetas = [0.0] * ktopics
        for topic_id, weight in speech:
            thetas[topic_id] = weight
        row = list(meta)  # speech meta data
        row.extend(round(theta, 5) for theta in thetas)
        row.append(stats.entropy(thetas))  # computed on the unrounded weights
        f_writer.writerow(row)
        n_written += 1

# Speeches and metadata are matched purely by position, so a length mismatch
# means the rows written above cannot be trusted.
if n_written != len(uk_coded):
    raise ValueError(
        "wrote {} speeches but uk_coded has {} rows".format(n_written, len(uk_coded)))