# pyLDAviz per sentence - Aniket

In [35]:
import re
import pandas as pd
from collections import defaultdict
import operator
from pprint import pprint

# Set Pandas to display all rows of dataframes
pd.set_option('display.max_rows', 500)

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import LdaMulticore

# spacy for lemmatization
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")
from spacy import displacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from tqdm import tqdm_notebook

In [16]:
tttc_text = open('../DATA/tttc_scrape.txt', 'r').read()

In [17]:
tttc_text

'The Things They Carried First Lieutenant Jimmy Cross carried letters from a girl named Martha, a junior at Mount Sebastian College in New Jersey. They were not love letters, but Lieutenant Cross was hoping, so he kept them folded in plastic at the bottom of his rucksack. In the late afternoon, after a day\'s march, he would dig his foxhole, wash his hands under a canteen, unwrap the letters, hold them with the tips of his fingers, and spend the last hour of light pretending. He would imagine romantic camping trips into the White Mountains in New Hampshire. He would sometimes taste the envelope flaps, knowing her tongue had been there. More than anything, he wanted Martha to love him as he loved her, but the letters were mostly chatty, elusive on the matter of love. She was a virgin, he was almost sure. She was an English major at Mount Sebastian, and she wrote beautifully about her professors and roommates and midterm exams, about her respect for Chaucer and her great affection for Vi

In [18]:
tttc_doc = nlp(tttc_text)

In [19]:
for token in tttc_doc[0:4]:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

The the DET DT det Xxx True True
Things thing NOUN NNS nsubj Xxxxx True False
They -PRON- PRON PRP nsubj Xxxx True True
Carried carry VERB VBD relcl Xxxxx True False


In [20]:
displacy.render(tttc_doc, style="ent")

# LDA pipeline

In [30]:
stop_list = []

# Updates spaCy's default stop words list with TTTC-specific additional words. 
nlp.Defaults.stop_words.update(stop_list)

for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [31]:
def lemmatizer(doc):
    # This takes in a doc of tokens from the NER and lemmatizes them. 
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)
    
def remove_stopwords(doc):
    # Use token.text to return strings -> Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [32]:
doc_list = []
# Iterates through each article in the corpus.
for doc in tqdm_notebook(tttc_text):
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)

HBox(children=(IntProgress(value=0, max=349273), HTML(value='')))




In [49]:
doc_list

[['t'],
 ['h'],
 ['e'],
 [' '],
 ['t'],
 ['h'],
 [],
 ['n'],
 ['g'],
 ['s'],
 [' '],
 ['t'],
 ['h'],
 ['e'],
 ['y'],
 [' '],
 ['C'],
 [],
 ['r'],
 ['r'],
 [],
 ['e'],
 ['d'],
 [' '],
 ['F'],
 [],
 ['r'],
 ['s'],
 ['t'],
 [' '],
 ['l'],
 [],
 ['e'],
 ['u'],
 ['t'],
 ['e'],
 ['n'],
 [],
 ['n'],
 ['t'],
 [' '],
 ['J'],
 [],
 ['m'],
 ['m'],
 ['y'],
 [' '],
 ['C'],
 ['r'],
 ['o'],
 ['s'],
 ['s'],
 [' '],
 ['c'],
 [],
 ['r'],
 ['r'],
 [],
 ['e'],
 ['d'],
 [' '],
 ['l'],
 ['e'],
 ['t'],
 ['t'],
 ['e'],
 ['r'],
 ['s'],
 [' '],
 ['f'],
 ['r'],
 ['o'],
 ['m'],
 [' '],
 [],
 [' '],
 ['g'],
 [],
 ['r'],
 ['l'],
 [' '],
 ['n'],
 [],
 ['m'],
 ['e'],
 ['d'],
 [' '],
 ['M'],
 [],
 ['r'],
 ['t'],
 ['h'],
 [],
 [],
 [' '],
 [],
 [' '],
 ['j'],
 ['u'],
 ['n'],
 [],
 ['o'],
 ['r'],
 [' '],
 [],
 ['t'],
 [' '],
 ['M'],
 ['o'],
 ['u'],
 ['n'],
 ['t'],
 [' '],
 ['S'],
 ['e'],
 ['b'],
 [],
 ['s'],
 ['t'],
 [],
 [],
 ['n'],
 [' '],
 ['C'],
 ['o'],
 ['l'],
 ['l'],
 ['e'],
 ['g'],
 ['e'],
 [' '],
 [],
 ['n'],
 [

In [33]:
# Creates, which is a mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [39]:
lda_model = LdaMulticore(corpus=corpus,
                       id2word=words,
                       num_topics=10, 
                       random_state=42,
                       passes=50,
                       per_word_topics=True)

In [41]:
from gensim.test.utils import datapath

# Save model to disk.
temp_file = datapath("model_LDA_Multicore_per_Sentence")
lda_model.save(temp_file)

In [44]:
import pyLDAvis.gensim

preparation = pyLDAvis.gensim.prepare(lda_model, corpus, words)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [45]:
pyLDAvis.display(preparation)