# Modeling
ML Tasks: topic modeling of the story corpus with LDA, LSA, and NMF.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Input

In [2]:
from sklearn.datasets import load_files

# Read the documents stored under ../data/ into a scikit-learn Bunch;
# the raw texts end up in corpus.data.
corpus = load_files("../data/")

doc_count = len(corpus.data)
print("Doc count:", doc_count)
# Compare by value: `is` tests object identity, which only happens to hold for
# small ints on CPython and triggers a SyntaxWarning on Python 3.8+.
assert doc_count == 56, "Wrong number of documents loaded, should be 56 (56 stories)"

Doc count: 56


## Vectorizer

In [3]:
from tokenizer import TextWrangler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Raw term counts over stemmed tokens.
bow_stem = CountVectorizer(
    strip_accents="ascii",
    tokenizer=TextWrangler(kind="stem"),
)
X_bow_stem = bow_stem.fit_transform(corpus.data)

# TF-IDF weights over stemmed tokens, built with its own tokenizer instance.
tfidf_stem = TfidfVectorizer(
    strip_accents="ascii",
    tokenizer=TextWrangler(kind="stem"),
)
X_tfidf_stem = tfidf_stem.fit_transform(corpus.data)

[nltk_data] Downloading package punkt to ../nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ../nltk/...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ../nltk/...
[nltk_data]   Package wordnet is already up-to-date!


## Models

In [4]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

# Number of latent topics/components shared by all three decompositions.
n_topics = 5

# All three models use the same seed (23) so reruns are reproducible.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_decay=0.5,
    learning_offset=1.0,
    random_state=23,
)
lsa = TruncatedSVD(
    n_components=n_topics,
    random_state=23,
)
nmf = NMF(
    n_components=n_topics,
    solver="mu",
    beta_loss="kullback-leibler",
    alpha=0.1,
    random_state=23,
)

In [5]:
# Candidate values for the "lda" pipeline step, keyed as <step>__<parameter>.
lda_params = dict(
    lda__learning_decay=[0.5, 0.7, 0.9],
    lda__learning_offset=[1.0, 5.0, 10.0],
)

## Pipelines

In [6]:
from sklearn.pipeline import Pipeline

# LDA is fed bag-of-words counts.
lda_pipe = Pipeline(steps=[("bow", bow_stem), ("lda", lda)])

# LSA and NMF are both fed TF-IDF weights; they share the same
# tfidf_stem vectorizer object.
lsa_pipe = Pipeline(steps=[("tfidf", tfidf_stem), ("lsa", lsa)])
nmf_pipe = Pipeline(steps=[("tfidf", tfidf_stem), ("nmf", nmf)])

## Gridsearch

In [7]:
from sklearn.model_selection import GridSearchCV

# Grid search over lda_params for the LDA pipeline, using 5-fold
# cross-validation and n_jobs=-1 for parallel execution.
lda_model = GridSearchCV(
    lda_pipe,
    param_grid=lda_params,
    cv=5,
    n_jobs=-1,
)

## Training

In [8]:
# Fit each pipeline on the raw documents; every pipeline runs its own
# vectorizer step before fitting the model step. These are the plain
# pipelines with the fixed hyperparameters -- the GridSearchCV wrapper
# `lda_model` is not fitted in this cell.
lda_pipe.fit(corpus.data)
nmf_pipe.fit(corpus.data)
# Last expression of the cell, so its fitted-pipeline repr is what is displayed.
lsa_pipe.fit(corpus.data)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...', TruncatedSVD(algorithm='randomized', n_components=5, n_iter=5,
       random_state=23, tol=0.0))])

## Evaluation

In [9]:
print("LDA")
# Score the fitted LDA pipeline on the same documents it was trained on.
print(
    "Log Likelihood:",
    lda_pipe.score(corpus.data),
)

LDA
Log Likelihood: -1281669.2791995243


In [10]:
def show_topics(vectorizer, lda_model, n_words=20):
    """Return the highest-weighted vocabulary terms for each topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary. ``get_feature_names_out()`` is used when
        available (scikit-learn >= 1.0); otherwise the legacy
        ``get_feature_names()`` is called, so both old and new scikit-learn
        releases work.
    lda_model : fitted model
        Any object exposing ``components_``; despite the parameter name it
        is not restricted to LDA. Each row of ``components_`` is treated as
        the per-term weights of one topic, aligned with the vocabulary.
    n_words : int, default 20
        Maximum number of terms returned per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered by descending signed weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    keywords = np.array(get_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights turns the ascending argsort into a descending one.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [11]:
# Top 15 terms per LDA topic, looked up in the bag-of-words vocabulary.
topic_keywords = show_topics(
    vectorizer=bow_stem,
    lda_model=lda_pipe.named_steps.lda,
    n_words=15,
)

print("LDA")
# A fresh frame has integer row/column labels 0..n-1; map them straight
# into the "Topic i" / "Word j" label strings.
df_topic_keywords = pd.DataFrame(topic_keywords).rename(
    index="Topic {}".format,
    columns="Word {}".format,
)
df_topic_keywords

LDA


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,room,hand,com,fac,littl,lady,know,think,hous,way,look,wom,door,mat,tim
Topic 1,know,hand,sir,com,let,room,look,tim,door,say,fac,think,cam,good,littl
Topic 2,com,room,hand,know,fac,tim,day,cas,hous,way,look,lik,think,littl,sir
Topic 3,com,littl,hand,room,look,tim,night,know,good,think,door,hous,ask,op,cas
Topic 4,com,look,know,cas,turn,left,think,littl,young,hand,let,road,day,fac,way


In [12]:
# Top 15 terms per LSA component, looked up in the TF-IDF vocabulary.
topic_keywords = show_topics(
    vectorizer=tfidf_stem,
    lda_model=lsa_pipe.named_steps.lsa,
    n_words=15,
)

print("LSA")
# A fresh frame has integer row/column labels 0..n-1; map them straight
# into the "Topic i" / "Word j" label strings.
df_topic_keywords = pd.DataFrame(topic_keywords).rename(
    index="Topic {}".format,
    columns="Word {}".format,
)
df_topic_keywords

LSA


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,com,room,hand,know,look,littl,tim,fac,think,hous,door,let,cas,way,cam
Topic 1,lestrad,colonel,bust,cunningham,mycroft,oldacr,mccarthy,straker,mcfarlane,norwood,alec,cadog,barclay,pap,ross
Topic 2,godfrey,colonel,staunton,emswor,cunningham,barclay,armstrong,straker,hors,old,overton,sir,tregen,ralph,bennet
Topic 3,simon,robert,st,lestrad,godfrey,hors,ferguson,miss,wif,lady,lord,barclay,straker,hosm,sarah
Topic 4,hopkin,lestrad,carey,smi,bust,godfrey,pet,bicyc,stanley,mccarthy,carruth,staunton,hut,oldacr,harpoon


In [13]:
# Top 15 terms per NMF component, looked up in the TF-IDF vocabulary.
topic_keywords = show_topics(
    vectorizer=tfidf_stem,
    lda_model=nmf_pipe.named_steps.nmf,
    n_words=15,
)

print("NMF")
# A fresh frame has integer row/column labels 0..n-1; map them straight
# into the "Topic i" / "Word j" label strings.
df_topic_keywords = pd.DataFrame(topic_keywords).rename(
    index="Topic {}".format,
    columns="Word {}".format,
)
df_topic_keywords

NMF


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,wait,way,went,view,tru,NUM,voic,walk,threw,understand,us,window,ask,want,short
Topic 1,triv,revelry,retir,ring,street,venezuel,greek,buzz,return,troph,got,NUM,seen,viol,warm
Topic 2,tap,unpleas,villain,pip,vit,surfac,try,twitch,vary,team,wak,fing,rock,path,perceiv
Topic 3,understood,suggest,tabl,tobacco,tig,undertak,swung,try,sound,urg,tin,story,cheek,trembl,brok
Topic 4,twic,trick,tragedy,waddl,string,saw,ut,suppos,unquest,westminst,pol,turn,instinct,struck,rocky


In [14]:
import pyLDAvis
from pyLDAvis.sklearn import prepare
# Switch pyLDAvis to inline notebook rendering before building any view.
pyLDAvis.enable_notebook()

# Interactive view of the fitted LDA step, built from the bag-of-words
# matrix and its vectorizer; topics are laid out with t-SNE.
prepare(
    lda_pipe.named_steps.lda,
    X_bow_stem,
    bow_stem,
    mds="tsne",
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [15]:
# Same view for the fitted NMF step, built from the TF-IDF matrix.
# NOTE(review): the recorded output shows warnings raised around np.log calls
# inside pyLDAvis for this input -- confirm the view is meaningful for NMF.
prepare(
    nmf_pipe.named_steps.nmf,
    X_tfidf_stem,
    tfidf_stem,
    mds="tsne",
)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
