# Modeling
ML Tasks

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Input

In [2]:
from sklearn.datasets import load_files

# Load the corpus from ../data/; the document texts are read from corpus.data below.
corpus = load_files("../data/")

doc_count = len(corpus.data)
print("Doc count:", doc_count)
# Compare by value. `is` checks object identity, which only happens to hold for
# small integers in CPython and triggers a SyntaxWarning on Python 3.8+.
assert doc_count == 56, "Wrong number of documents loaded, should be 56 (56 stories)"

Doc count: 56


## Vectorizer

In [3]:
from tokenizer import TextWrangler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Term-count matrix, tokenised by a TextWrangler configured with kind="stem".
bow_stem = CountVectorizer(
    strip_accents="ascii",
    tokenizer=TextWrangler(kind="stem"),
)
X_bow_stem = bow_stem.fit_transform(corpus.data)

# TF-IDF matrix built with the same settings but its own TextWrangler instance.
tfidf_stem = TfidfVectorizer(
    strip_accents="ascii",
    tokenizer=TextWrangler(kind="stem"),
)
X_tfidf_stem = tfidf_stem.fit_transform(corpus.data)

[nltk_data] Downloading package punkt to ../nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ../nltk/...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ../nltk/...
[nltk_data]   Package wordnet is already up-to-date!


## Models

In [4]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

# Number of components shared by all three models below.
n_topics = 5

lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_decay=0.5,
    learning_offset=1.,
    random_state=23,
)

lsa = TruncatedSVD(
    n_components=n_topics,
    random_state=23,
)

# NOTE(review): the `alpha` keyword is presumably tied to the scikit-learn
# version this notebook was written against -- confirm it is still accepted
# by the installed release before re-running.
nmf = NMF(
    n_components=n_topics,
    solver="mu",
    beta_loss="kullback-leibler",
    alpha=0.1,
    random_state=23,
)

In [5]:
# Hyper-parameter grid for the LDA step; the "lda__" prefix targets the
# pipeline step that is registered under the name "lda".
lda_params = dict(
    lda__learning_decay=[0.5, 0.7, 0.9],
    lda__learning_offset=[1., 5., 10.],
)

## Pipelines

In [6]:
from sklearn.pipeline import Pipeline

# Every pipeline is one vectorizer step followed by one topic-model step.
lda_pipe = Pipeline([("bow", bow_stem), ("lda", lda)])

# Both TF-IDF pipelines reference the very same `tfidf_stem` object.
lsa_pipe = Pipeline([("tfidf", tfidf_stem), ("lsa", lsa)])
nmf_pipe = Pipeline([("tfidf", tfidf_stem), ("nmf", nmf)])

## Gridsearch

In [7]:
from sklearn.model_selection import GridSearchCV

# Grid search over the LDA pipeline with the `lda_params` grid (cv=5, n_jobs=-1).
lda_model = GridSearchCV(
    lda_pipe,
    param_grid=lda_params,
    cv=5,
    n_jobs=-1,
)
# The search itself is currently disabled:
#lda_model.fit(corpus.data)
#lda_model.best_params_

## Training

In [8]:
# Fit the three pipelines on the raw documents. lsa_pipe is fitted last, so its
# return value (the fitted pipeline) is what this cell displays.
lda_pipe.fit(corpus.data)
nmf_pipe.fit(corpus.data)
lsa_pipe.fit(corpus.data)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...', TruncatedSVD(algorithm='randomized', n_components=5, n_iter=5,
       random_state=23, tol=0.0))])

## Evaluation

In [9]:
# Score the fitted LDA pipeline on the same documents it was trained on; the
# value is labelled as the log likelihood in the printed output.
print("LDA")
print("Log Likelihood:", lda_pipe.score(corpus.data))

LDA
Log Likelihood: -1281860.968742299


In [10]:
def show_topics(vectorizer, model, n_words=20):
    """Return the highest-weighted vocabulary terms for every topic of a model.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``; the returned names are indexed by the
        positions found in ``model.components_``.
    model : fitted decomposition model
        Must expose ``components_``, iterated row by row (one row per topic).
    n_words : int, default 20
        Maximum number of terms returned per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per row of ``model.components_``, ordered from the
        largest weight to the smallest.
    """
    # scikit-learn replaced `get_feature_names` with `get_feature_names_out`;
    # prefer the new accessor and fall back so older installations keep working.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    keywords = np.asarray(get_names())

    # Negating the weights turns the ascending argsort into a descending one.
    return [
        keywords.take((-topic_weights).argsort()[:n_words])
        for topic_weights in model.components_
    ]

In [11]:
# Top 15 terms per LDA topic, taken from the bag-of-words vocabulary.
topic_keywords = show_topics(
    vectorizer=bow_stem,
    model=lda_pipe.named_steps["lda"],
    n_words=15,
)

print("LDA")
df_topic_keywords = pd.DataFrame(topic_keywords)
# Label columns by rank position and rows by topic number.
df_topic_keywords.columns = [f"Word {pos}" for pos in range(len(df_topic_keywords.columns))]
df_topic_keywords.index = [f"Topic {pos}" for pos in range(len(df_topic_keywords.index))]
df_topic_keywords

LDA


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,room,com,fac,hand,hous,know,littl,wom,think,look,way,lady,say,door,mat
Topic 1,hand,know,com,let,room,look,tim,sir,think,littl,say,fac,door,cas,cam
Topic 2,com,room,know,hand,fac,tim,day,cas,way,look,think,door,littl,window,sir
Topic 3,com,hand,room,littl,look,tim,night,hous,op,good,door,know,think,old,cam
Topic 4,com,look,know,cas,turn,left,young,road,littl,think,day,let,hand,fac,shal


In [12]:
# Top 15 terms per LSA component, taken from the TF-IDF vocabulary.
topic_keywords = show_topics(
    vectorizer=tfidf_stem,
    model=lsa_pipe.named_steps["lsa"],
    n_words=15,
)

print("LSA")
df_topic_keywords = pd.DataFrame(topic_keywords)
# Label columns by rank position and rows by topic number.
df_topic_keywords.columns = [f"Word {pos}" for pos in range(len(df_topic_keywords.columns))]
df_topic_keywords.index = [f"Topic {pos}" for pos in range(len(df_topic_keywords.index))]
df_topic_keywords

LSA


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,com,room,hand,know,look,littl,tim,fac,think,hous,door,let,cas,way,cam
Topic 1,lestrad,bust,oldacr,mccarthy,mcfarlane,norwood,mycroft,sarah,napoleon,jona,pearl,moriarty,beppo,hark,pap
Topic 2,godfrey,colonel,staunton,straker,cunningham,hors,emswor,stabl,moor,armstrong,overton,bicyc,jam,ross,alec
Topic 3,hopkin,trev,blessington,carey,stanley,hut,room,smi,harpoon,captain,tregen,pet,stackhurst,ship,moriarty
Topic 4,robert,godfrey,lestrad,ferguson,staunton,mccarthy,lady,shoscomb,emswor,rond,mason,leonardo,coffin,crypt,hopkin


In [13]:
# Top 15 terms per NMF component, taken from the TF-IDF vocabulary.
topic_keywords = show_topics(
    vectorizer=tfidf_stem,
    model=nmf_pipe.named_steps["nmf"],
    n_words=15,
)

print("NMF")
df_topic_keywords = pd.DataFrame(topic_keywords)
# Label columns by rank position and rows by topic number.
df_topic_keywords.columns = [f"Word {pos}" for pos in range(len(df_topic_keywords.columns))]
df_topic_keywords.index = [f"Topic {pos}" for pos in range(len(df_topic_keywords.index))]
df_topic_keywords

NMF


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,wast,wait,think,turn,us,valu,thought,watch,act,whisp,thing,adv,ask,way,vent
Topic 1,tap,vast,wednesday,veng,shav,untidy,shal,train,unexpect,unwieldy,incid,track,valet,van,ush
Topic 2,mov,vein,tast,thirteen,wish,sunk,unfold,upward,parl,leav,journey,priv,victor,porch,unsight
Topic 3,scyll,tumbl,upset,arm,stat,suff,passeng,sur,tre,test,account,tawny,reach,hang,bind
Topic 4,read,shook,ut,tackl,til,tru,vehic,tel,triumph,arm,villa,jaw,baronet,unev,moth


In [14]:
import pyLDAvis
# NOTE(review): the location of the scikit-learn helper may differ between
# pyLDAvis releases -- confirm this import path against the installed version.
from pyLDAvis.sklearn import prepare
# Presumably switches pyLDAvis to inline rendering inside the notebook.
pyLDAvis.enable_notebook()

# Interactive view of the LDA model on the bag-of-words matrix (currently disabled):
#prepare(lda_pipe.named_steps.lda, X_bow_stem, bow_stem, mds="tsne")

In [15]:
#prepare(nmf_pipe.named_steps.nmf, X_tfidf_stem, tfidf_stem, mds="tsne")

# Document-topic Assignment

In [16]:
# Topic weights produced by the fitted LDA pipeline; used further down as one
# row per document and one column per topic.
lda_distr = lda_pipe.transform(corpus.data)

In [17]:
# Preview only the first rows instead of dumping the whole array into the
# notebook output; the labelled per-document table is built further down.
lda_distr[:10]

array([[9.99750897e-01, 6.24027959e-05, 6.23210092e-05, 6.23195454e-05,
        6.20595651e-05],
       [2.15789483e-02, 8.15893182e-05, 9.78177688e-01, 8.10181676e-05,
        8.07565057e-05],
       [8.48430034e-05, 9.42662150e-02, 9.01181750e-01, 4.38267345e-03,
        8.45181367e-05],
       [6.36455861e-02, 9.36064973e-01, 9.66887851e-05, 9.65898611e-05,
        9.61621712e-05],
       [7.12174022e-05, 7.12858436e-05, 9.99715174e-01, 7.13817199e-05,
        7.09413246e-05],
       [6.68479836e-05, 9.99732667e-01, 6.70432456e-05, 6.68509891e-05,
        6.65902914e-05],
       [5.71232529e-05, 5.71928254e-05, 5.71448495e-05, 9.99771474e-01,
        5.70650305e-05],
       [6.76312688e-05, 6.77478591e-05, 9.99729559e-01, 6.76142762e-05,
        6.74474178e-05],
       [5.90912651e-05, 9.99763840e-01, 5.90772195e-05, 5.91354184e-05,
        5.88564580e-05],
       [6.15578817e-01, 3.84248919e-01, 5.73992894e-05, 5.75645793e-05,
        5.72998807e-05],
       [1.00572218e-04, 9.9959

In [18]:
# Maps the integer labels found in corpus.target to story-collection names.
collections_map = dict(enumerate([
    "His_Last_Bow",
    "The_Adventures_of_Sherlock_Holmes",
    "The_Case-Book_of_Sherlock_Holmes",
    "The_Memoirs_of_Sherlock_Holmes",
    "The_Return_of_Sherlock_Holmes",
]))

In [19]:
import os

# Column labels for the per-topic weights.
topics = ["Topic" + str(i) for i in range(n_topics)]

# Document label = file name without directory and without anything after the
# first dot. os.path.basename honours the platform's path separator, whereas
# splitting on "/" only works for POSIX-style paths.
docs = [os.path.basename(f_name).split(".")[0]
        for f_name in corpus.filenames]

df_document_topic = pd.DataFrame(np.round(lda_distr, 3), columns=topics, index=docs)
# Restrict the argmax to the topic columns so the result does not depend on
# which extra columns the frame carries.
df_document_topic["assigned_topic"] = np.argmax(df_document_topic[topics].values, axis=1)
# Unknown labels fall back to the raw label value.
df_document_topic["collection"] = [collections_map.get(item, item) for item in corpus.target]

df_document_topic.sort_values("assigned_topic").loc[:, ["collection", "assigned_topic"]]

Unnamed: 0,collection,assigned_topic
THE_ADVENTURE_OF_THE_ABBEY_GRANGE,The_Return_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_THE_RED_CIRCLE,His_Last_Bow,0
THE_ADVENTURE_OF_THE_LION'S_MANE,The_Case-Book_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_THE_VEILED_LODGER,The_Case-Book_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_THE_BERYL_CORONET,The_Adventures_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_THE_SUSSEX_VAMPIRE,The_Case-Book_of_Sherlock_Holmes,0
THE_ILLUSTRIOUS_CLIENT,The_Case-Book_of_Sherlock_Holmes,0
THE_YELLOW_FACE,The_Memoirs_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_WISTERIA_LODGE,His_Last_Bow,0
THE_ADVENTURE_OF_THE_COPPER_BEECHES,The_Adventures_of_Sherlock_Holmes,0
