# Modeling
ML Tasks

In [118]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Input

In [119]:
from sklearn.datasets import load_files

# Load the raw story files from the data directory.
corpus = load_files("../data/")

doc_count = len(corpus.data)
print("Doc count:", doc_count)
# Compare by value: `is` checks object identity, which only happens to hold for
# small ints in CPython and triggers a SyntaxWarning on Python >= 3.8.
assert doc_count == 56, "Wrong number of documents loaded, should be 56 (56 stories)"

Doc count: 56


## Vectorizer

In [120]:
from tokenizer import TextWrangler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Raw term counts, tokenized by the project's TextWrangler in "stem" mode.
bow_stem = CountVectorizer(
    tokenizer=TextWrangler(kind="stem"),
    strip_accents="ascii",
)
X_bow_stem = bow_stem.fit_transform(corpus.data)

# Tf-idf weights built with the same tokenizer settings (separate tokenizer instance).
tfidf_stem = TfidfVectorizer(
    tokenizer=TextWrangler(kind="stem"),
    strip_accents="ascii",
)
X_tfidf_stem = tfidf_stem.fit_transform(corpus.data)

## Models

In [121]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

# Number of components shared by all three topic models.
n_topics = 5

lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_decay=0.5,
    learning_offset=1.,
    random_state=23,
)

lsa = TruncatedSVD(n_components=n_topics, random_state=23)

# NOTE(review): `alpha` is presumably only accepted by older scikit-learn
# releases -- confirm against the pinned version before upgrading.
nmf = NMF(
    n_components=n_topics,
    solver="mu",
    beta_loss="kullback-leibler",
    alpha=0.1,
    random_state=23,
)

In [122]:
# Search grid for the "lda" pipeline step (keys use the <step>__<param> form).
lda_params = dict(
    lda__learning_decay=[0.5, 0.7, 0.9],
    lda__learning_offset=[1., 5., 10.],
)

## Pipelines

In [123]:
from sklearn.pipeline import Pipeline

# LDA runs on the bag-of-words counts.
lda_pipe = Pipeline(steps=[("bow", bow_stem), ("lda", lda)])

# LSA and NMF both sit on top of the same tfidf_stem vectorizer instance.
lsa_pipe = Pipeline(steps=[("tfidf", tfidf_stem), ("lsa", lsa)])

nmf_pipe = Pipeline(steps=[("tfidf", tfidf_stem), ("nmf", nmf)])

## Gridsearch

In [124]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over the LDA parameters, using all available cores.
lda_model = GridSearchCV(
    estimator=lda_pipe,
    param_grid=lda_params,
    n_jobs=-1,
    cv=5,
)
# The search itself is currently disabled:
#lda_model.fit(corpus.data)
#lda_model.best_params_

## Training

In [125]:
# Fit all three pipelines on the raw documents. nmf_pipe and lsa_pipe share the
# same tfidf_stem instance, so that vectorizer is re-fitted by each of the two calls.
lda_pipe.fit(corpus.data)
nmf_pipe.fit(corpus.data)
# Last expression of the cell: its return value is what the cell displays.
lsa_pipe.fit(corpus.data)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...', TruncatedSVD(algorithm='randomized', n_components=5, n_iter=5,
       random_state=23, tol=0.0))])

## Evaluation

In [126]:
# Score the LDA pipeline on the same documents it was fitted on (no held-out data).
print("LDA")
print("Log Likelihood:", lda_pipe.score(corpus.data))

LDA
Log Likelihood: -1281860.968742299


### Visual Inspection

In [127]:
def df_topic_model(vectorizer, model, n_words=20):
    """Build a table of the highest-weighted words for every topic of a model.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (newer scikit-learn) or the
        legacy ``get_feature_names()``; the returned names must line up with
        the columns of ``model.components_``.
    model : fitted decomposition model
        Must expose ``components_`` with one row of word weights per topic.
    n_words : int, default 20
        Number of top words to keep per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic (index ``"Topic i"``), one column per rank
        (``"Word j"``), ordered by descending weight.
    """
    # Prefer the current accessor; get_feature_names() no longer exists in
    # recent scikit-learn releases, so only fall back to it on old versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    keywords = np.asarray(feature_names)

    topic_keywords = []
    for topic_weights in model.components_:
        # Negate so that argsort yields indices in descending weight order.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))

    df_topic_keywords = pd.DataFrame(topic_keywords)
    n_rows, n_cols = df_topic_keywords.shape
    df_topic_keywords.columns = ['Word ' + str(i) for i in range(n_cols)]
    df_topic_keywords.index = ['Topic ' + str(i) for i in range(n_rows)]

    return df_topic_keywords

In [128]:
# Top 15 words per LDA topic, using the vocabulary of the bag-of-words vectorizer.
print("LDA")
df_topic_model(vectorizer=bow_stem, model=lda_pipe.named_steps.lda, n_words=15)

LDA


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,room,com,fac,hand,hous,know,littl,wom,think,look,way,lady,say,door,mat
Topic 1,hand,know,com,let,room,look,tim,sir,think,littl,say,fac,door,cas,cam
Topic 2,com,room,know,hand,fac,tim,day,cas,way,look,think,door,littl,window,sir
Topic 3,com,hand,room,littl,look,tim,night,hous,op,good,door,know,think,old,cam
Topic 4,com,look,know,cas,turn,left,young,road,littl,think,day,let,hand,fac,shal


In [129]:
# Top 15 words per LSA component, using the vocabulary of the tf-idf vectorizer.
print("LSA")
df_topic_model(vectorizer=tfidf_stem, model=lsa_pipe.named_steps.lsa, n_words=15)

LSA


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,com,room,hand,know,look,littl,tim,fac,think,hous,door,let,cas,way,cam
Topic 1,lestrad,bust,oldacr,mccarthy,mcfarlane,norwood,mycroft,sarah,napoleon,jona,pearl,moriarty,beppo,hark,pap
Topic 2,godfrey,colonel,staunton,straker,cunningham,hors,emswor,stabl,moor,armstrong,overton,bicyc,jam,ross,alec
Topic 3,hopkin,trev,blessington,carey,stanley,hut,room,smi,harpoon,captain,tregen,pet,stackhurst,ship,moriarty
Topic 4,robert,godfrey,lestrad,ferguson,staunton,mccarthy,lady,shoscomb,emswor,rond,mason,leonardo,coffin,crypt,hopkin


In [130]:
# Top 15 words per NMF component, using the vocabulary of the tf-idf vectorizer.
print("NMF")
df_topic_model(vectorizer=tfidf_stem, model=nmf_pipe.named_steps.nmf, n_words=15)

NMF


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,wast,wait,think,turn,us,valu,thought,watch,act,whisp,thing,adv,ask,way,vent
Topic 1,tap,vast,wednesday,veng,shav,untidy,shal,train,unexpect,unwieldy,incid,track,valet,van,ush
Topic 2,mov,vein,tast,thirteen,wish,sunk,unfold,upward,parl,leav,journey,priv,victor,porch,unsight
Topic 3,scyll,tumbl,upset,arm,stat,suff,passeng,sur,tre,test,account,tawny,reach,hang,bind
Topic 4,read,shook,ut,tackl,til,tru,vehic,tel,triumph,arm,villa,jaw,baronet,unev,moth


In [131]:
import pyLDAvis
from pyLDAvis.sklearn import prepare
# Turn on pyLDAvis' notebook mode for this session.
pyLDAvis.enable_notebook()

# Interactive LDA visualisation; currently disabled.
#prepare(lda_pipe.named_steps.lda, X_bow_stem, bow_stem, mds="tsne")

In [132]:
#prepare(nmf_pipe.named_steps.nmf, X_tfidf_stem, tfidf_stem, mds="tsne")

#### Conclusion:

Topic models derived from the different approaches look dissimilar. The top-word distribution of NMF appears most 
meaningful, mostly because its topics don't share the same words (due to the NMF algorithm). The LSA topic model is 
more interpretable than its LDA counterpart. Nonetheless, the topics from both are hard to distinguish and 
don't make much sense. Therefore I'll go with the NMF topic model for the assignment to novel collections
step.

### Jaccard Index

In [76]:
# Top-10 word tables per model; these are the inputs to the Jaccard comparison.
df_topic_word_lda = df_topic_model(vectorizer=bow_stem, model=lda_pipe.named_steps.lda, n_words=10)
df_topic_word_lsa = df_topic_model(vectorizer=tfidf_stem, model=lsa_pipe.named_steps.lsa, n_words=10)
df_topic_word_nmf = df_topic_model(vectorizer=tfidf_stem, model=nmf_pipe.named_steps.nmf, n_words=10)

In [77]:
def jaccard_index(list1, list2):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Parameters
    ----------
    list1, list2 : iterable of hashable items

    Returns
    -------
    float
        Value in [0, 1]. Two empty inputs are treated as identical and yield
        1.0 instead of raising ZeroDivisionError.
    """
    s1, s2 = set(list1), set(list2)
    union = s1 | s2
    if not union:
        # Both sets are empty: the ratio is 0/0, so fall back to the usual
        # convention that identical (empty) sets have similarity 1.
        return 1.0
    return len(s1 & s2) / len(union)

In [95]:
# Pairwise Jaccard similarity of the top-word lists, compared topic by topic
# (topic i of one model against topic i of the other).
sims_lda_lsa, sims_lda_nmf, sims_lsa_nmf = {}, {}, {}
assert df_topic_word_lda.shape[0] == df_topic_word_lsa.shape[0] == df_topic_word_nmf.shape[0], "n_topics mismatch"

# Iterate over the topic labels only; the row contents are looked up per
# frame, so there is no need to materialise rows via iterrows().
for ix in df_topic_word_lda.index:
    l1 = df_topic_word_lda.loc[ix].tolist()
    l2 = df_topic_word_lsa.loc[ix].tolist()
    l3 = df_topic_word_nmf.loc[ix].tolist()
    sims_lda_lsa[ix] = jaccard_index(l1, l2)
    sims_lda_nmf[ix] = jaccard_index(l1, l3)
    sims_lsa_nmf[ix] = jaccard_index(l2, l3)

df_jaccard_sims = pd.DataFrame(
    [sims_lda_lsa, sims_lda_nmf, sims_lsa_nmf],
    index=["LDA vs LSA", "LDA vs NMF", "LSA vs NMF"],
)
# Row-wise mean over the per-topic similarities.
df_jaccard_sims["mean_sim"] = df_jaccard_sims.mean(axis=1)
df_jaccard_sims

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,mean_sim
LDA vs LSA,0.818182,0.0,0.0,0.052632,0.0,0.174163
LDA vs NMF,0.052632,0.0,0.0,0.0,0.0,0.010526
LSA vs NMF,0.052632,0.0,0.0,0.0,0.0,0.010526


#### Conclusion:

Topics derived from different topic modeling approaches are fundamentally dissimilar.

# Document-topic Assignment

In [21]:
# Transform every document with the fitted LDA and NMF pipelines.
lda_topic_distr = lda_pipe.transform(corpus.data)
nmf_topic_distr = nmf_pipe.transform(corpus.data)

In [22]:
# Inspect the raw NMF output of the transform step.
nmf_topic_distr

array([[1.96144039e-01, 1.26620442e-07, 4.27265265e-08, 2.58301191e-01,
        1.78068220e-02],
       [1.66740024e-01, 4.19534126e-05, 1.93780594e-01, 2.92664535e-02,
        1.19632393e-02],
       [1.68366019e-01, 4.13647446e-03, 1.09076591e-03, 2.48256130e-01,
        1.32306630e-05],
       [2.66386808e-01, 3.26620556e-06, 2.54327735e-07, 1.67940838e-07,
        1.49194275e-02],
       [2.53620514e-01, 5.09314790e-02, 4.38230793e-05, 5.23588633e-05,
        1.42575286e-06],
       [1.70028924e-01, 2.70114838e-01, 1.46454425e-03, 1.54537746e-03,
        2.65057592e-02],
       [1.60147139e-01, 1.86420237e-04, 1.99676856e-01, 5.87836278e-03,
        9.32152717e-03],
       [1.51833371e-01, 3.55915194e-03, 1.82975246e-06, 2.62907637e-01,
        5.92613029e-03],
       [2.38330398e-01, 2.05391070e-03, 1.80676501e-02, 5.25527255e-02,
        9.20898027e-05],
       [3.33639233e-01, 4.52201701e-03, 1.10524419e-03, 5.88707306e-03,
        2.23604666e-02],
       [2.12848687e-01, 2.3812

In [18]:
# Integer label -> collection name.
collections_map = dict(enumerate([
    "His_Last_Bow",
    "The_Adventures_of_Sherlock_Holmes",
    "The_Case-Book_of_Sherlock_Holmes",
    "The_Memoirs_of_Sherlock_Holmes",
    "The_Return_of_Sherlock_Holmes",
]))
# Same mapping under a second name, kept as an independent dict object.
novel_collections_map = dict(collections_map)

In [24]:
topics = ["Topic" + str(i) for i in range(n_topics)]
# Row labels: file name without directory and without anything after the first dot.
docs = [f_name.split("/")[-1].split(".")[0]
        for f_name in corpus.filenames]

# Weights are rounded to 3 decimals for display only.
df_document_topic = pd.DataFrame(np.round(nmf_topic_distr, 3), columns=topics, index=docs)
# Pick the dominant topic from the unrounded weights: after rounding, two
# topics can tie and argmax would then silently return the first one. Using
# the raw array also keeps this independent of which columns the frame holds.
df_document_topic["assigned_topic"] = np.argmax(nmf_topic_distr, axis=1)
# Translate the integer targets into collection names (unknown labels pass through).
df_document_topic["collection"] = [collections_map.get(item, item) for item in corpus.target]

df_document_topic.sort_values("assigned_topic").loc[:, ["collection", "assigned_topic"]]

Unnamed: 0,collection,assigned_topic
THE_ADVENTURE_OF_CHARLES_AUGUSTUS_MILVERTON,The_Return_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_THE_THREE_STUDENTS,The_Return_of_Sherlock_Holmes,0
THE_NAVAL_TREATY,The_Memoirs_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_THE_COPPER_BEECHES,The_Adventures_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_THE_RED_CIRCLE,His_Last_Bow,0
THE_ADVENTURE_OF_THE_SPECKLED_BAND,The_Adventures_of_Sherlock_Holmes,0
THE_YELLOW_FACE,The_Memoirs_of_Sherlock_Holmes,0
THE_MAN_WITH_THE_TWISTED_LIP,The_Adventures_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_THE_RETIRED_COLOURMAN,The_Case-Book_of_Sherlock_Holmes,0
THE_ADVENTURE_OF_THE_THREE_GARRIDEBS,The_Case-Book_of_Sherlock_Holmes,0
