# Modeling
ML Tasks

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Input

In [2]:
from sklearn.datasets import load_files

corpus = load_files("../data/")

doc_count = len(corpus.data)
print("Doc count:", doc_count)
assert doc_count is 60, "Wrong number of documents loaded, should be 60 (4 novels + 56 stories)"

Doc count: 60


## Vectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizer import TextWrangler

tfidf_stem = TfidfVectorizer(strip_accents="ascii", tokenizer=TextWrangler(kind="stem"))
tfidf_stem.fit(corpus.data)
X_tfidf_stem = tfidf_stem.transform(corpus.data)

[nltk_data] Downloading package punkt to ../nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to ../nltk/...
[nltk_data]   Package wordnet is already up-to-date!


## Models

In [4]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

n_topics = 15

lda = LatentDirichletAllocation(n_components=n_topics)
lsa = TruncatedSVD()
nmf = NMF()

In [5]:
lda_params = {"lda__learning_decay": [0.7, 0.9]}


## Pipeline

In [6]:
from sklearn.pipeline import Pipeline

lda_pipe = Pipeline([
    ("tfidf", tfidf_stem),
    ("lda", lda)
])

## Gridsearch

In [7]:
from sklearn.model_selection import GridSearchCV

lda_model = GridSearchCV(lda_pipe, param_grid=lda_params, cv=5, n_jobs=-1)

## Training

In [8]:
lda_model.fit(corpus.data)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...        random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'lda__learning_decay': [0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [9]:
print(lda_model.best_params_)
lda_model.best_score_

{'lda__learning_decay': 0.9}


-5538.667962969713

## Evaluation

In [10]:
#print("Log Likelihood:", model.score(corpus.data))

In [12]:
def show_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=tfidf_stem, lda_model=lda_model.best_estimator_.named_steps.lda, n_words=15)        

df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,godfrey,bennet,ferguson,staunton,coronet,emswor,armstrong,presbury,overton,vampir,ralph,dod,dol,burnwel,cambridg
Topic 1,oldacr,baskervil,mcfarlane,moor,morst,norwood,stapleton,sholto,adair,barrym,jon,jona,ronald,thadde,mortim
Topic 2,majesty,briony,adl,norton,serpentin,rocket,godfrey,landau,guardsm,monic,warsaw,ormstein,kram,edgew,waylaid
Topic 3,amberley,josiah,ernest,lewisham,purlington,chess,mackinnon,futil,colourm,crockford,haymarket,disus,indel,someth,ingratitud
Topic 4,milverton,bayn,stackhurst,mcpherson,rond,leonardo,cubit,garc,hilton,murdoch,lion,merrilow,cag,hydra,elsy
Topic 5,bork,mccarthy,tregen,gibson,von,hosm,windibank,dunb,angel,mortim,boscomb,sternd,typewrit,shlessinger,lausan
Topic 6,hopkin,cunningham,carey,stanley,alec,acton,harpoon,smi,hut,willoughby,coram,nelig,yoxley,cabin,chatham
Topic 7,carruth,mela,woodley,openshaw,mycroft,charlington,greek,horsham,williamson,interpret,munro,smi,farnham,norbury,latim
Topic 8,said,man,com,hand,room,know,look,littl,tim,fac,did,think,hous,door,way
Topic 9,blessington,clair,nevil,lasc,brook,whitney,swandam,boon,bradstreet,is,kat,catalepsy,hugh,ced,wharf


In [13]:
import pyLDAvis
from pyLDAvis.sklearn import prepare
pyLDAvis.enable_notebook()

prepare(lda_model.best_estimator_.named_steps.lda, X_tfidf_stem, tfidf_stem, mds="tsne")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
