# Modeling
ML Tasks

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Input

In [2]:
from sklearn.datasets import load_files

corpus = load_files("../data/")

doc_count = len(corpus.data)
print("Doc count:", doc_count)
assert doc_count is 60, "Wrong number of documents loaded, should be 60 (4 novels + 56 stories)"

Doc count: 60


## Vectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizer import TextWrangler

tfidf_stem = TfidfVectorizer(strip_accents="ascii", tokenizer=TextWrangler(kind="stem"))
tfidf_stem.fit(corpus)

tfidf_stem.get_feature_names()

[nltk_data] Downloading package punkt to nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to nltk/...
[nltk_data]   Package wordnet is already up-to-date!


['dat', 'descr', 'filenam', 'target', 'target_names']

## Models

In [4]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

n_topics = 20 #FIXME GridSearch

lda = LatentDirichletAllocation(n_components=n_topics)
lsa = TruncatedSVD(n_components=n_topics)
nmf = NMF(n_components=n_topics)

## Pipeline

In [5]:
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("tfidf", tfidf_stem),
    ("model", lda)
])

## Training

In [6]:
model.fit(corpus.data)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0))])

## Evaluation

In [14]:
print("Log Likelelihood:", model.score(corpus.data))

Log Likelelihood: -146556.86892199639


In [12]:
# Inspect topics
names = tfidf_stem.get_feature_names()

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

print_top_words(lda, names, 10)

Topic #0:
corp mathem dusky casket penang acquit fiery ardo end rak
Topic #1:
sway unty haughty nucle said dessert rift suicid leath phas
Topic #2:
holm said man littl ey fury toujo rosyth brunton paddington
Topic #3:
stylestown parlia lich breez gar bob hostel crop faird conquest
Topic #4:
embed montagu nat din overdid piano turquo pint borough ffolliot
Topic #5:
disgrac obl possess briarbra improv miss sect sutherland mongoos violin
Topic #6:
radi lombard wok precipit snatch hostil swath trait fastidy horr
Topic #7:
outbreak nod rais littl whiplash meagr nicknam meantim insinu xx
Topic #8:
program claim valentin dutch doggy exam paragon serv fresno snap
Topic #9:
klux dutchm apply thankless pithy mem midland alib tapanul penny
Topic #10:
holm said man mr hand room door com know watson
Topic #11:
shun pray big creak subcut dismay abernetty ribston upstair mistress
Topic #12:
amberley wantin train button us ryd holm swain chancellery roadsid
Topic #13:
peel castalot envelop lanky laugh