# Modeling
ML Tasks

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## Input

In [2]:
from sklearn.datasets import load_files

corpus = load_files("../data/")

doc_count = len(corpus.data)
print("Doc count:", doc_count)
assert doc_count is 60, "Wrong number of documents loaded, should be 60 (4 novels + 56 stories)"

Doc count: 60


## Vectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizer import TextWrangler

tfidf_stem = TfidfVectorizer(strip_accents="ascii", tokenizer=TextWrangler(kind="stem"))
tfidf_stem.fit(corpus.data)

[nltk_data] Downloading package punkt to ../nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to ../nltk/...
[nltk_data]   Package wordnet is already up-to-date!


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<tokenizer.TextWrangler object at 0x7f78d2c09048>,
        use_idf=True, vocabulary=None)

## Models

In [4]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

n_topics = 20 #FIXME GridSearch

lda = LatentDirichletAllocation(n_components=n_topics)
lsa = TruncatedSVD(n_components=n_topics)
nmf = NMF(n_components=n_topics)

## Pipeline

In [5]:
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("tfidf", tfidf_stem),
    ("model", lda)
])

## Training

In [6]:
model.fit(corpus.data)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0))])

## Evaluation

In [7]:
print("Log Likelelihood:", model.score(corpus.data))

Log Likelelihood: -146336.68785699786


In [8]:
# Inspect topics
names = tfidf_stem.get_feature_names()

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

print_top_words(lda, names, 10)

Topic #0:
blym haf shelterless malefact divin streamed abstract vic slumb throng
Topic #1:
cruel discern unimport watercours gauz hoolig crav jupit queen ft
Topic #2:
holm said briarbra night com man atom wom scribbled know
Topic #3:
condon cruellest lanky unbrush dainty torto hut hum employ keepsak
Topic #4:
holm said man mr com bust beetl whereabout await shal
Topic #5:
beetl subordin attir electro vic battlefield breech pres lik afflu
Topic #6:
holm said murdoch man switzerland did peer cork hardiest mr
Topic #7:
helpm unimpass meredi transylvan prevy jefferson finest waldba ardu buzz
Topic #8:
nant vish phrases cloudy sigismond intrud naught unquest whe impetu
Topic #9:
mood stal provoc flesh entangl foreign slap eclips eastern interject
Topic #10:
unexplain promiscu hampshir carpet rein glossiest wait metic wallow porty
Topic #11:
holm said man mr com hand room watson know look
Topic #12:
raj remonst cap wholesom saturnin rul vanguard nov exclam fortescu
Topic #13:
crack holm bomb