In [1]:
%matplotlib inline

In [35]:
import pandas
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn
import sklearn.linear_model
from datetime import datetime
from sklearn_pandas import DataFrameMapper
import warnings
import timeit
from collections import defaultdict, OrderedDict
import tabulate
import time
import GPy
from IPython.display import display

timeit.template = """
def inner(_it, _timer{init}):
    {setup}
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""

matplotlib.style.use('ggplot')

RANDOM_SEED = 33

In [88]:
CONTENTS_OPTIONS = ('CONTENTS', 'Contents')
IGNORE_CONTENTS = ('NOTES', ':', 'CONTENTS', 'Contents', 'Contents:')
SKIP_CONTENTS = ('APPENDIX', 'GLOSSARY')
TERMINATION = '*** END OF THIS PROJECT GUTENBERG EBOOK'

def split_book_by_contents(path, contents=None):
    chapters = OrderedDict()
    
    with open(path) as book_file:
        book = book_file.read()
        
        if not contents:
            contents_start = -1
            for c in CONTENTS_OPTIONS:
                idx = book.find(c)
                if -1 != idx:
                    contents_start = idx + len(c)
                    break

            if -1 == contents_start:
                raise ValueError('Failed to find table of contents, aborting...')

            contents_end = book.find('\n' * 4, contents_start)
            contents = book[contents_start : contents_end].split('\n')
            contents = [c.strip() for c in contents]
            contents = [c for c in contents if c and c not in IGNORE_CONTENTS]
            start_index = contents_end
            
        else:
            start_index = book.find(contents[0]) - 1
        
        contents_and_end = contents + [TERMINATION]
        
        for start_title, end_title in zip(contents, contents[1:]):
            if start_title in SKIP_CONTENTS: continue
                
            chapter_start = book.find(start_title, start_index) + len(start_title)
            chapter_end = book.find(end_title, chapter_start)
            chapters[start_title] = book[chapter_start : chapter_end].strip()
            
            start_index = chapter_end - 1
        
        
    return chapters

In [118]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# print_top_words stolen shamelessly from 
# http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
    
def extract_fit_print(book_path, contents=None, n_components=20, min_df=2, split_chapters=None):
    book = split_book_by_contents(book_path, contents)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=min_df, stop_words='english')
    chapters = [book[c] for c in book]
    
    if split_chapters:
        chapter_words = [chapter.split() for chapter in chapters]
        chapter_lengths = [len(chap) for chap in chapter_words]
        chapter_ranges = [[int(length * i) for i in range(split_chapters + 1)] for length in chapter_lengths]
        chapter_splits = [[' '.join(split[start : end]) for start, end in zip(cr, cr[1:])] 
                          for split, cr in zip(chapter_words, chapter_ranges)]
        chapters = [s for split_chapter in chapter_splits for s in split_chapter]
            
    count_vectors = tf_vectorizer.fit_transform(chapters)

    lda = LatentDirichletAllocation(n_components=n_components, max_iter=20)
    lda.fit(count_vectors)
    
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, 10)
    
    return lda

In [108]:
extract_fit_print(r'data/books/meditations.txt', split_chapters=5)

Topic #0: thou things thy unto man nature thee thyself life doth
Topic #1: thou things unto thy man thee doth good thyself nature
Topic #2: thou things unto thee thy man doth nature whatsoever world
Topic #3: thou unto things thee nature natural life doth thy person
Topic #4: thou thy things unto doth whatsoever nature world shall good
Topic #5: thou things man nature thy unto thee good doth thyself
Topic #6: thou man things unto thy doth nature good did thee
Topic #7: thou things unto man thee thy nature doth world thyself
Topic #8: thou things thyself man says art life nature fable let
Topic #9: good winds ingenuous personal cured requisite whereof roman gain seeks
Topic #10: things thou unto thy thee doth man shall thyself whatsoever
Topic #11: thou things unto thee good thy hath man doth constitution
Topic #12: thou things unto thy man thee nature doth good thyself
Topic #13: things thou thy men nature unto man mind did kind
Topic #14: thou things man unto men good nature thy time 

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=20,
             mean_change_tol=0.001, n_components=20, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [109]:
federalist_contents = ['FEDERALIST No. {idx}'.format(idx=i + 1) for i in range(84)]
extract_fit_print(r'data/books/federalist.txt', federalist_contents)

Topic #0: constitution executive federal legislative powers ought members convention national representatives
Topic #1: national constitution union general ought convention members great body court
Topic #2: national union constitution public general authority great duties taxation influence
Topic #3: ought executive great legislative nations national general public common constitution
Topic #4: constitution cases union citizens ought great number federal shall case
Topic #5: exclusion ill magistrate office perpetual entertain men man station predecessor
Topic #6: courts cases constitution court authority law jurisdiction jury united trial
Topic #7: constitution federal authority united cases convention body representatives general public
Topic #8: executive legislative constitution union powers members men great department authority
Topic #9: union national public great general time men constitution nations powers
Topic #10: federal members authority general governments national union

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=20,
             mean_change_tol=0.001, n_components=20, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [110]:
extract_fit_print(r'data/books/poe.txt', min_df=1, split_chapters=5)

Topic #0: poe allan edgar ebook works gutenberg volume edition raven project
Topic #1: 2016 utf david note english www 2008 restrictions added project
Topic #2: language use endnotes title start purloined author encoding 2148 contents
Topic #3: release www added produced letter works whatsoever updated end english
Topic #4: david set 2016 terms edition license use date griswold contents
Topic #5: gutenberg restrictions project 19 release start added title use encoding
Topic #6: contents included set use terms away project title start gutenberg
Topic #7: edgar cost volume whatsoever october away contents language included 2008
Topic #8: 19 included 2008 allan poe set terms language david letter
Topic #9: set widger edition updated start title ebook cost license ii
Topic #10: www endnotes english author volume use poe release widger restrictions
Topic #11: 2148 note away edition release org restrictions ii october edgar
Topic #12: notes letter widger ii whatsoever produced restrictions s

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=20,
             mean_change_tol=0.001, n_components=20, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [119]:
comc_lda = extract_fit_print(r'data/books/count_of_monte_cristo.txt', split_chapters=10, n_components=50)

Topic #0: villefort little grows performed sympathy saint mind swollen getting unknown
Topic #1: fernand said dantès caderousse danglars mercédès edmond know man young
Topic #2: said villefort man dantès mercédès danglars fernand caderousse yes edmond
Topic #3: villefort said dantès king know renée cried marquise saint sire
Topic #4: dantès said villefort know door sire jailer police minister letter
Topic #5: require father invitation said sail méran piastres pharaon land firing
Topic #6: danglars caderousse quite 1st tried dantès did ah water laden
Topic #7: dantès replied door saw said jailer mercédès did gendarme young
Topic #8: said dantès fernand mercédès young danglars man edmond did sire
Topic #9: dantès grasped said villefort mercédès sweat sentiments tremulous advanced confirmed
Topic #10: villefort said know father man police sire dantès heart king
Topic #11: dear sire man said king majesty father villefort duke louis
Topic #12: villefort renée marquise king marquis napoleon 

In [120]:
np.sum(comc_lda.components_, axis=1)

array([    92.22224406,     92.85245253,     93.97565776,     92.52839384,
           92.75169741,     92.23859345,     92.20997308,     92.37006683,
           92.82955542,     92.21145197,     92.55922885,     92.39084862,
          581.66755588,     93.00458192,     92.50545424,     92.19525675,
           93.28570705,     92.13317696,     92.66002947,     93.09578361,
           93.58961007,     94.76718033,     92.12700038,     93.15508932,
           92.42536493,     92.28001317,     92.10429422,     92.20191818,
           92.13184027,     93.05178921,     92.8698482 ,     92.14644615,
           93.29194898,     92.18926615,     92.55154106,     92.11162521,
           92.27269324,     92.1336747 ,     92.36781736,     92.4770595 ,
        33570.34820202,     93.22581873,     92.38976125,   2186.09564104,
           92.13866829,     92.39287529,     92.25193521,     92.51903128,
           92.94076458,     92.39121468])