In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import datetime
import nltk
import time
import logging
import itertools
import gensim

from shared_lib import utils, vocabulary
from shared_lib import ngram_lm
from shared_lib import ngram_utils
from shared_lib import simple_trigram

%matplotlib inline

### Set up Reddit data

In [3]:
# Load every Reddit comment dump into its own DataFrame. `lines=True` makes
# pandas parse each file as one JSON record per line. The generator reads the
# files in the order listed and the unpacking binds each frame to its name.
(
    boston_bomb_df,
    boston_series_df,
    florida_df,
    houston_df,
    miami_df,
    nyc_df,
    puerto_rico_df,
    vegas_df,
) = (
    pd.read_json(comments_path, lines=True)
    for comments_path in (
        'data/reddit/boston_comments_2013.03-2013.05.txt',
        'data/reddit/boston_comments_2013.10-2013.11.txt',
        'data/reddit/florida_comments_2017.06-2017.10.txt',
        'data/reddit/houston_comments_2017.06-2017.10.txt',
        'data/reddit/miami_comments_2017.06-2017.10.txt',
        'data/reddit/nyc_comments_2012.08-2012.12.txt',
        'data/reddit/puerto_rico_comments_2017.06-2017.10.txt',
        'data/reddit/vegas_comments_2017.06-2017.10.txt',
    )
)

In [4]:
def add_local_time(comments_df, timezone):
    """Add a timezone-aware `created_at_local` column to `comments_df`, in place.

    The `created_utc` column is read as seconds since the epoch, marked as UTC,
    and converted to `timezone` (a tz database name such as 'US/Eastern').
    """
    utc_times = pd.to_datetime(comments_df['created_utc'], unit='s').dt.tz_localize('UTC')
    comments_df['created_at_local'] = utc_times.dt.tz_convert(timezone)


# setup local times: each data set is paired with the timezone its comments
# should be displayed in
for _comments_df, _timezone in (
    (boston_bomb_df, 'US/Eastern'),
    (boston_series_df, 'US/Eastern'),
    (florida_df, 'US/Eastern'),
    (houston_df, 'US/Central'),
    (miami_df, 'US/Eastern'),
    (nyc_df, 'US/Eastern'),
    (puerto_rico_df, 'America/Puerto_Rico'),
    (vegas_df, 'US/Pacific'),
):
    add_local_time(_comments_df, _timezone)

### Wiki helper functions

In [2]:
# Send log records to the root logger at INFO level, formatted as
# "<LEVEL> : <message>".
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# ipython sometimes messes up the logging setup; restore. basicConfig() does
# nothing when the root logger already has handlers, so the level is set
# explicitly. Use setLevel() rather than assigning `.level` directly so the
# level is validated and logging's internal bookkeeping stays consistent.
logging.root.setLevel(logging.INFO)

def head(stream, n=10):
    """Peek helper: collect at most the first `n` items of `stream` into a plain list."""
    leading_items = itertools.islice(stream, n)
    return [item for item in leading_items]

In [5]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    """Tokenize `text` with gensim's `simple_preprocess`, dropping tokens found in `STOPWORDS`."""
    kept_tokens = []
    for candidate in simple_preprocess(text):
        if candidate in STOPWORDS:
            continue
        kept_tokens.append(candidate)
    return kept_tokens

def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple.

    Pages whose title starts with one of the ignored namespace prefixes
    (e.g. ``Category:``) are skipped, as are articles with fewer than 50
    tokens after `filter_wiki` + `tokenize`.

    The dump is opened with `smart_open` and closed again when the generator
    is exhausted, closed, or garbage-collected.
    """
    ignored_prefixes = tuple(
        namespace + ':'
        for namespace in
        'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    )
    with smart_open(dump_file) as dump:
        for title, text, pageid in _extract_pages(dump):
            # Cheap title test first, so meta pages are never filtered/tokenized.
            if title.startswith(ignored_prefixes):
                continue  # ignore various meta-articles
            tokens = tokenize(filter_wiki(text))
            if len(tokens) < 50:
                continue  # ignore short articles
            yield title, tokens

In [6]:
# NOTE(review): machine-specific absolute path; point this at your own copy of
# the Wikipedia dump before running.
wiki_archive = '/Users/chrisfleisch/Downloads/enwiki-latest-pages-articles.xml.bz2'
stream = iter_wiki(wiki_archive)  # lazy generator; not consumed by the loop below
for title, tokens in itertools.islice(iter_wiki(wiki_archive), 8):
    # print the article title and its first ten tokens. A single pre-formatted
    # string prints the same text under the Python 2 print statement and the
    # Python 3 print function.
    print('%s %s' % (title, tokens[:10]))

Anarchism [u'anarchism', u'political', u'philosophy', u'advocates', u'self', u'governed', u'societies', u'based', u'voluntary', u'institutions']
Autism [u'autism', u'disorder', u'characterized', u'impaired', u'social', u'interaction', u'impaired', u'verbal', u'non', u'verbal']
Albedo [u'percentage', u'diffusely', u'reflected', u'sunlight', u'relation', u'surface', u'conditions', u'albedo', u'measure', u'reflectance']
A [u'writing', u'cursive', u'forms', u'named', u'plural', u'aes', u'letter', u'vowel', u'iso', u'basic']
Alabama [u'alabama', u'state', u'southeastern', u'region', u'united', u'states', u'bordered', u'tennessee', u'north', u'georgia']
Achilles [u'achilles', u'nereid', u'cymothoe', u'attic', u'red', u'figure', u'kantharos', u'volci', u'cabinet', u'des']
Abraham Lincoln [u'abraham', u'lincoln', u'february', u'april', u'american', u'statesman', u'lawyer', u'served', u'th', u'president']
Aristotle [u'aristotle', u'aristot\xe9l\u0113s', u'bc', u'ancient', u'greek', u'philosophe

### Dictionary of Words and Counts

In [7]:
# Lazily stream just the token list (second element) of every (title, tokens)
# pair produced by iter_wiki; this is a one-shot generator.
doc_stream = (article[1] for article in iter_wiki(wiki_archive))

In [8]:
# Build a gensim Dictionary from the streamed token lists, timing the statement.
# `doc_stream` is a one-shot generator, so it is exhausted once this line has run.
# The recorded run below took roughly 8 hours of wall time and the log shows the
# vocabulary being pruned back to 2,000,000 tokens along the way.
%time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
print(id2word_wiki)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(444723 unique tokens: [u'biennials', u'tripolitan', u'oblocutor', u'woode', u'maderista']...)
INFO : adding document #20000 to Dictionary(636710 unique tokens: [u'biennials', u'tripolitan', u'oblocutor', u'shatzky', u'woode']...)
INFO : adding document #30000 to Dictionary(778555 unique tokens: [u'tripolitan', u'oblocutor', u'shatzky', u'dulcitone', u'olivierre']...)
INFO : adding document #40000 to Dictionary(904977 unique tokens: [u'tripolitan', u'oblocutor', u'shatzky', u'dulcitone', u'olivierre']...)
INFO : adding document #50000 to Dictionary(975827 unique tokens: [u'tripolitan', u'dr\xfcckt', u'oblocutor', u'shatzky', u'dulcitone']...)
INFO : adding document #60000 to Dictionary(995734 unique tokens: [u'tripolitan', u'dr\xfcckt', u'oblocutor', u'shatzky', u'dulcitone']...)
INFO : adding document #70000 to Dictionary(1011647 unique tokens: [u'tripolitan', u'dr\xfcckt', u'oblocu

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 300000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : adding document #300000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : discarding 50925 tokens: [(u'slaughterball', 1), (u'ciwanan', 1), (u'zheskasgan', 1), (u'comitatives', 1), (u'nociperception', 1), (u'melanodytes', 1), (u'sublineages', 1), (u'caneproduction', 1), (u'zzgt', 1), (u'protocommunist', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 310000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : adding document #310000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsvil

INFO : adding document #430000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : discarding 40535 tokens: [(u'papahan', 1), (u't\u012bv', 1), (u'tutores', 1), (u'\u043a\u0438\u0437\u043b\u044f\u0440\u0441\u043a\u0438\u0439', 1), (u'uscwf', 1), (u'ukrainified', 1), (u'zandsq', 1), (u'nayanjot', 1), (u'cityneon', 1), (u'\u0627\u0644\u0639\u0628\u0627\u0633\u064a\u0629', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 440000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : adding document #440000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : discarding 43021 tokens: [(u'khuongpui', 1), (u'keku\u02bbi\u02bbapowa', 1), (u'eventbit', 1), (u'\uc190\uc218', 1), (u'\u590f\u840d\u7063\u6cf3\u7058', 1), (u'sixsa

INFO : discarding 35630 tokens: [(u'nisnose', 1), (u'maledicti', 1), (u'matalvi', 1), (u'naqa\u0294i', 1), (u'kuetu', 1), (u'sadoveanus', 1), (u'anteriolateral', 1), (u'naticousti', 1), (u'foin\xedki', 1), (u'klieste', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 570000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : adding document #570000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : discarding 40816 tokens: [(u'pressuretrol', 1), (u'lamberger', 1), (u'sanctipaulensis', 1), (u'newmemorabilia', 1), (u'sachoff', 1), (u'ericarum', 1), (u'arcotense', 1), (u'demitorivich', 1), (u'\u660e\u5983', 1), (u'ma\u010dica', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 580000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000

INFO : discarding 38495 tokens: [(u'khaili', 1), (u'hejredal', 1), (u'spongellidae', 1), (u'benyt', 1), (u'stramare', 1), (u'solovyi', 1), (u'b\u0259y\u1eadl\xeeq', 1), (u'ochansk', 1), (u'zarchman', 1), (u'rudagaugh', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 700000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : adding document #700000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'soestdijk', u'billycorgan', u'olmsville']...)
INFO : discarding 35710 tokens: [(u'winslsow', 1), (u'yaeyamensis', 1), (u'ninjaturtles', 1), (u'mylpg', 1), (u'mazonka', 1), (u'\u9752\u5c71\u56fd\u969b\u653f\u7d4c\u8ad6\u96c6', 1), (u'linnekogel', 1), (u'artfoto', 1), (u'messaite', 1), (u'hallia', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 710000 (=100.0%) documents
INFO : resulting dictionary: Di

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 830000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #830000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 35616 tokens: [(u'brzegseven', 1), (u'd\xfagh\xe1b\xe1d', 1), (u'inghang', 1), (u'vazhutacaud', 1), (u'\u90ef\u57ce\u53bf', 1), (u'dejardinii', 1), (u'kitsunai', 1), (u'b\xe9kan', 1), (u'invers\xe3o', 1), (u'vellozo', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 840000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #840000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO 

INFO : adding document #960000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 38562 tokens: [(u'audioasics', 1), (u'faeroensis', 1), (u'c\xf3nte', 1), (u'\u6d77\u971e', 1), (u'ivanpahs', 1), (u'ventralization', 1), (u'storyjames', 1), (u'gartzelako', 1), (u'dansparty', 1), (u'yaj\xf1avalka', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 970000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #970000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 39031 tokens: [(u'krostitz', 1), (u'ponlanat', 1), (u'paracalliope', 1), (u'urbancode', 1), (u'bazaria', 1), (u'zongbo', 1), (u'shlumper', 1), (u'nephat', 1), (u'\u02c8m\u02e0ak\u02c8\u025f\u026al', 1), (u'cyfaddawd', 1)]...
IN

INFO : discarding 33508 tokens: [(u'mcgaul', 1), (u'surster', 1), (u'\u0645\u0627\u062a\u064asungai', 1), (u'\xe9tran', 1), (u'varadhanunni', 1), (u'mckennaflutes', 1), (u'ashtavadana', 1), (u'asmani', 1), (u'makingup', 1), (u'dor\u03c0', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1100000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1100000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 32190 tokens: [(u'historiadeboca', 1), (u'fivaller', 1), (u'junient', 1), (u'parmatown', 1), (u'remisinscences', 1), (u'wikiairports', 1), (u'gateguard', 1), (u'windkracht', 1), (u'gottenblatt', 1), (u'llandderfel', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1110000 (=100.0%) documents
INFO : resulting dictionary: Di

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1230000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1230000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 32160 tokens: [(u'\u30d7\u30ec\u30df\u30a2\u30e0', 1), (u'\ubcc4\ub0b4\uc2e0\ub3c4\uc2dc', 1), (u'unimogclub', 1), (u'\u0434\u0440\u0430\u0436\u0435\u0442\u0438\u0447', 1), (u'vlilelobisasa', 1), (u'sopv', 1), (u'salpeten', 1), (u'levart', 1), (u'escarlan', 1), (u'ros\xedo', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1240000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1240000 to Dictionary(2000000 unique tokens: [u'tripolitan'

INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1360000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 32871 tokens: [(u'chencunia', 1), (u'portpool', 1), (u'emanoel', 1), (u'engywuck', 1), (u'\u674e\u767d\u8a69\u65b0\u8b6f', 1), (u'viteje\u0219te', 1), (u'balflugg', 1), (u'licto', 1), (u'perpetual_inv', 1), (u'techarizonast', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1370000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1370000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 31748 tokens: [(u'mugaritzak', 1), (u'topalovic', 1), (u'camelopardalids', 1), 

INFO : adding document #1490000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 26378 tokens: [(u'fullyhyd', 1), (u'd\xe2rn\u0103ul', 1), (u'morkhoon', 1), (u'pvtg', 1), (u'\u5317\u90e1', 1), (u'whooly', 1), (u'dunskins', 1), (u'jangu', 1), (u'veilhen', 1), (u'voegels', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1500000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1500000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 29354 tokens: [(u'xochiti', 1), (u'rudolgo', 1), (u'ogunlana', 1), (u'luckeclipsemoon', 1), (u'muckendorff', 1), (u'vervolgt', 1), (u'bogdocosa', 1), (u'l\xf6vsk\xe4r', 1), (u'ch\xe2tillin', 1), (u'cutucu', 1)]...
INFO : keeping 2000000 tokens whic

INFO : discarding 32522 tokens: [(u'bernaldus', 1), (u'gadsdon', 1), (u'alt\u0103dat\u0103', 1), (u'sinnabri', 1), (u'jhamshikhel', 1), (u'\u5927\u585a\u82b3\u5fe0', 1), (u'v\xe4nernbanan', 1), (u'hexaaquaions', 1), (u'sachche', 1), (u'ardraos', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1630000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1630000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 33247 tokens: [(u'ochthephilinae', 1), (u'trnaser', 1), (u'ma\u013eovan\xe9', 1), (u'wsly', 1), (u'lysd', 1), (u'ischtiraki', 1), (u'xhine', 1), (u'seitakorva', 1), (u'pereirinha', 1), (u'cursomers', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1640000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1760000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1760000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 31785 tokens: [(u'keater', 1), (u'pocalyptic', 1), (u'asyenda', 1), (u'hongokuch\u014d', 1), (u'photom\xf6we', 1), (u'jorasankoe', 1), (u'superoptimistic', 1), (u'\u0561\u056c\u0561\u0566\u0561\u0576', 1), (u'cemon', 1), (u'bitistatin', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1770000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1770000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk',

INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1890000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 34272 tokens: [(u'biarrots', 1), (u'umgangsdichtung', 1), (u'frohlichstein', 1), (u'tarruntenus', 1), (u'cppmii', 1), (u'hopleys', 1), (u'berteline', 1), (u'fgajdos', 1), (u'katsimis', 1), (u'treulio', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 1900000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #1900000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 30693 tokens: [(u'gravicembani', 1), (u'ramionami', 1), (u'\u0438\u0441\u043f\u043e\u043b\u044c\u0437\u

INFO : discarding 29662 tokens: [(u'b\u1ee9c', 1), (u'labois', 1), (u'psilophrys', 1), (u'\u0641\u0631\u0646\u0643\u062c\u064a\u0646', 1), (u'armre', 1), (u'd\u017eonjam', 1), (u'riefkohlcarmen', 1), (u'pymangary', 1), (u'\u0642\u0627\u0646\u0627\u062a', 1), (u'matagigantes', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2030000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2030000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 34726 tokens: [(u'sharonullman', 1), (u'toprally', 1), (u'lokaniti', 1), (u'\xe9difiants', 1), (u'aerobitch', 1), (u'\u8857\u982d\u719f\u98df', 1), (u'mlpg', 1), (u'haydnlaan', 1), (u'astakhovalidiya', 1), (u'buildman', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2040000 (=100.0%)

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2160000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2160000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 37334 tokens: [(u'chefredaktion', 1), (u'marutiray', 1), (u'resendea', 1), (u'accroupir', 1), (u'or\xe1te', 1), (u'\u03c0\u03b1\u03bd\u03b1\u03c4\u03c3\u03b9\u03ac\u03c2', 1), (u'macigolo', 1), (u'\u897f\u5b89\u4e8b\u53d8', 1), (u'thammaseuksa', 1), (u'h\xfdb\u011b', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2170000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2170000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdn

INFO : adding document #2290000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 29291 tokens: [(u'hamshawviales', 1), (u'seruwila', 1), (u'parimia', 1), (u'elektrobil', 1), (u'bossaso', 1), (u'pyandino', 1), (u'micrometry', 1), (u'takahashiakito', 1), (u'shardhananda', 1), (u'laevicephalus', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2300000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2300000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 30730 tokens: [(u'metalloamides', 1), (u'\u014b\xedi', 1), (u'kovasky', 1), (u'meausere', 1), (u'tha\xedssa', 1), (u'tallul', 1), (u'\u062a\u0627\u0628\u06cc\u062f\u0646', 1), (u'varo\u0161luk', 1), (u'onthehunt', 1), (u'human

INFO : adding document #2420000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 36413 tokens: [(u'libredigital', 1), (u'interpostal', 1), (u'etanna', 1), (u'g\xf8ransson', 1), (u'lambdaprolog', 1), (u'anthocoma', 1), (u'varvarivka', 1), (u'ndiri', 1), (u'rakuyama', 1), (u'trawally', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2430000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2430000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 31044 tokens: [(u'pakery', 1), (u'trotsenburg', 1), (u'rapahel', 1), (u'ffkama', 1), (u'setrouck', 1), (u'petitdemange', 1), (u'lavorato', 1), (u'engelrada', 1), (u'beliarde', 1), (u'presott', 1)]...
INFO : keeping 2000000 tokens which

INFO : discarding 35910 tokens: [(u'coclea', 1), (u'subwork', 1), (u'afk\xe2r', 1), (u'aut\xf3sport', 1), (u'kami\u0109a', 1), (u'minoiu', 1), (u'garagesalestl', 1), (u'ziggus', 1), (u'difenpiramide', 1), (u'dunbyrne', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2560000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2560000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 34216 tokens: [(u'menicks', 1), (u'buxanae', 1), (u'sampoothamee', 1), (u'saykham', 1), (u'hrak', 1), (u'stadensis', 1), (u'nagyalfold', 1), (u'choward', 1), (u'onhso', 1), (u'photosensible', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2570000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolita

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2690000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2690000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 35473 tokens: [(u'rafatullah', 1), (u'udumulpet', 1), (u'bunjoy', 1), (u'jidesheng', 1), (u'ruwaily', 1), (u'suvarrasadfso', 1), (u'primalis', 1), (u'panachanpur', 1), (u'\u4f55\u51f1\u502b', 1), (u'gwarbq', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2700000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2700000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : disc

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2820000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2820000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 34027 tokens: [(u'abulhail', 1), (u't\u01dfnda', 1), (u'k\xf4rnophuli', 1), (u'hettiwatte', 1), (u'\u7121\u7455\u6d77\u7389', 1), (u'sigyel', 1), (u'pasibula', 1), (u'craticulella', 1), (u'\u8499\u516c\u4e61', 1), (u'magatheer', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2830000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2830000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmo

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2950000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2950000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 32926 tokens: [(u'darepro', 1), (u'teasurer', 1), (u'millario', 1), (u'\u0431\u0435\u0440\u0435\u0433\u0430\u043c', 1), (u'pongezi', 1), (u'ruyifang', 1), (u'athhenian', 1), (u'darwin_en', 1), (u'\xfcnl\xfcp\u0131nar', 1), (u'ye\u015filb\xfck', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 2960000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #2960000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soe

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3080000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3080000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 31364 tokens: [(u'hkre', 1), (u'kellyfamous', 1), (u'llechylched', 1), (u'yuvarasmi', 1), (u'skurdalssj\xf8en', 1), (u'sidee', 1), (u'crof', 1), (u'seilnacht', 1), (u'perhiasan', 1), (u'\u516d\u5175\u885e', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3090000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3090000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : disca

INFO : adding document #3210000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 36063 tokens: [(u'crateau', 1), (u'znaidi', 1), (u'thi\xe9baudde', 1), (u'mejane', 1), (u'kanejir\u014d', 1), (u'drumnaconagher', 1), (u'skaalum', 1), (u'rouletta', 1), (u'iiantsiarekin', 1), (u'abyssomelania', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3220000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3220000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 29916 tokens: [(u'mondje', 1), (u'\u0430\u0443\u0434\u0436\u044d', 1), (u'boismier', 1), (u'vincentre', 1), (u'bogardic', 1), (u'cyatheae', 1), (u'thevathaye', 1), (u'mariamelanie', 1), (u'caledfryn', 1), (u'\u6a13\u5357\u5149',

INFO : discarding 39787 tokens: [(u'mckansy', 1), (u'broadall', 1), (u'l\xf5ug', 1), (u'b\xe4rchen', 1), (u'lousidean', 1), (u'tvid', 1), (u'necrohigh', 1), (u'hokeh', 1), (u'benimadhab', 1), (u'aidds', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3350000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3350000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 45299 tokens: [(u'hennericourt', 1), (u'workatplay', 1), (u'isfendiar', 1), (u'chhtrabas', 1), (u'oyyari', 1), (u'corrale\xf1o', 1), (u'\u03c0\u03b7\u03bb\u03b5\u0390\u03b4\u03b7\u03bf', 1), (u'philologika', 1), (u'pylimo', 1), (u'ashynden', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3360000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2

INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3480000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3480000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 29076 tokens: [(u'balnavine', 1), (u'geyav', 1), (u'espd', 1), (u'dngl', 1), (u'jahanbanian', 1), (u'purkar', 1), (u'm\xe5nresen\xe4rer', 1), (u'\u0430\u0449\u0435\u043d\u043a\u043e', 1), (u'\u98a8\u5439\u3051\u3070', 1), (u'\u0432\u0435\u043e', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3490000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3490000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'so

INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3610000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 29575 tokens: [(u'newbyeric', 1), (u'tveje', 1), (u'youluvana', 1), (u'\u653e\u5b66\u4ee5\u540e', 1), (u'cornuingmar', 1), (u'trezzinis', 1), (u'horojevi\u0107i', 1), (u'bairvan', 1), (u'laoula', 1), (u'shankarapandi', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3620000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3620000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 34000 tokens: [(u'jessiek', 1), (u'sluttery', 1), (u'preparano', 1), (u'urakonthouba',

INFO : adding document #3740000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 34246 tokens: [(u'florinel', 1), (u'tru\u0219c\u0103', 1), (u'andaluzia', 1), (u'stamatian', 1), (u'opri\u0219can', 1), (u'soli\u0219ti', 1), (u'topot', 1), (u'arwee', 1), (u'whitsettrecord', 1), (u'accenseret', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3750000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3750000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : discarding 32489 tokens: [(u'stalnov', 1), (u'sreesha', 1), (u'buelah', 1), (u'\u8fce\u4ed9', 1), (u'awardnyc', 1), (u'voenny', 1), (u'odvarttayah', 1), (u'muncif', 1), (u'qanasilis', 1), (u'theobromins', 1)]...
INFO : keeping 2000000 tok

INFO : discarding 28135 tokens: [(u'referrring', 1), (u'televsied', 1), (u'\u4e01\u514b\u4e0e\u5b9d\u8d1d', 1), (u'bioinfra', 1), (u'\ub2e8\uc5b4\ub85c', 1), (u'montisbossi', 1), (u'ukuk', 1), (u'am\xeclcar', 1), (u'\u0e0a\u0e28', 1), (u'yongpyo', 1)]...
INFO : keeping 2000000 tokens which were in no less than 0 and no more than 3880000 (=100.0%) documents
INFO : resulting dictionary: Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : adding document #3880000 to Dictionary(2000000 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)
INFO : built Dictionary(2001078 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...) from 3880459 documents (total 1453202979 corpus positions)


CPU times: user 8h 35s, sys: 8min 15s, total: 8h 8min 51s
Wall time: 8h 10min 22s
Dictionary(2001078 unique tokens: [u'tripolitan', u'ftdna', u'padanagan', u'soestdijk', u'farmobil']...)


In [9]:
%time
# ignore words that appear in less than 20 documents or more than 10% documents
id2word_wiki.filter_extremes(no_below=20, no_above=0.1)
print(id2word_wiki)

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 5.01 µs


INFO : discarding 1901078 tokens: [(u'tripolitan', 94), (u'ftdna', 36), (u'padanagan', 1), (u'soestdijk', 72), (u'farmobil', 2), (u'billycorgan', 2), (u'olmsville', 2), (u'schelberger', 3), (u'nualart', 8), (u'lebensballade', 1)]...
INFO : keeping 100000 tokens which were in no less than 20 and no more than 388045 (=10.0%) documents
INFO : resulting dictionary: Dictionary(100000 unique tokens: [u'biennials', u'fawn', u'gai', u'constan\u021ba', u'nunnery']...)


Dictionary(100000 unique tokens: [u'biennials', u'fawn', u'gai', u'constan\u021ba', u'nunnery']...)


In [11]:
class WikiCorpus(object):
    """Streamed bag-of-words corpus over a Wikipedia dump.

    Every iteration makes a fresh pass over `iter_wiki(dump_file)` and converts
    each document with `dictionary.doc2bow`.

    Attributes:
        dump_file: value handed to `iter_wiki` on every pass.
        dictionary: object whose `doc2bow(tokens)` builds the yielded vectors.
        clip_docs: maximum number of documents per pass, or None for no limit.
        titles: titles of the documents yielded so far by the most recently
            started pass (an empty list before the first pass).
    """

    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Stream the first `clip_docs` Wikipedia documents from file `dump_file`
        (every document when `clip_docs` is None).
        Iterating yields each document in turn, as the bag-of-words vector
        returned by `dictionary.doc2bow(tokens)`.
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
        # Defined up front so `titles` can be read before the first iteration.
        self.titles = []
        # Exact document count; only known once a pass has run to completion.
        self._num_docs = None

    def __iter__(self):
        """Yield one bag-of-words vector per document, recording titles as it goes."""
        self.titles = []
        num_seen = 0
        for title, tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            self.titles.append(title)
            num_seen += 1
            yield self.dictionary.doc2bow(tokens)
        # Reached only when the stream is exhausted; a pass abandoned early
        # (e.g. `next(iter(corpus))`) never gets here and leaves the count unset.
        self._num_docs = num_seen

    def __len__(self):
        """Return the number of documents in the corpus.

        Returns the exact count once a full pass has completed; before that,
        falls back to `clip_docs`.

        Raises:
            TypeError: if no full pass has completed and `clip_docs` is None,
                because the size of the stream is not known yet.
        """
        if self._num_docs is not None:
            return self._num_docs
        if self.clip_docs is not None:
            return self.clip_docs
        raise TypeError(
            "length of WikiCorpus is unknown: pass clip_docs or iterate over the whole corpus once"
        )

# create a stream of bag-of-words vectors
# (clip_docs is left at its default of None, so the stream is not cut off after a fixed number of documents)
wiki_corpus = WikiCorpus(wiki_archive, id2word_wiki)
# take only the first vector from a fresh pass over the stream; the rest of that pass is never consumed
vector = next(iter(wiki_corpus))
print(vector)  # print the first vector in the stream

[(12, 1), (54, 1), (219, 1), (311, 1), (336, 1), (351, 2), (417, 1), (456, 2), (501, 1), (509, 1), (650, 1), (692, 2), (740, 2), (861, 1), (897, 1), (942, 1), (943, 7), (957, 2), (1059, 1), (1119, 1), (1156, 1), (1223, 1), (1265, 1), (1285, 1), (1342, 2), (1478, 1), (1554, 1), (1589, 1), (1592, 1), (1668, 1), (1686, 1), (1708, 1), (1726, 1), (1764, 1), (1765, 1), (1770, 1), (1791, 1), (1915, 7), (2001, 1), (2012, 5), (2079, 1), (2118, 1), (2146, 1), (2168, 1), (2210, 1), (2282, 2), (2296, 10), (2325, 2), (2394, 1), (2561, 5), (2620, 1), (2646, 1), (3177, 1), (3182, 1), (3283, 1), (3318, 5), (3364, 1), (3384, 1), (3436, 2), (3463, 1), (3490, 4), (3566, 2), (3582, 1), (3745, 1), (3789, 2), (3810, 1), (3891, 2), (3963, 1), (3976, 1), (4012, 1), (4072, 1), (4081, 1), (4134, 1), (4197, 1), (4234, 1), (4240, 2), (4306, 2), (4438, 1), (4451, 1), (4470, 1), (4532, 1), (4549, 1), (4567, 1), (4584, 1), (4623, 1), (4651, 3), (4665, 2), (4689, 1), (4751, 5), (4825, 2), (4894, 1), (4901, 14), (4986

In [12]:
# what is the most common word in that first article?
# Each entry of `vector` is a (word_index, count) pair, so rank the pairs by item[1].
# Indexing is used instead of a tuple-unpacking lambda parameter, which is a
# SyntaxError on Python 3 (PEP 3113); this form runs on both Python 2 and 3.
most_index, most_count = max(vector, key=lambda item: item[1])
print(id2word_wiki[most_index], most_count)

(u'anarchist', 155)


In [13]:
# Write the streamed bag-of-words corpus to ./data/wiki_bow.mm (Matrix Market format, per the log below).
# This is slow: the recorded run took about 7.5 hours of wall time.
%time gensim.corpora.MmCorpus.serialize('./data/wiki_bow.mm', wiki_corpus)

INFO : storing corpus in Matrix Market format to ./data/wiki_bow.mm
INFO : saving sparse matrix to ./data/wiki_bow.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : PROGRESS: saving document #4000
INFO : PROGRESS: saving document #5000
INFO : PROGRESS: saving document #6000
INFO : PROGRESS: saving document #7000
INFO : PROGRESS: saving document #8000
INFO : PROGRESS: saving document #9000
INFO : PROGRESS: saving document #10000
INFO : PROGRESS: saving document #11000
INFO : PROGRESS: saving document #12000
INFO : PROGRESS: saving document #13000
INFO : PROGRESS: saving document #14000
INFO : PROGRESS: saving document #15000
INFO : PROGRESS: saving document #16000
INFO : PROGRESS: saving document #17000
INFO : PROGRESS: saving document #18000
INFO : PROGRESS: saving document #19000
INFO : PROGRESS: saving document #20000
INFO : PROGRESS: saving document #21000
INFO : PROGRESS

INFO : PROGRESS: saving document #200000
INFO : PROGRESS: saving document #201000
INFO : PROGRESS: saving document #202000
INFO : PROGRESS: saving document #203000
INFO : PROGRESS: saving document #204000
INFO : PROGRESS: saving document #205000
INFO : PROGRESS: saving document #206000
INFO : PROGRESS: saving document #207000
INFO : PROGRESS: saving document #208000
INFO : PROGRESS: saving document #209000
INFO : PROGRESS: saving document #210000
INFO : PROGRESS: saving document #211000
INFO : PROGRESS: saving document #212000
INFO : PROGRESS: saving document #213000
INFO : PROGRESS: saving document #214000
INFO : PROGRESS: saving document #215000
INFO : PROGRESS: saving document #216000
INFO : PROGRESS: saving document #217000
INFO : PROGRESS: saving document #218000
INFO : PROGRESS: saving document #219000
INFO : PROGRESS: saving document #220000
INFO : PROGRESS: saving document #221000
INFO : PROGRESS: saving document #222000
INFO : PROGRESS: saving document #223000
INFO : PROGRESS:

INFO : PROGRESS: saving document #400000
INFO : PROGRESS: saving document #401000
INFO : PROGRESS: saving document #402000
INFO : PROGRESS: saving document #403000
INFO : PROGRESS: saving document #404000
INFO : PROGRESS: saving document #405000
INFO : PROGRESS: saving document #406000
INFO : PROGRESS: saving document #407000
INFO : PROGRESS: saving document #408000
INFO : PROGRESS: saving document #409000
INFO : PROGRESS: saving document #410000
INFO : PROGRESS: saving document #411000
INFO : PROGRESS: saving document #412000
INFO : PROGRESS: saving document #413000
INFO : PROGRESS: saving document #414000
INFO : PROGRESS: saving document #415000
INFO : PROGRESS: saving document #416000
INFO : PROGRESS: saving document #417000
INFO : PROGRESS: saving document #418000
INFO : PROGRESS: saving document #419000
INFO : PROGRESS: saving document #420000
INFO : PROGRESS: saving document #421000
INFO : PROGRESS: saving document #422000
INFO : PROGRESS: saving document #423000
INFO : PROGRESS:

INFO : PROGRESS: saving document #600000
INFO : PROGRESS: saving document #601000
INFO : PROGRESS: saving document #602000
INFO : PROGRESS: saving document #603000
INFO : PROGRESS: saving document #604000
INFO : PROGRESS: saving document #605000
INFO : PROGRESS: saving document #606000
INFO : PROGRESS: saving document #607000
INFO : PROGRESS: saving document #608000
INFO : PROGRESS: saving document #609000
INFO : PROGRESS: saving document #610000
INFO : PROGRESS: saving document #611000
INFO : PROGRESS: saving document #612000
INFO : PROGRESS: saving document #613000
INFO : PROGRESS: saving document #614000
INFO : PROGRESS: saving document #615000
INFO : PROGRESS: saving document #616000
INFO : PROGRESS: saving document #617000
INFO : PROGRESS: saving document #618000
INFO : PROGRESS: saving document #619000
INFO : PROGRESS: saving document #620000
INFO : PROGRESS: saving document #621000
INFO : PROGRESS: saving document #622000
INFO : PROGRESS: saving document #623000
INFO : PROGRESS:

INFO : PROGRESS: saving document #800000
INFO : PROGRESS: saving document #801000
INFO : PROGRESS: saving document #802000
INFO : PROGRESS: saving document #803000
INFO : PROGRESS: saving document #804000
INFO : PROGRESS: saving document #805000
INFO : PROGRESS: saving document #806000
INFO : PROGRESS: saving document #807000
INFO : PROGRESS: saving document #808000
INFO : PROGRESS: saving document #809000
INFO : PROGRESS: saving document #810000
INFO : PROGRESS: saving document #811000
INFO : PROGRESS: saving document #812000
INFO : PROGRESS: saving document #813000
INFO : PROGRESS: saving document #814000
INFO : PROGRESS: saving document #815000
INFO : PROGRESS: saving document #816000
INFO : PROGRESS: saving document #817000
INFO : PROGRESS: saving document #818000
INFO : PROGRESS: saving document #819000
INFO : PROGRESS: saving document #820000
INFO : PROGRESS: saving document #821000
INFO : PROGRESS: saving document #822000
INFO : PROGRESS: saving document #823000
INFO : PROGRESS:

INFO : PROGRESS: saving document #1000000
INFO : PROGRESS: saving document #1001000
INFO : PROGRESS: saving document #1002000
INFO : PROGRESS: saving document #1003000
INFO : PROGRESS: saving document #1004000
INFO : PROGRESS: saving document #1005000
INFO : PROGRESS: saving document #1006000
INFO : PROGRESS: saving document #1007000
INFO : PROGRESS: saving document #1008000
INFO : PROGRESS: saving document #1009000
INFO : PROGRESS: saving document #1010000
INFO : PROGRESS: saving document #1011000
INFO : PROGRESS: saving document #1012000
INFO : PROGRESS: saving document #1013000
INFO : PROGRESS: saving document #1014000
INFO : PROGRESS: saving document #1015000
INFO : PROGRESS: saving document #1016000
INFO : PROGRESS: saving document #1017000
INFO : PROGRESS: saving document #1018000
INFO : PROGRESS: saving document #1019000
INFO : PROGRESS: saving document #1020000
INFO : PROGRESS: saving document #1021000
INFO : PROGRESS: saving document #1022000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #1196000
INFO : PROGRESS: saving document #1197000
INFO : PROGRESS: saving document #1198000
INFO : PROGRESS: saving document #1199000
INFO : PROGRESS: saving document #1200000
INFO : PROGRESS: saving document #1201000
INFO : PROGRESS: saving document #1202000
INFO : PROGRESS: saving document #1203000
INFO : PROGRESS: saving document #1204000
INFO : PROGRESS: saving document #1205000
INFO : PROGRESS: saving document #1206000
INFO : PROGRESS: saving document #1207000
INFO : PROGRESS: saving document #1208000
INFO : PROGRESS: saving document #1209000
INFO : PROGRESS: saving document #1210000
INFO : PROGRESS: saving document #1211000
INFO : PROGRESS: saving document #1212000
INFO : PROGRESS: saving document #1213000
INFO : PROGRESS: saving document #1214000
INFO : PROGRESS: saving document #1215000
INFO : PROGRESS: saving document #1216000
INFO : PROGRESS: saving document #1217000
INFO : PROGRESS: saving document #1218000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #1392000
INFO : PROGRESS: saving document #1393000
INFO : PROGRESS: saving document #1394000
INFO : PROGRESS: saving document #1395000
INFO : PROGRESS: saving document #1396000
INFO : PROGRESS: saving document #1397000
INFO : PROGRESS: saving document #1398000
INFO : PROGRESS: saving document #1399000
INFO : PROGRESS: saving document #1400000
INFO : PROGRESS: saving document #1401000
INFO : PROGRESS: saving document #1402000
INFO : PROGRESS: saving document #1403000
INFO : PROGRESS: saving document #1404000
INFO : PROGRESS: saving document #1405000
INFO : PROGRESS: saving document #1406000
INFO : PROGRESS: saving document #1407000
INFO : PROGRESS: saving document #1408000
INFO : PROGRESS: saving document #1409000
INFO : PROGRESS: saving document #1410000
INFO : PROGRESS: saving document #1411000
INFO : PROGRESS: saving document #1412000
INFO : PROGRESS: saving document #1413000
INFO : PROGRESS: saving document #1414000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #1588000
INFO : PROGRESS: saving document #1589000
INFO : PROGRESS: saving document #1590000
INFO : PROGRESS: saving document #1591000
INFO : PROGRESS: saving document #1592000
INFO : PROGRESS: saving document #1593000
INFO : PROGRESS: saving document #1594000
INFO : PROGRESS: saving document #1595000
INFO : PROGRESS: saving document #1596000
INFO : PROGRESS: saving document #1597000
INFO : PROGRESS: saving document #1598000
INFO : PROGRESS: saving document #1599000
INFO : PROGRESS: saving document #1600000
INFO : PROGRESS: saving document #1601000
INFO : PROGRESS: saving document #1602000
INFO : PROGRESS: saving document #1603000
INFO : PROGRESS: saving document #1604000
INFO : PROGRESS: saving document #1605000
INFO : PROGRESS: saving document #1606000
INFO : PROGRESS: saving document #1607000
INFO : PROGRESS: saving document #1608000
INFO : PROGRESS: saving document #1609000
INFO : PROGRESS: saving document #1610000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #1784000
INFO : PROGRESS: saving document #1785000
INFO : PROGRESS: saving document #1786000
INFO : PROGRESS: saving document #1787000
INFO : PROGRESS: saving document #1788000
INFO : PROGRESS: saving document #1789000
INFO : PROGRESS: saving document #1790000
INFO : PROGRESS: saving document #1791000
INFO : PROGRESS: saving document #1792000
INFO : PROGRESS: saving document #1793000
INFO : PROGRESS: saving document #1794000
INFO : PROGRESS: saving document #1795000
INFO : PROGRESS: saving document #1796000
INFO : PROGRESS: saving document #1797000
INFO : PROGRESS: saving document #1798000
INFO : PROGRESS: saving document #1799000
INFO : PROGRESS: saving document #1800000
INFO : PROGRESS: saving document #1801000
INFO : PROGRESS: saving document #1802000
INFO : PROGRESS: saving document #1803000
INFO : PROGRESS: saving document #1804000
INFO : PROGRESS: saving document #1805000
INFO : PROGRESS: saving document #1806000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #1980000
INFO : PROGRESS: saving document #1981000
INFO : PROGRESS: saving document #1982000
INFO : PROGRESS: saving document #1983000
INFO : PROGRESS: saving document #1984000
INFO : PROGRESS: saving document #1985000
INFO : PROGRESS: saving document #1986000
INFO : PROGRESS: saving document #1987000
INFO : PROGRESS: saving document #1988000
INFO : PROGRESS: saving document #1989000
INFO : PROGRESS: saving document #1990000
INFO : PROGRESS: saving document #1991000
INFO : PROGRESS: saving document #1992000
INFO : PROGRESS: saving document #1993000
INFO : PROGRESS: saving document #1994000
INFO : PROGRESS: saving document #1995000
INFO : PROGRESS: saving document #1996000
INFO : PROGRESS: saving document #1997000
INFO : PROGRESS: saving document #1998000
INFO : PROGRESS: saving document #1999000
INFO : PROGRESS: saving document #2000000
INFO : PROGRESS: saving document #2001000
INFO : PROGRESS: saving document #2002000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #2176000
INFO : PROGRESS: saving document #2177000
INFO : PROGRESS: saving document #2178000
INFO : PROGRESS: saving document #2179000
INFO : PROGRESS: saving document #2180000
INFO : PROGRESS: saving document #2181000
INFO : PROGRESS: saving document #2182000
INFO : PROGRESS: saving document #2183000
INFO : PROGRESS: saving document #2184000
INFO : PROGRESS: saving document #2185000
INFO : PROGRESS: saving document #2186000
INFO : PROGRESS: saving document #2187000
INFO : PROGRESS: saving document #2188000
INFO : PROGRESS: saving document #2189000
INFO : PROGRESS: saving document #2190000
INFO : PROGRESS: saving document #2191000
INFO : PROGRESS: saving document #2192000
INFO : PROGRESS: saving document #2193000
INFO : PROGRESS: saving document #2194000
INFO : PROGRESS: saving document #2195000
INFO : PROGRESS: saving document #2196000
INFO : PROGRESS: saving document #2197000
INFO : PROGRESS: saving document #2198000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #2372000
INFO : PROGRESS: saving document #2373000
INFO : PROGRESS: saving document #2374000
INFO : PROGRESS: saving document #2375000
INFO : PROGRESS: saving document #2376000
INFO : PROGRESS: saving document #2377000
INFO : PROGRESS: saving document #2378000
INFO : PROGRESS: saving document #2379000
INFO : PROGRESS: saving document #2380000
INFO : PROGRESS: saving document #2381000
INFO : PROGRESS: saving document #2382000
INFO : PROGRESS: saving document #2383000
INFO : PROGRESS: saving document #2384000
INFO : PROGRESS: saving document #2385000
INFO : PROGRESS: saving document #2386000
INFO : PROGRESS: saving document #2387000
INFO : PROGRESS: saving document #2388000
INFO : PROGRESS: saving document #2389000
INFO : PROGRESS: saving document #2390000
INFO : PROGRESS: saving document #2391000
INFO : PROGRESS: saving document #2392000
INFO : PROGRESS: saving document #2393000
INFO : PROGRESS: saving document #2394000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #2568000
INFO : PROGRESS: saving document #2569000
INFO : PROGRESS: saving document #2570000
INFO : PROGRESS: saving document #2571000
INFO : PROGRESS: saving document #2572000
INFO : PROGRESS: saving document #2573000
INFO : PROGRESS: saving document #2574000
INFO : PROGRESS: saving document #2575000
INFO : PROGRESS: saving document #2576000
INFO : PROGRESS: saving document #2577000
INFO : PROGRESS: saving document #2578000
INFO : PROGRESS: saving document #2579000
INFO : PROGRESS: saving document #2580000
INFO : PROGRESS: saving document #2581000
INFO : PROGRESS: saving document #2582000
INFO : PROGRESS: saving document #2583000
INFO : PROGRESS: saving document #2584000
INFO : PROGRESS: saving document #2585000
INFO : PROGRESS: saving document #2586000
INFO : PROGRESS: saving document #2587000
INFO : PROGRESS: saving document #2588000
INFO : PROGRESS: saving document #2589000
INFO : PROGRESS: saving document #2590000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #2764000
INFO : PROGRESS: saving document #2765000
INFO : PROGRESS: saving document #2766000
INFO : PROGRESS: saving document #2767000
INFO : PROGRESS: saving document #2768000
INFO : PROGRESS: saving document #2769000
INFO : PROGRESS: saving document #2770000
INFO : PROGRESS: saving document #2771000
INFO : PROGRESS: saving document #2772000
INFO : PROGRESS: saving document #2773000
INFO : PROGRESS: saving document #2774000
INFO : PROGRESS: saving document #2775000
INFO : PROGRESS: saving document #2776000
INFO : PROGRESS: saving document #2777000
INFO : PROGRESS: saving document #2778000
INFO : PROGRESS: saving document #2779000
INFO : PROGRESS: saving document #2780000
INFO : PROGRESS: saving document #2781000
INFO : PROGRESS: saving document #2782000
INFO : PROGRESS: saving document #2783000
INFO : PROGRESS: saving document #2784000
INFO : PROGRESS: saving document #2785000
INFO : PROGRESS: saving document #2786000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #2960000
INFO : PROGRESS: saving document #2961000
INFO : PROGRESS: saving document #2962000
INFO : PROGRESS: saving document #2963000
INFO : PROGRESS: saving document #2964000
INFO : PROGRESS: saving document #2965000
INFO : PROGRESS: saving document #2966000
INFO : PROGRESS: saving document #2967000
INFO : PROGRESS: saving document #2968000
INFO : PROGRESS: saving document #2969000
INFO : PROGRESS: saving document #2970000
INFO : PROGRESS: saving document #2971000
INFO : PROGRESS: saving document #2972000
INFO : PROGRESS: saving document #2973000
INFO : PROGRESS: saving document #2974000
INFO : PROGRESS: saving document #2975000
INFO : PROGRESS: saving document #2976000
INFO : PROGRESS: saving document #2977000
INFO : PROGRESS: saving document #2978000
INFO : PROGRESS: saving document #2979000
INFO : PROGRESS: saving document #2980000
INFO : PROGRESS: saving document #2981000
INFO : PROGRESS: saving document #2982000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #3156000
INFO : PROGRESS: saving document #3157000
INFO : PROGRESS: saving document #3158000
INFO : PROGRESS: saving document #3159000
INFO : PROGRESS: saving document #3160000
INFO : PROGRESS: saving document #3161000
INFO : PROGRESS: saving document #3162000
INFO : PROGRESS: saving document #3163000
INFO : PROGRESS: saving document #3164000
INFO : PROGRESS: saving document #3165000
INFO : PROGRESS: saving document #3166000
INFO : PROGRESS: saving document #3167000
INFO : PROGRESS: saving document #3168000
INFO : PROGRESS: saving document #3169000
INFO : PROGRESS: saving document #3170000
INFO : PROGRESS: saving document #3171000
INFO : PROGRESS: saving document #3172000
INFO : PROGRESS: saving document #3173000
INFO : PROGRESS: saving document #3174000
INFO : PROGRESS: saving document #3175000
INFO : PROGRESS: saving document #3176000
INFO : PROGRESS: saving document #3177000
INFO : PROGRESS: saving document #3178000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #3352000
INFO : PROGRESS: saving document #3353000
INFO : PROGRESS: saving document #3354000
INFO : PROGRESS: saving document #3355000
INFO : PROGRESS: saving document #3356000
INFO : PROGRESS: saving document #3357000
INFO : PROGRESS: saving document #3358000
INFO : PROGRESS: saving document #3359000
INFO : PROGRESS: saving document #3360000
INFO : PROGRESS: saving document #3361000
INFO : PROGRESS: saving document #3362000
INFO : PROGRESS: saving document #3363000
INFO : PROGRESS: saving document #3364000
INFO : PROGRESS: saving document #3365000
INFO : PROGRESS: saving document #3366000
INFO : PROGRESS: saving document #3367000
INFO : PROGRESS: saving document #3368000
INFO : PROGRESS: saving document #3369000
INFO : PROGRESS: saving document #3370000
INFO : PROGRESS: saving document #3371000
INFO : PROGRESS: saving document #3372000
INFO : PROGRESS: saving document #3373000
INFO : PROGRESS: saving document #3374000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #3548000
INFO : PROGRESS: saving document #3549000
INFO : PROGRESS: saving document #3550000
INFO : PROGRESS: saving document #3551000
INFO : PROGRESS: saving document #3552000
INFO : PROGRESS: saving document #3553000
INFO : PROGRESS: saving document #3554000
INFO : PROGRESS: saving document #3555000
INFO : PROGRESS: saving document #3556000
INFO : PROGRESS: saving document #3557000
INFO : PROGRESS: saving document #3558000
INFO : PROGRESS: saving document #3559000
INFO : PROGRESS: saving document #3560000
INFO : PROGRESS: saving document #3561000
INFO : PROGRESS: saving document #3562000
INFO : PROGRESS: saving document #3563000
INFO : PROGRESS: saving document #3564000
INFO : PROGRESS: saving document #3565000
INFO : PROGRESS: saving document #3566000
INFO : PROGRESS: saving document #3567000
INFO : PROGRESS: saving document #3568000
INFO : PROGRESS: saving document #3569000
INFO : PROGRESS: saving document #3570000
INFO : PROGRESS: saving document #

INFO : PROGRESS: saving document #3744000
INFO : PROGRESS: saving document #3745000
INFO : PROGRESS: saving document #3746000
INFO : PROGRESS: saving document #3747000
INFO : PROGRESS: saving document #3748000
INFO : PROGRESS: saving document #3749000
INFO : PROGRESS: saving document #3750000
INFO : PROGRESS: saving document #3751000
INFO : PROGRESS: saving document #3752000
INFO : PROGRESS: saving document #3753000
INFO : PROGRESS: saving document #3754000
INFO : PROGRESS: saving document #3755000
INFO : PROGRESS: saving document #3756000
INFO : PROGRESS: saving document #3757000
INFO : PROGRESS: saving document #3758000
INFO : PROGRESS: saving document #3759000
INFO : PROGRESS: saving document #3760000
INFO : PROGRESS: saving document #3761000
INFO : PROGRESS: saving document #3762000
INFO : PROGRESS: saving document #3763000
INFO : PROGRESS: saving document #3764000
INFO : PROGRESS: saving document #3765000
INFO : PROGRESS: saving document #3766000
INFO : PROGRESS: saving document #

CPU times: user 7h 24min 52s, sys: 4min 46s, total: 7h 29min 38s
Wall time: 7h 31min 33s


In [16]:
# Open the corpus serialized to ./data/wiki_bow.mm; the recorded run took about a second.
%time mm_corpus = gensim.corpora.MmCorpus('./data/wiki_bow.mm')
print(mm_corpus)

INFO : loaded corpus index from ./data/wiki_bow.mm.index
INFO : initializing corpus reader from ./data/wiki_bow.mm
INFO : accepted corpus with 3880459 documents, 100000 features, 635764655 non-zero entries


CPU times: user 848 ms, sys: 193 ms, total: 1.04 s
Wall time: 1.14 s
MmCorpus(3880459 documents, 100000 features, 635764655 non-zero entries)


In [17]:
# first document read back from disk: the same (token_id, count) pairs as the
# streamed `vector` printed earlier, except that the counts now come back as floats
print(next(iter(mm_corpus)))

[(12, 1.0), (54, 1.0), (219, 1.0), (311, 1.0), (336, 1.0), (351, 2.0), (417, 1.0), (456, 2.0), (501, 1.0), (509, 1.0), (650, 1.0), (692, 2.0), (740, 2.0), (861, 1.0), (897, 1.0), (942, 1.0), (943, 7.0), (957, 2.0), (1059, 1.0), (1119, 1.0), (1156, 1.0), (1223, 1.0), (1265, 1.0), (1285, 1.0), (1342, 2.0), (1478, 1.0), (1554, 1.0), (1589, 1.0), (1592, 1.0), (1668, 1.0), (1686, 1.0), (1708, 1.0), (1726, 1.0), (1764, 1.0), (1765, 1.0), (1770, 1.0), (1791, 1.0), (1915, 7.0), (2001, 1.0), (2012, 5.0), (2079, 1.0), (2118, 1.0), (2146, 1.0), (2168, 1.0), (2210, 1.0), (2282, 2.0), (2296, 10.0), (2325, 2.0), (2394, 1.0), (2561, 5.0), (2620, 1.0), (2646, 1.0), (3177, 1.0), (3182, 1.0), (3283, 1.0), (3318, 5.0), (3364, 1.0), (3384, 1.0), (3436, 2.0), (3463, 1.0), (3490, 4.0), (3566, 2.0), (3582, 1.0), (3745, 1.0), (3789, 2.0), (3810, 1.0), (3891, 2.0), (3963, 1.0), (3976, 1.0), (4012, 1.0), (4072, 1.0), (4081, 1.0), (4134, 1.0), (4197, 1.0), (4234, 1.0), (4240, 2.0), (4306, 2.0), (4438, 1.0), (445

### Semantic transformations

In [18]:
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000)  # use fewer documents during training, LDA is slow
# ClippedCorpus new in gensim 0.10.1
# copy&paste it from https://github.com/piskvorky/gensim/blob/0.10.1/gensim/utils.py#L467 if necessary (or upgrade your gensim)
%time lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=10, id2word=id2word_wiki, passes=4)

INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 1e-05
INFO : using serial LDA version on this node
INFO : running online (multi-pass) LDA training, 10 topics, 4 passes over the supplied corpus of 4000 documents, updating model once every 2000 documents, evaluating perplexity every 4000 documents, iterating 50x with a convergence threshold of 0.001000
INFO : PROGRESS: pass 0, at document #2000/4000
INFO : merging changes from 2000 documents into a model of 4000 documents
INFO : topic #1 (0.100): 0.002*"black" + 0.002*"player" + 0.001*"common" + 0.001*"art" + 0.001*"french" + 0.001*"example" + 0.001*"blue" + 0.001*"king" + 0.001*"population" + 0.001*"league"
INFO : topic #9 (0.100): 0.001*"king" + 0.001*"book" + 0.001*"church" + 0.001*"french" + 0.001*"player" + 0.001*"al" + 0.001*"river" + 0.001*"bc" + 0.001*"modern" + 0.001*"sea"
INFO : topic #3 (0.100): 0.001*"black" + 0.001*"language" + 0.001*"common" + 0.001*"aircraft" + 0.001*"population" + 0.001*"modern" + 0.001*

INFO : topic #8 (0.100): 0.004*"population" + 0.003*"island" + 0.003*"islands" + 0.003*"largest" + 0.002*"region" + 0.002*"river" + 0.002*"sea" + 0.002*"park" + 0.002*"areas" + 0.002*"million"
INFO : topic #0 (0.100): 0.003*"church" + 0.003*"population" + 0.003*"countries" + 0.003*"law" + 0.003*"relations" + 0.003*"european" + 0.003*"court" + 0.002*"embassy" + 0.002*"republic" + 0.002*"europe"
INFO : topic diff=0.456681, rho=0.408248
INFO : -9.152 per-word bound, 568.8 perplexity estimate based on a held-out corpus of 2000 documents with 3039389 words
INFO : PROGRESS: pass 3, at document #4000/4000
INFO : merging changes from 2000 documents into a model of 4000 documents
INFO : topic #8 (0.100): 0.004*"population" + 0.003*"islands" + 0.003*"island" + 0.003*"largest" + 0.002*"river" + 0.002*"region" + 0.002*"building" + 0.002*"park" + 0.002*"areas" + 0.002*"sea"
INFO : topic #3 (0.100): 0.004*"species" + 0.004*"water" + 0.003*"carbon" + 0.003*"air" + 0.002*"earth" + 0.002*"common" + 0.0

CPU times: user 7min 26s, sys: 5.22 s, total: 7min 32s
Wall time: 4min 41s


In [19]:
# The return value is thrown away (bound to `_`); the topic lines shown below come from gensim's INFO logging.
# NOTE(review): -1 presumably asks for every topic rather than a sample -- confirm against the gensim docs.
_ = lda_model.print_topics(-1)  # print a few most important words for each LDA topic

INFO : topic #0 (0.100): 0.004*"law" + 0.004*"countries" + 0.004*"population" + 0.003*"church" + 0.003*"european" + 0.003*"relations" + 0.003*"republic" + 0.003*"croatia" + 0.003*"china" + 0.003*"embassy"
INFO : topic #1 (0.100): 0.005*"function" + 0.005*"theory" + 0.005*"numbers" + 0.005*"example" + 0.004*"space" + 0.004*"point" + 0.003*"field" + 0.003*"energy" + 0.003*"functions" + 0.003*"value"
INFO : topic #2 (0.100): 0.008*"game" + 0.005*"games" + 0.004*"league" + 0.004*"album" + 0.003*"band" + 0.003*"players" + 0.003*"play" + 0.003*"record" + 0.003*"video" + 0.002*"player"
INFO : topic #3 (0.100): 0.004*"species" + 0.004*"water" + 0.003*"carbon" + 0.003*"air" + 0.002*"earth" + 0.002*"common" + 0.002*"low" + 0.002*"surface" + 0.002*"body" + 0.002*"pressure"
INFO : topic #4 (0.100): 0.007*"language" + 0.005*"languages" + 0.004*"data" + 0.004*"example" + 0.003*"common" + 0.003*"systems" + 0.003*"code" + 0.003*"word" + 0.002*"software" + 0.002*"words"
INFO : topic #5 (0.100): 0.005*"

### Unseen docs

In [21]:
text = "A blood cell, also called a hematocyte, is a cell produced by hematopoiesis and normally found in blood."

# transform text into the bag-of-words space
# (words missing from the dictionary are dropped, as the shorter output below shows)
bow_vector = id2word_wiki.doc2bow(tokenize(text))
# Map each token id back to its word for display. The loop variable is named
# `token_id` rather than `id`: in Python 2 a list-comprehension variable leaks into
# the enclosing scope, so `id` would overwrite the builtin for the rest of the notebook.
print([(id2word_wiki[token_id], count) for token_id, count in bow_vector])

[(u'blood', 2), (u'normally', 1), (u'produced', 1), (u'cell', 2)]


In [23]:
# project the bag-of-words vector into LDA topic space: a list of (topic_id, weight) pairs
lda_vector = lda_model[bow_vector]
print(lda_vector)
# pick the pair with the largest weight and show the word mix of that topic
top_topic_id, _top_weight = max(lda_vector, key=lambda pair: pair[1])
print(lda_model.print_topic(top_topic_id))

[(0, 0.014286218690270584), (1, 0.014286781475026874), (2, 0.014286877323336417), (3, 0.01429047597823874), (4, 0.014287298704011474), (5, 0.014286279604259504), (6, 0.87141660561249323), (7, 0.014286272711959151), (8, 0.014286605625869225), (9, 0.014286584274534882)]
0.006*"acid" + 0.004*"chemical" + 0.004*"al" + 0.004*"dna" + 0.003*"cell" + 0.003*"reaction" + 0.003*"water" + 0.003*"tree" + 0.003*"chemistry" + 0.003*"acids"


### Saving and loading

In [24]:
# store all trained models to disk
# Only the LDA model and the dictionary are actually written here; the LSI and
# TF-IDF saves below are commented out.
lda_model.save('./data/lda_wiki.model')
#lsi_model.save('./data/lsi_wiki.model')
#tfidf_model.save('./data/tfidf_wiki.model')
id2word_wiki.save('./data/wiki.dictionary')

INFO : saving LdaState object under ./data/lda_wiki.model.state, separately None
INFO : saved ./data/lda_wiki.model.state
INFO : saving LdaModel object under ./data/lda_wiki.model, separately ['expElogbeta', 'sstats']
INFO : not storing attribute id2word
INFO : storing np array 'expElogbeta' to ./data/lda_wiki.model.expElogbeta.npy
INFO : not storing attribute state
INFO : not storing attribute dispatcher
INFO : saved ./data/lda_wiki.model
INFO : saving Dictionary object under ./data/wiki.dictionary, separately None
INFO : saved ./data/wiki.dictionary


In [25]:
# Reload the LDA model from the path it was saved to in the previous cell.
# NOTE(review): the load log reports "setting ignored attribute id2word to None",
# so the reloaded model may lack its word mapping and not be fully equal to
# `lda_model` -- confirm before relying on `same_lda_model` for topic printing.
same_lda_model = gensim.models.LdaModel.load('./data/lda_wiki.model')

INFO : loading LdaModel object from ./data/lda_wiki.model
INFO : loading expElogbeta from ./data/lda_wiki.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded ./data/lda_wiki.model
INFO : loading LdaModel object from ./data/lda_wiki.model.state
INFO : loaded ./data/lda_wiki.model.state


In [29]:
# Reload the saved id -> word dictionary (same class, reached through the package-level export).
same_id2word_wiki = gensim.corpora.Dictionary.load('./data/wiki.dictionary')

INFO : loading Dictionary object from ./data/wiki.dictionary
INFO : loaded ./data/wiki.dictionary


In [30]:
new_mm_corpus = gensim.corpora.MmCorpus('./data/wiki_bow.mm')
new_clipped_corpus = gensim.utils.ClippedCorpus(new_mm_corpus, 5000)  # use fewer documents during training, LDA is slow
# ClippedCorpus new in gensim 0.10.1
# copy&paste it from https://github.com/piskvorky/gensim/blob/0.10.1/gensim/utils.py#L467 if necessary (or upgrade your gensim)
%time new_lda_model = gensim.models.LdaModel(new_clipped_corpus, num_topics=10, id2word=same_id2word_wiki, passes=4)

INFO : loaded corpus index from ./data/wiki_bow.mm.index
INFO : initializing corpus reader from ./data/wiki_bow.mm
INFO : accepted corpus with 3880459 documents, 100000 features, 635764655 non-zero entries
INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 1e-05
INFO : using serial LDA version on this node
INFO : running online (multi-pass) LDA training, 10 topics, 4 passes over the supplied corpus of 5000 documents, updating model once every 2000 documents, evaluating perplexity every 5000 documents, iterating 50x with a convergence threshold of 0.001000
INFO : PROGRESS: pass 0, at document #2000/5000
INFO : merging changes from 2000 documents into a model of 5000 documents
INFO : topic #6 (0.100): 0.002*"book" + 0.002*"church" + 0.001*"modern" + 0.001*"px" + 0.001*"language" + 0.001*"sea" + 0.001*"alexander" + 0.001*"art" + 0.001*"common" + 0.001*"star"
INFO : topic #1 (0.100): 0.002*"black" + 0.002*"player" + 0.001*"german" + 0.001*"language" + 0.001*"species" + 0.001

INFO : topic #8 (0.100): 0.006*"language" + 0.004*"party" + 0.004*"software" + 0.003*"languages" + 0.003*"systems" + 0.003*"code" + 0.003*"apple" + 0.003*"example" + 0.002*"court" + 0.002*"data"
INFO : topic #1 (0.100): 0.003*"population" + 0.003*"river" + 0.003*"island" + 0.003*"islands" + 0.003*"species" + 0.003*"sea" + 0.003*"million" + 0.002*"region" + 0.002*"largest" + 0.002*"land"
INFO : topic #9 (0.100): 0.004*"emperor" + 0.003*"king" + 0.003*"human" + 0.002*"philosophy" + 0.002*"study" + 0.002*"theory" + 0.002*"england" + 0.002*"society" + 0.002*"science" + 0.002*"imperial"
INFO : topic diff=0.574311, rho=0.447214
INFO : PROGRESS: pass 2, at document #4000/5000
INFO : merging changes from 2000 documents into a model of 5000 documents
INFO : topic #6 (0.100): 0.011*"church" + 0.006*"god" + 0.004*"christian" + 0.004*"greek" + 0.004*"book" + 0.004*"bc" + 0.003*"earth" + 0.003*"christ" + 0.003*"roman" + 0.003*"star"
INFO : topic #1 (0.100): 0.004*"population" + 0.003*"river" + 0.00

CPU times: user 5min 52s, sys: 4.13 s, total: 5min 56s
Wall time: 3min 28s


In [31]:
# Same blood-cell sentence as in the "unseen docs" example, this time encoded
# with the dictionary that was reloaded from disk.
same_text = (
    "A blood cell, also called a hematocyte, is a cell produced by "
    "hematopoiesis and normally found in blood."
)

# Tokenize and convert to a bag-of-words vector.
same_bow_vector = same_id2word_wiki.doc2bow(tokenize(same_text))

In [32]:
# Map the bag-of-words vector into the topic space of the retrained model.
new_lda_vector = new_lda_model[same_bow_vector]
print(new_lda_vector)
# Find the id of the highest-weighted topic, then print that topic's top words.
new_top_topic = max(new_lda_vector, key=lambda pair: pair[1])[0]
print(new_lda_model.print_topic(new_top_topic))

[(0, 0.014286071976500917), (1, 0.014286180434987181), (2, 0.87141896006581099), (3, 0.01428876996402528), (4, 0.014286725971596163), (5, 0.014287604277777007), (6, 0.014286540404302155), (7, 0.01428595030764699), (8, 0.0142868528306424), (9, 0.014286343766710785)]
0.007*"air" + 0.006*"aircraft" + 0.004*"cell" + 0.003*"cells" + 0.003*"engine" + 0.003*"force" + 0.002*"dna" + 0.002*"flight" + 0.002*"blood" + 0.002*"speed"


In [33]:
# Dominant (topic_id, weight) pair of `new_lda_vector`.
max(new_lda_vector, key=lambda pair: pair[1])

(2, 0.87141896006581099)

In [46]:
# Show the 100 highest-weighted words of the dominant topic of `new_lda_vector`.
# The topic id is derived from the vector instead of being hard-coded, because
# topic numbering changes whenever the model is retrained.
new_lda_model.print_topic(max(new_lda_vector, key=lambda item: item[1])[0], topn=100)

u'0.007*"air" + 0.006*"aircraft" + 0.004*"cell" + 0.003*"cells" + 0.003*"engine" + 0.003*"force" + 0.002*"dna" + 0.002*"flight" + 0.002*"blood" + 0.002*"speed" + 0.002*"production" + 0.002*"navy" + 0.002*"engines" + 0.002*"water" + 0.002*"fuel" + 0.002*"type" + 0.002*"range" + 0.002*"protein" + 0.002*"ships" + 0.002*"body" + 0.002*"proteins" + 0.002*"control" + 0.002*"mission" + 0.001*"space" + 0.001*"class" + 0.001*"disease" + 0.001*"crew" + 0.001*"oxygen" + 0.001*"research" + 0.001*"membrane" + 0.001*"nuclear" + 0.001*"fleet" + 0.001*"surface" + 0.001*"low" + 0.001*"light" + 0.001*"design" + 0.001*"apollo" + 0.001*"german" + 0.001*"species" + 0.001*"similar" + 0.001*"effects" + 0.001*"gun" + 0.001*"pressure" + 0.001*"bacteria" + 0.001*"ship" + 0.001*"car" + 0.001*"enigma" + 0.001*"enzyme" + 0.001*"test" + 0.001*"produced" + 0.001*"battle" + 0.001*"cancer" + 0.001*"diesel" + 0.001*"program" + 0.001*"mosquito" + 0.001*"hours" + 0.001*"enzymes" + 0.001*"military" + 0.001*"mk" + 0.001*"r

In [54]:
# Second, unrelated sentence: encode it and find its dominant topic.
other_text = "Tigers with stripes like to go hunting in the forests of India."
tiger_bow_vec = same_id2word_wiki.doc2bow(tokenize(other_text))
tiger_lda_vec = new_lda_model[tiger_bow_vec]

# Compute the dominant topic id once and reuse it below; a hard-coded id would
# silently point at a different topic after the model is retrained.
tiger_top_topic = max(tiger_lda_vec, key=lambda item: item[1])[0]
print(new_lda_model.print_topic(tiger_top_topic))

# One word per line. `print(...)` with a single argument behaves the same on
# Python 2 and 3, unlike the `print x` statement form.
for word in new_lda_model.show_topic(tiger_top_topic):
    print(word[0])

# Comma-separated list of the 12 top words (displayed as the cell's result).
', '.join([item[0] for item in new_lda_model.show_topic(tiger_top_topic, topn=12)])

0.005*"population" + 0.004*"islands" + 0.004*"river" + 0.003*"island" + 0.003*"million" + 0.003*"species" + 0.003*"largest" + 0.003*"sea" + 0.003*"region" + 0.002*"land"
population
islands
river
island
million
species
largest
sea
region
land


u'population, islands, river, island, million, species, largest, sea, region, land, areas, northern'

In [42]:
# Jensen-Shannon comparison of the blood-cell and tiger topic distributions.
# The topic vectors are sparse (topic_id, weight) lists that may omit low-probability
# topics, so the dimensionality is passed explicitly; otherwise gensim sizes the dense
# vectors from the longer list, which can be too short for the topic ids present.
gensim.matutils.jensen_shannon(new_lda_vector, tiger_lda_vec, num_features=new_lda_model.num_topics)

0.5172540545463562

In [43]:
# Variant of the blood-cell sentence ("that is red" replaces "also called a hematocyte").
same_text_2 = (
    "A blood cell, that is red, is a cell produced by "
    "hematopoiesis and normally found in blood."
)
blood_bow_vec = same_id2word_wiki.doc2bow(tokenize(same_text_2))
blood_lda_vec = new_lda_model[blood_bow_vec]
# Report the dominant topic for this variant.
blood_top_topic = max(blood_lda_vec, key=lambda pair: pair[1])[0]
print(new_lda_model.print_topic(blood_top_topic))

0.007*"air" + 0.006*"aircraft" + 0.004*"cell" + 0.003*"cells" + 0.003*"engine" + 0.003*"force" + 0.002*"dna" + 0.002*"flight" + 0.002*"blood" + 0.002*"speed"


In [44]:
# Jensen-Shannon comparison of the two blood-cell sentence variants.
# The dimensionality is passed explicitly because the sparse topic vectors may omit
# low-probability topics, in which case gensim's length-based default can be too short.
gensim.matutils.jensen_shannon(new_lda_vector, blood_lda_vec, num_features=new_lda_model.num_topics)

0.00030454326770268381