In [16]:
from __future__ import print_function
import logging
import itertools

import numpy as np
import gensim

# Show INFO-level log messages (e.g. progress reports from gensim) in the
# notebook, formatted as "LEVEL : message".
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# basicConfig() is a no-op when the root logger already has handlers, so the
# level is also set explicitly.  Use the public setLevel() API rather than
# assigning to `logging.root.level`, so the level is validated and the logging
# module's own bookkeeping stays consistent.
logging.getLogger().setLevel(logging.INFO)

def head(stream, n=10):
    """Return the first `n` items of `stream` as a list.

    `stream` may be any iterable.  If it is an iterator, it is advanced past
    the items that were taken.  Fewer than `n` items are returned when the
    iterable is shorter than `n`.
    """
    leading_items = itertools.islice(stream, n)
    return [item for item in leading_items]


In [17]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    """Tokenize `text` with `simple_preprocess` and drop every stopword.

    Returns the list of tokens that are not in `STOPWORDS`, in their original
    order.
    """
    kept_tokens = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept_tokens.append(word)
    return kept_tokens

def iter_wiki(dump_file, min_tokens=50):
    """Yield `(title, tokens)` for each usable page of a wiki XML dump.

    A page is skipped when its title starts with one of the ignored namespace
    prefixes (for example ``Category:``), or when it has fewer than
    `min_tokens` tokens after markup filtering and tokenization.

    Parameters
    ----------
    dump_file
        Path of the dump; it is opened with `smart_open`.
    min_tokens
        Minimum number of tokens a page must have to be yielded (default 50).

    Yields
    ------
    (title, tokens)
        The page title and the token list produced by `tokenize`.
    """
    ignored_prefixes = tuple(
        namespace + ":"
        for namespace in (
            'Wikipedia', 'Category', 'File', 'Portal', 'Template',
            'MediaWiki', 'User', 'Help', 'Book', 'Draft',
        )
    )

    # The `with` block closes the dump once the generator is exhausted or
    # closed, instead of leaving the file handle open.
    with smart_open(dump_file) as dump:
        for title, text, _pageid in _extract_pages(dump):
            # Cheap title test first, so pages in ignored namespaces never pay
            # for markup filtering and tokenization.
            if title.startswith(ignored_prefixes):
                continue
            tokens = tokenize(filter_wiki(text))
            if len(tokens) < min_tokens:
                continue
            yield title, tokens

In [18]:
# Name the dump path once so both iter_wiki() calls read the same file.
wiki_dump_path = './data/simplewiki-20180101-pages-articles.xml.bz2'
stream = iter_wiki(wiki_dump_path)
# Preview the first 8 articles using a separate, fresh iterator; `stream`
# itself is not consumed by this cell.
first_articles = itertools.islice(iter_wiki(wiki_dump_path), 8)
for title, tokens in first_articles:
    print(title, tokens[:10])

April [u'april', u'th', u'month', u'year', u'comes', u'march', u'months', u'days', u'april', u'begins']
August [u'august', u'aug', u'th', u'month', u'year', u'gregorian', u'calendar', u'coming', u'july', u'september']
Art [u'painting', u'renoir', u'work', u'art', u'art', u'creative', u'activity', u'people', u'people', u'called']
A [u'page', u'letter', u'alphabet', u'indefinite', u'article', u'article', u'grammar', u'uses', u'disambiguation', u'thumb']
Air [u'air', u'fan', u'air', u'air', u'earth', u'atmosphere', u'air', u'mixture', u'gases', u'dust']
Autonomous communities of Spain [u'spain', u'divided', u'parts', u'called', u'autonomous', u'communities', u'autonomous', u'means', u'autonomous', u'communities']
Alan Turing [u'statue', u'alan', u'turing', u'rebuild', u'machine', u'alan', u'turing', u'alan', u'mathison', u'turing']
Alanis Morissette [u'alanis', u'nadine', u'morissette', u'born', u'june', u'grammy', u'award', u'winning', u'canadian', u'american']


In [19]:
# Lazy, single-pass stream of token lists (titles dropped); once it has been
# iterated over it must be re-created before it can be used again.
doc_stream = (
    doc_tokens
    for _title, doc_tokens
    in iter_wiki('./data/simplewiki-20180101-pages-articles.xml.bz2')
)

In [20]:
# Build a gensim Dictionary from every token list in `doc_stream`; the `%time`
# line magic reports the CPU and wall-clock time the statement took.
%time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
# Print the dictionary's summary representation.
print(id2word_wiki)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(157080 unique tokens: [u'fawn', u'\u03c9\u0431\u0440\u0430\u0434\u043e\u0432\u0430\u043d\u043d\u0430\u0467', u'vang', u'yollar\u0131', u'idaira']...)
INFO : adding document #20000 to Dictionary(233346 unique tokens: [u'biennials', u'sowela', u'tsukino', u'clottes', u'refreshable']...)
INFO : adding document #30000 to Dictionary(293307 unique tokens: [u'biennials', u'sowela', u'tsukino', u'clottes', u'klatki']...)
INFO : adding document #40000 to Dictionary(368196 unique tokens: [u'biennials', u'sowela', u'biysk', u'sermersheim', u'wooda']...)
INFO : adding document #50000 to Dictionary(416860 unique tokens: [u'biennials', u'sowela', u'biysk', u'sermersheim', u'wooda']...)
INFO : adding document #60000 to Dictionary(454791 unique tokens: [u'biennials', u'sowela', u'biysk', u'sermersheim', u'wooda']...)
INFO : built Dictionary(481491 unique tokens: [u'biennials', u'sowela', u'biysk', 

CPU times: user 6min 59s, sys: 4.96 s, total: 7min 4s
Wall time: 7min 7s
Dictionary(481491 unique tokens: [u'biennials', u'sowela', u'biysk', u'sermersheim', u'wooda']...)
