In [9]:
# import and setup modules we'll be using in this notebook
import logging
import itertools

import numpy as np
import gensim

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

def head(stream, n=10):
    """Convenience fnc: return the first `n` elements of the stream, as plain list."""
    return list(itertools.islice(stream, n))

In [10]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens

In [3]:

# only use simplewiki in this tutorial (fewer documents)
# the full wiki dump is exactly the same format, but larger
stream = iter_wiki('simplewiki-latest-pages-articles.xml.bz2')
for title, tokens in itertools.islice(iter_wiki('simplewiki-latest-pages-articles.xml.bz2'), 8):
    print(title, tokens[:10])  # print the article title and its first ten tokens

April ['april', 'th', 'month', 'year', 'comes', 'march', 'months', 'days', 'april', 'begins']
August ['august', 'aug', 'th', 'month', 'year', 'gregorian', 'calendar', 'coming', 'july', 'september']
Art ['painting', 'renoir', 'work', 'art', 'art', 'creative', 'activity', 'expresses', 'imaginative', 'technical']
A ['thumb', 'letter', 'english', 'alphabet', 'small', 'letter', 'lower', 'case', 'vowel', 'english']
Air ['air', 'fan', 'air', 'air', 'earth', 'atmosphere', 'air', 'mixture', 'gases', 'dust']
Autonomous communities of Spain ['spain', 'divided', 'parts', 'called', 'autonomous', 'communities', 'autonomous', 'means', 'autonomous', 'communities']
Alan Turing ['statue', 'alan', 'turing', 'rebuild', 'machine', 'alan', 'turing', 'alan', 'mathison', 'turing']
Alanis Morissette ['alanis', 'nadine', 'morissette', 'born', 'june', 'grammy', 'award', 'winning', 'canadian', 'american']


In [4]:

id2word = {0: u'word', 2: u'profit', 300: u'another_word'}

In [5]:
doc_stream = (tokens for _, tokens in iter_wiki('simplewiki-latest-pages-articles.xml.bz2'))

In [15]:
%time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
print(id2word_wiki)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(159953 unique tokens: ['cockney', 'narrow', 'arrhythmic', 'studying', 'emigrants']...)
INFO : adding document #20000 to Dictionary(235552 unique tokens: ['cockney', 'wheeling', 'nordland', 'aspendale', 'haversack']...)
INFO : adding document #30000 to Dictionary(294870 unique tokens: ['cockney', 'spelbound', 'wheeling', 'nordland', 'aspendale']...)
INFO : adding document #40000 to Dictionary(367892 unique tokens: ['spelbound', 'hofgarten', 'mélèzes', 'aspendale', 'gridlock']...)
INFO : adding document #50000 to Dictionary(418186 unique tokens: ['spelbound', 'hofgarten', 'mélèzes', 'aspendale', 'hattula']...)
INFO : adding document #60000 to Dictionary(455119 unique tokens: ['edimburgh', 'spelbound', 'hofgarten', 'mélèzes', 'offsetting']...)
INFO : built Dictionary(505167 unique tokens: ['edimburgh', 'spelbound', 'hofgarten', 'mélèzes', 'offsetting']...) from 68259 documents (total 1

CPU times: user 11min 36s, sys: 1.15 s, total: 11min 37s
Wall time: 11min 38s
Dictionary(505167 unique tokens: ['edimburgh', 'spelbound', 'hofgarten', 'mélèzes', 'offsetting']...)


In [16]:
# ignore words that appear in less than 20 documents or more than 10% documents
id2word_wiki.filter_extremes(no_below=20, no_above=0.1)
print(id2word_wiki)

INFO : discarding 472601 tokens: [('alvares', 4), ('ambedkar', 15), ('american', 15239), ('aperire', 1), ('april', 7245), ('arbroath', 15), ('born', 17165), ('chakri', 13), ('city', 11560), ('cosmonauts', 11)]...
INFO : keeping 32566 tokens which were in no less than 20 and no more than 6825 (=10.0%) documents
INFO : resulting dictionary: Dictionary(32566 unique tokens: ['faun', 'narrow', 'hernán', 'studying', 'renting']...)


Dictionary(32566 unique tokens: ['faun', 'narrow', 'hernán', 'studying', 'renting']...)


In [6]:
doc = "A blood cell, also called a hematocyte, is a cell produced by hematopoiesis and normally found in blood."
bow = id2word_wiki.doc2bow(tokenize(doc))
print(bow)

NameError: name 'id2word_wiki' is not defined

In [7]:
print(id2word_wiki[10882])

NameError: name 'id2word_wiki' is not defined

In [8]:
class WikiCorpus(object):
    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Parse the first `clip_docs` Wikipedia documents from file `dump_file`.
        Yield each document in turn, as a list of tokens (unicode strings).
        
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
    
    def __iter__(self):
        self.titles = []
        for title, tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)
    
    def __len__(self):
        return self.clip_docs

# create a stream of bag-of-words vectors
wiki_corpus = WikiCorpus('simplewiki-latest-pages-articles.xml.bz2', id2word_wiki)
vector = next(iter(wiki_corpus))
print(vector)  # print the first vector in the stream

NameError: name 'id2word_wiki' is not defined

In [2]:
most_index, most_count = max(vector, key=lambda (word_index, count): count)
print(id2word_wiki[most_index], most_count)

SyntaxError: invalid syntax (<ipython-input-2-69a4e517dc0f>, line 1)

In [4]:
%time gensim.corpora.MmCorpus.serialize('./wiki_bow.mm', wiki_corpus)

NameError: name 'gensim' is not defined