In [1]:
import io
import os.path
import re
import tarfile

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    """Yield the text of each paper stored in the tar archive at ``url``.

    The archive is opened through ``smart_open`` and its members are scanned
    in order. Only regular files whose path matches
    ``nipstxt/nips<digits>/<digits>.txt`` are yielded; all other entries are
    skipped.

    Args:
        url: Location of the tar archive, handed to ``smart_open.open``.

    Yields:
        str: One document per matching member, decoded as UTF-8 with
        undecodable bytes replaced instead of raising.
    """
    paper_path = re.compile(r'nipstxt/nips\d+/\d+\.txt')
    with smart_open.open(url, "rb") as stream, tarfile.open(fileobj=stream) as archive:
        for entry in archive.getmembers():
            # Skip directories, links and other non-regular entries.
            if not entry.isfile():
                continue
            # Skip files that are not one of the per-paper text files.
            if paper_path.search(entry.name) is None:
                continue
            raw = archive.extractfile(entry).read()
            yield str(raw, 'utf-8', 'replace')

# Materialize the generator into a list: later cells index into `docs`
# and iterate over it more than once.
docs = list(extract_documents())

In [13]:
# NOTE(review): the saved output of this cell carries execution count 13 and
# shows a list of lemmatized tokens, so it was captured after the tokenize and
# lemmatize cells had run. On a fresh Restart & Run All, docs[1] is still a raw
# string at this point and this prints its first 500 characters instead.
print(docs[1][:500])

['connectivity', 'versus', 'entropy', 'yaser', 'abu', 'mostafa', 'california', 'institute', 'of', 'technology', 'pasadena', 'ca', 'abstract', 'how', 'doe', 'the', 'connectivity', 'of', 'neural', 'network', 'number', 'of', 'synapsis', 'per', 'neuron', 'relate', 'to', 'the', 'complexity', 'of', 'the', 'problem', 'it', 'can', 'handle', 'measured', 'by', 'the', 'entropy', 'switching', 'theory', 'would', 'suggest', 'no', 'relation', 'at', 'all', 'since', 'all', 'boolean', 'function', 'can', 'be', 'implemented', 'using', 'circuit', 'with', 'very', 'low', 'connectivity', 'using', 'two', 'input', 'nand', 'gate', 'however', 'for', 'network', 'that', 'learns', 'problem', 'from', 'example', 'using', 'local', 'learning', 'rule', 'we', 'prove', 'that', 'the', 'entropy', 'of', 'the', 'problem', 'becomes', 'lower', 'bound', 'for', 'the', 'connectivity', 'of', 'the', 'network', 'introduction', 'the', 'most', 'distinguishing', 'feature', 'of', 'neural', 'network', 'is', 'their', 'ability', 'to', 'spon'

In [3]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Lowercase each document, then split it into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
docs = [tokenizer.tokenize(text.lower()) for text in docs]

# Drop purely numeric tokens (tokens that merely contain digits are kept)
# and tokens that are only one character long.
docs = [
    [word for word in words if not word.isnumeric() and len(word) > 1]
    for words in docs
]

In [8]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

# Map every token to its WordNet lemma, using the lemmatizer's default arguments.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, tokens)) for tokens in docs]

In [9]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams to docs (only ones that appear 20 times or more). A single
# Phrases pass over unigram documents joins pairs of tokens only.
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # A '_' in a token is taken to mean the phrase model joined two tokens.
    # NOTE(review): the tokenizer pattern r'\w+' also keeps underscores from the
    # source text, so such tokens are appended a second time here -- confirm
    # that this is acceptable.
    doc.extend([token for token in bigram[doc] if '_' in token])



In [11]:
# Apply the trained phrase model to the first document to inspect which
# adjacent tokens get merged.
# NOTE(review): docs[0] already has its bigram tokens appended by the previous
# cell, and the whole document is displayed; consider slicing the result to
# keep the saved output short.
bigram[docs[0]]

['neural_net',
 'and',
 'traditional',
 'classifier',
 'william',
 'huang',
 'and',
 'richard_lippmann',
 'mit',
 'lincoln_laboratory',
 'lexington',
 'ma',
 'usa_abstract',
 'previous_work',
 'on',
 'net',
 'with',
 'continuous_valued',
 'input',
 'led',
 'to',
 'generative',
 'procedure',
 'to',
 'construct',
 'convex',
 'decision_region',
 'with',
 'two',
 'layer_percepttons',
 'one',
 'hidden_layer',
 'and',
 'arbitrary',
 'decision_region',
 'with',
 'three',
 'layer_percepttons',
 'two',
 'hidden_layer',
 'here',
 'we',
 'demonstrate',
 'that',
 'two',
 'layer',
 'perceptton',
 'classifier',
 'trained',
 'with',
 'back_propagation',
 'can',
 'form',
 'both',
 'convex',
 'and',
 'disjoint',
 'decision_region',
 'such',
 'classifier',
 'are',
 'robust',
 'train',
 'rapidly',
 'and',
 'provide',
 'good',
 'performance',
 'with',
 'simple',
 'decision_region',
 'when',
 'complex',
 'decision_region',
 'are',
 'required',
 'however',
 'convergence',
 'time',
 'can_be',
 'excessively',