In [1]:
import io
import os.path
import re
import tarfile

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    """Yield the text of every paper stored in the gzip-compressed tarball at ``url``.

    The archive is downloaded once into the current working directory, under
    the last path component of ``url``, and reused on subsequent calls.

    Parameters
    ----------
    url : str
        Location of the ``.tgz`` archive. Only its final path component is
        used to name the local copy.

    Yields
    ------
    str
        The decoded content of each regular file whose archive path matches
        ``nipstxt/nips<digits>/<digits>.txt``, ordered by member name. Bytes
        that are not valid UTF-8 are replaced rather than raising.
    """
    import shutil  # local import: only this function needs it

    fname = url.split('/')[-1]

    # Download the file to local storage first.
    # We can't read it on the fly because of
    # https://github.com/RaRe-Technologies/smart_open/issues/331
    if not os.path.isfile(fname):
        # Write to a temporary name and rename only after the copy finished.
        # Otherwise an interrupted download would leave a truncated archive
        # that the isfile() check above treats as complete on every later run.
        # The temporary name keeps the original suffix so that smart_open
        # handles it exactly as it would handle ``fname``.
        partial = 'partial.' + fname
        try:
            with smart_open.open(url, "rb") as fin, smart_open.open(partial, 'wb') as fout:
                shutil.copyfileobj(fin, fout, io.DEFAULT_BUFFER_SIZE)
            os.replace(partial, fname)
        finally:
            # Still present only if the copy or the rename failed.
            if os.path.isfile(partial):
                os.remove(partial)

    with tarfile.open(fname, mode='r:gz') as tar:
        # Ignore directory entries, as well as files like README, etc.
        files = [
            m for m in tar.getmembers()
            if m.isfile() and re.search(r'nipstxt/nips\d+/\d+\.txt', m.name)
        ]
        for member in sorted(files, key=lambda x: x.name):
            # Close the per-member reader as soon as its bytes are in memory.
            with tar.extractfile(member) as member_file:
                member_bytes = member_file.read()
            yield member_bytes.decode('utf-8', errors='replace')

# extract_documents() is a generator; materialise it so later cells can index
# into the corpus and iterate over it more than once.
docs = list(extract_documents())

In [2]:
# Sanity check: number of documents loaded, then the first 500 characters of
# the first document.
print(len(docs))
print(docs[0][:500])

1740
1 
CONNECTIVITY VERSUS ENTROPY 
Yaser S. Abu-Mostafa 
California Institute of Technology 
Pasadena, CA 91125 
ABSTRACT 
How does the connectivity of a neural network (number of synapses per 
neuron) relate to the complexity of the problems it can handle (measured by 
the entropy)? Switching theory would suggest no relation at all, since all Boolean 
functions can be implemented using a circuit with very low connectivity (e.g., 
using two-input NAND gates). However, for a network that learns a pr


In [3]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Lowercase each document, split it into runs of word characters (\w+), and
# drop tokens that are purely numeric or only one character long. Words that
# merely contain digits are kept.
#
# Everything happens in one pass that builds a new list, instead of
# overwriting the raw strings element by element and then filtering twice.
# NOTE: `docs` is rebound from a list of strings to a list of token lists, so
# this cell must run exactly once after the loading cell.
tokenizer = RegexpTokenizer(r'\w+')
docs = [
    [
        token
        for token in tokenizer.tokenize(doc.lower())
        if not token.isnumeric() and len(token) > 1
    ]
    for doc in docs
]

In [4]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

# Replace every token with the lemma returned by WordNetLemmatizer.lemmatize,
# called with the token as its only argument. The sample output further down
# shows forms such as 'doe' and 'synapsis' where the raw text had 'does' and
# 'synapses'.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

In [5]:
# NOTE(review): a cell displays only the value of its last expression, so the
# slice on the next line is evaluated and discarded; the recorded output is
# just the token count of the first document.
docs[0][:100]
len(docs[0])

1811

In [26]:
# Display the token list of the first document after tokenisation and
# lemmatisation. NOTE(review): this prints the whole document; a slice such as
# docs[0][:100] would keep the output short.
docs[0]

['connectivity',
 'versus',
 'entropy',
 'yaser',
 'abu',
 'mostafa',
 'california',
 'institute',
 'of',
 'technology',
 'pasadena',
 'ca',
 'abstract',
 'how',
 'doe',
 'the',
 'connectivity',
 'of',
 'neural',
 'network',
 'number',
 'of',
 'synapsis',
 'per',
 'neuron',
 'relate',
 'to',
 'the',
 'complexity',
 'of',
 'the',
 'problem',
 'it',
 'can',
 'handle',
 'measured',
 'by',
 'the',
 'entropy',
 'switching',
 'theory',
 'would',
 'suggest',
 'no',
 'relation',
 'at',
 'all',
 'since',
 'all',
 'boolean',
 'function',
 'can',
 'be',
 'implemented',
 'using',
 'circuit',
 'with',
 'very',
 'low',
 'connectivity',
 'using',
 'two',
 'input',
 'nand',
 'gate',
 'however',
 'for',
 'network',
 'that',
 'learns',
 'problem',
 'from',
 'example',
 'using',
 'local',
 'learning',
 'rule',
 'we',
 'prove',
 'that',
 'the',
 'entropy',
 'of',
 'the',
 'problem',
 'becomes',
 'lower',
 'bound',
 'for',
 'the',
 'connectivity',
 'of',
 'the',
 'network',
 'introduction',
 'the',
 'most'

In [29]:
# Compute bigram and trigram phrase models.
import gensim

# The bigram model is fitted on the token lists; the trigram model is fitted
# on the corpus after the bigram model has been applied to it.
# NOTE(review): the bigram model overrides only min_count and the trigram
# model only threshold, each leaving the other at the library default --
# confirm this asymmetry is intentional.
bigram = gensim.models.Phrases(docs, min_count=20)
trigram = gensim.models.Phrases(bigram[docs], threshold=20)
# Wrap each fitted model in a Phraser, a faster way to merge the tokens of a
# sentence into bigrams/trigrams.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Show the first document after applying the bigram and then the trigram model.
print(trigram_mod[bigram_mod[docs[0]]])

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    ``texts`` is an iterable of token lists. The result is a list holding
    ``bigram_mod[doc]`` for each document, in the same order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged
# Bigram-merged version of the whole corpus, one entry per document.
data_words_bigrams = make_bigrams(docs)

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``.

    ``texts`` is an iterable of token lists. The result is a list holding
    ``trigram_mod[bigram_mod[doc]]`` for each document, in the same order.
    Both models are read from module level.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged













['connectivity', 'versus', 'entropy', 'yaser_abu_mostafa', 'california_institute', 'of_technology_pasadena', 'ca_abstract', 'how_doe', 'the', 'connectivity', 'of', 'neural_network', 'number', 'of', 'synapsis', 'per', 'neuron', 'relate', 'to', 'the', 'complexity', 'of', 'the', 'problem', 'it', 'can', 'handle', 'measured', 'by', 'the', 'entropy', 'switching', 'theory', 'would', 'suggest', 'no', 'relation', 'at', 'all', 'since', 'all', 'boolean_function', 'can_be_implemented', 'using', 'circuit', 'with', 'very_low', 'connectivity', 'using', 'two', 'input', 'nand_gate', 'however', 'for', 'network', 'that', 'learns', 'problem', 'from', 'example', 'using', 'local', 'learning_rule', 'we', 'prove', 'that', 'the', 'entropy', 'of', 'the', 'problem', 'becomes', 'lower_bound', 'for', 'the', 'connectivity', 'of', 'the', 'network', 'introduction', 'the', 'most', 'distinguishing_feature', 'of', 'neural_network', 'is', 'their', 'ability', 'to', 'spon', 'taneously', 'learn', 'the', 'desired', 'function

In [28]:
# Token count of the first document after bigram merging.
len(data_words_bigrams[0])

1764

In [14]:
# Same check, applying bigram_mod to the whole corpus and indexing the first
# document; the recorded value matches len(data_words_bigrams[0]).
len(bigram_mod[docs][0])

1764

In [54]:
# Token count of the second document when the Phrases model itself (not the
# Phraser wrapper) is applied to the corpus.
len(bigram[docs][1])

3928