<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'

import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# change default style figure and font size
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib,spacy

Ethen 2018-11-13 10:10:08 

CPython 3.6.4
IPython 6.4.0

numpy 1.14.1
pandas 0.23.0
sklearn 0.19.1
matplotlib 2.2.2
spacy 2.0.16


In [None]:
# https://spacy.io/usage/processing-pipelines#multi-processing-example

# changes introduced in spaCy 2.0 make nlp.pipe behave differently than it did in 1.0
# https://github.com/explosion/spaCy/issues/2075
from pathlib import Path
from joblib import Parallel, delayed
from toolz import partition_all
import thinc.extra.datasets
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


def is_valid_word(token):
    """
    Decide whether a spacy token should be kept during text cleaning.

    False is returned when the token's pos tag is one of the un-wanted
    ones (ADP, CCONJ, PRON), when the token is flagged as punctuation,
    whitespace, number-like or stop word, or when `len(token)` is
    below 2; True is returned otherwise.

    http://universaldependencies.org/u/pos/
    """
    unwanted_pos = {'ADP', 'CCONJ', 'PRON'}
    if token.pos_ in unwanted_pos:
        return False

    if token.is_punct or token.is_space or token.like_num or token.is_stop:
        return False

    # anything shorter than two units of length is treated as noise
    return len(token) > 1


def preprocess_text(texts, nlp):
    """
    Clean a batch of raw texts with spaCy.

    Every text is parsed through `nlp.pipe`; the tokens rejected by
    `is_valid_word` are dropped and the lemmas of the remaining tokens
    are joined with a single space.

    Adapted from
    https://gist.github.com/smsubrahmannian/2835bd32c688b7b57a5300f94af07b1b

    Parameters
    ----------
    texts : iterable of str
        raw documents to clean

    nlp : spacy model object
        e.g. nlp = spacy.load('en_core_web_sm')

    Returns
    -------
    cleaned_texts : list of str
        one cleaned, space-delimited string per parsed document, in the
        order the documents are yielded by `nlp.pipe`
    """
    # every item yielded by nlp.pipe is one parsed document; the cleaned
    # strings are collected and returned so that the caller (e.g. a joblib
    # worker) actually gets the result back
    return [
        ' '.join(token.lemma_ for token in doc if is_valid_word(token))
        for doc in nlp.pipe(texts, n_threads=1)
    ]

In [None]:
# load the `en_core_web_sm` spaCy model; it is handed to the
# preprocessing tasks in the cells below
nlp = spacy.load('en_core_web_sm')
# flag every scikit-learn English stop word as a stop word in the spaCy
# vocabulary as well, `is_valid_word` relies on `token.is_stop`
for w in ENGLISH_STOP_WORDS:
    # spacy syntax for adding custom stop words
    nlp.vocab[w].is_stop = True
    
# show the loaded model object as the cell's output
nlp

In [14]:
# maximum number of documents grouped into one preprocessing task
batch_size = 32

# imdb() returns train and test data
# data is a list of tuple (text, sentiment label)
# only the first element returned by imdb() is kept, the second is discarded
data, _ = thinc.extra.datasets.imdb()
# split the (text, label) pairs apart and keep the texts only
text, _ = zip(*data)
# group the texts into chunks of `batch_size` documents
# NOTE(review): partition_all presumably returns a lazy, one-shot iterator, in
# which case this cell has to be re-run before `partitions` can be consumed
# a second time -- confirm against the toolz documentation
partitions = partition_all(batch_size, text)

In [None]:


# run `preprocess_text` over every batch of documents through joblib,
# n_jobs=-1 requests all the available cores; note that the `nlp` model
# object is passed as an argument to every single task
parallel = Parallel(n_jobs=-1)
task = delayed(preprocess_text)
parallel(task(batch, nlp) for batch in partitions)

In [53]:
# for word in doc.sents:
#     print(word)

What can i say about the first film ever?




You can't rate this, because it's not supposed to be entertaining.
But if you HAVE to rate it, you should give it a 10.
It is stunning to see moving images from the year 1895.
This was one of the most important movies in history.
I wonder how it was to be one of the people who saw the first movie ever!






In [16]:
# keep the first document around to inspect how a single text gets parsed
temp = text[0]
temp

"What can i say about the first film ever?\n\n\n\nYou can't rate this, because it's not supposed to be entertaining. But if you HAVE to rate it, you should give it a 10. It is stunning to see moving images from the year 1895. This was one of the most important movies in history. I wonder how it was to be one of the people who saw the first movie ever!\n\n\n\n"

In [17]:
# parse the sample document with the spaCy model and display the result
doc = nlp(temp)
doc

What can i say about the first film ever?



You can't rate this, because it's not supposed to be entertaining. But if you HAVE to rate it, you should give it a 10. It is stunning to see moving images from the year 1895. This was one of the most important movies in history. I wonder how it was to be one of the people who saw the first movie ever!




In [None]:
# (empty scratch cell: a bare, undefined name in this cell would
# raise a NameError during "Restart Kernel -> Run All")

In [16]:
# https://spacy.io/usage/spacy-101#section-features
import spacy

# the model has to be downloaded once before it can be loaded:
# python -m spacy download en_core_web_sm
# NOTE(review): this rebinds the global `nlp`, replacing the model object
# that was configured in the earlier stop words cell
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion as it is quite a succesful startup')
# print the text, the part-of-speech tag and the dependency label of every token
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj
as ADP mark
it PRON nsubj
is VERB advcl
quite ADJ advmod
a DET det
succesful ADJ amod
startup NOUN attr


In [1]:
from sklearn.datasets import fetch_20newsgroups


# fetch the 'train' subset of the 20 newsgroups dataset; `.data` holds
# the raw documents that are preprocessed in the cells below
newsgroups_train = fetch_20newsgroups(subset='train')
# whitespace-tokenize every raw document into a list of words
sentences = [doc.strip().split() for doc in newsgroups_train.data]

In [23]:
import os
import spacy
from time import time
from joblib import cpu_count
from joblib import Parallel, delayed
from toolz import partition_all


def export_unigrams(unigram_path, texts, parser,
                    batch_size = 50, n_jobs = -1):
    """
    Preprocess the raw texts in parallel and export them to a .txt file,
    where each line is one document, for what sort of preprocessing
    is done, please refer to the `preprocess_text` function

    Parameters
    ----------
    unigram_path : str
        output file path of the preprocessed unigram text

    texts : iterable
        iterable can be simply a list, but for larger corpora,
        consider an iterable that streams the sentences directly from
        disk/network using Gensim's LineSentence or something along
        those line

    parser : spacy model object
        e.g. parser = spacy.load('en'); not used by this implementation
        (the texts are parsed inside `preprocess_text`), it is kept so
        that the signature stays compatible with the existing callers

    batch_size : int, default 50
        not used by this implementation, kept for signature compatibility

    n_jobs : int, default -1
        number of jobs passed to joblib's Parallel
    """
    parallel = Parallel(n_jobs=n_jobs)
    task = delayed(preprocess_text)
    # Parallel hands back a list holding one cleaned string per input text
    cleaned_texts = parallel(task(text) for text in texts)

    # the output file is only opened after all the texts were processed, so
    # that a failed or interrupted run does not leave an empty file behind
    # that a "skip if the file exists" check would mistake for a finished export
    with open(unigram_path, 'w', encoding='utf_8') as f:
        for cleaned_text in cleaned_texts:
            f.write(cleaned_text + '\n')


# spaCy models that were already loaded in this process, keyed by model name
_PARSER_CACHE = {}


def _get_parser(model_name = 'en_core_web_sm'):
    """Return the spaCy model `model_name`, loading it only on first use in this process."""
    if model_name not in _PARSER_CACHE:
        _PARSER_CACHE[model_name] = spacy.load(model_name)

    return _PARSER_CACHE[model_name]


def preprocess_text(text):
    """
    Use spaCy to clean a single raw document:
    - lemmatize the text
    - remove punctuation, whitespace, number and stop words
    - remove some pos tags
    (see `is_valid_word` for the exact filtering rules)

    Parameters
    ----------
    text : str
        raw document

    Returns
    -------
    cleaned_text : str
        space-delimited lemmas of the tokens that passed `is_valid_word`
    """
    # loading the model is by far the most expensive step, hence it is
    # cached instead of being reloaded for every single document
    nlp_parser = _get_parser()
    parsed_text = nlp_parser(text)
    cleaned_text = ' '.join(token.lemma_ for token in parsed_text if is_valid_word(token))
    return cleaned_text


def is_valid_word(token):
    """
    Tell whether a spacy token is worth keeping.

    A token is dropped (False) if its pos tag is ADP or CCONJ, if it is
    flagged as punctuation, whitespace, number-like or stop word, or if
    `len(token)` is smaller than 2; every other token is kept (True).

    http://universaldependencies.org/u/pos/
    """
    excluded_pos = ('ADP', 'CCONJ')
    is_noise = token.is_punct or token.is_space or token.like_num or token.is_stop
    return token.pos_ not in excluded_pos and not is_noise and len(token) >= 2

In [25]:
# this model object is passed to `export_unigrams` as its `parser` below
nlp = spacy.load('en_core_web_sm')

# create a directory called 'model' to
# store all outputs in later section
MODEL_DIR = 'model'
if not os.path.isdir(MODEL_DIR):
    os.mkdir(MODEL_DIR)

# preprocess the first 100 newsgroup documents and time how long it takes;
# the export is skipped whenever the output file already exists, so the
# file has to be deleted to force the texts to be preprocessed again
UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
if not os.path.exists(UNIGRAM_PATH):
    start = time()
    export_unigrams(UNIGRAM_PATH, texts = newsgroups_train.data[:100], parser = nlp)
    elapse = time() - start
    print('text preprocessing, elapse', elapse)

Process ForkPoolWorker-63:
Process ForkPoolWorker-64:
Process ForkPoolWorker-61:
Process ForkPoolWorker-60:
Process ForkPoolWorker-59:
Process ForkPoolWorker-57:
Process ForkPoolWorker-62:
Process ForkPoolWorker-58:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/mingyuliu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/mingyuliu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/mingyuliu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/mingyuliu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/mingyuliu/anaconda3/lib/python3.6/multiprocessing/proces

KeyboardInterrupt: 

In [2]:
import os
import spacy
from time import time
from joblib import cpu_count


def export_unigrams(unigram_path, texts, parser,
                    batch_size = 10000, n_jobs = -1):
    """
    Clean the raw texts and dump them to a .txt file, one document
    per line; the cleaning itself is delegated to the `clean_corpus`
    function, please refer to it for the preprocessing that is applied

    Parameters
    ----------
    unigram_path : str
        file path the cleaned unigram text is written to

    texts : iterable
        the raw documents, a plain list is fine, whereas for larger
        corpora an iterable that streams the documents directly from
        disk/network (e.g. Gensim's LineSentence) is worth considering

    parser : spacy model object
        e.g. parser = spacy.load('en')

    batch_size : int, default 10000
        batch size forwarded to `clean_corpus`

    n_jobs : int, default -1
        number of jobs/cores/threads forwarded to `clean_corpus`
    """
    with open(unigram_path, 'w', encoding = 'utf_8') as f:
        cleaned_texts = clean_corpus(texts, parser, batch_size, n_jobs)
        # every cleaned document ends up on a line of its own
        f.writelines(cleaned_text + '\n' for cleaned_text in cleaned_texts)


def clean_corpus(texts, parser, batch_size, n_jobs):
    """
    Generator function using spaCy to parse the texts, for every text:
    - lemmatize the text
    - remove punctuation, whitespace, number and stop words
    - remove some pos tags
    (see `is_valid_word` for the exact filtering rules)

    Parameters
    ----------
    texts : iterable
        raw documents to clean

    parser : spacy model object
        e.g. parser = spacy.load('en')

    batch_size : int
        batch size forwarded to `parser.pipe`

    n_jobs : int
        number of threads requested from `parser.pipe`; any value that
        is not strictly between 0 and the number of cpus falls back to
        the number of cpus

    Yields
    ------
    cleaned_text : str
        space-delimited lemmas of the tokens that passed `is_valid_word`,
        one string per parsed text
    """
    n_threads = cpu_count()
    if 0 < n_jobs < n_threads:
        n_threads = n_jobs

    # use the .pipe to process texts as a stream; the resolved number of
    # threads is forwarded, so that `n_jobs` is honored instead of being
    # silently replaced by a hard-coded 1
    # NOTE(review): this notebook links spaCy issue #2075 about nlp.pipe
    # behaving differently in 2.0 -- confirm whether `n_threads` has any
    # effect on the installed spaCy version
    for parsed_text in parser.pipe(texts, n_threads = n_threads, batch_size = batch_size):
        yield ' '.join(token.lemma_ for token in parsed_text if is_valid_word(token))


def is_valid_word(token):
    """
    Filter used while cleaning the corpus: returns True for the spacy
    tokens that should be kept and False for the ones to discard.

    Discarded are the tokens tagged as ADP or CCONJ, the tokens with
    a `len(token)` below 2, and the tokens flagged as punctuation,
    whitespace, number-like or stop word.

    http://universaldependencies.org/u/pos/
    """
    if token.pos_ in {'ADP', 'CCONJ'} or len(token) < 2:
        return False

    noise_flags = (token.is_punct, token.is_space, token.like_num, token.is_stop)
    return not any(noise_flags)

In [4]:
# this model object is passed to `export_unigrams` as its `parser` below
nlp = spacy.load('en_core_web_sm')

# create a directory called 'model' to
# store all outputs in later section
MODEL_DIR = 'model'
if not os.path.isdir(MODEL_DIR):
    os.mkdir(MODEL_DIR)

# preprocess the first 100 newsgroup documents and time how long it takes;
# the export is skipped whenever the output file already exists, so the
# file has to be deleted to force the texts to be preprocessed again
UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
if not os.path.exists(UNIGRAM_PATH):
    start = time()
    export_unigrams(UNIGRAM_PATH, texts = newsgroups_train.data[:100], parser = nlp)
    elapse = time() - start
    print('text preprocessing, elapse', elapse)

text preprocessing, elapse 6.267408847808838
