In [10]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# Pipelining and parameter search

In this section we study how to chain different estimators to form one end-to-end model.

In [11]:
from sklearn.datasets import fetch_20newsgroups
# `sklearn.cross_validation` is the deprecated (and later removed) home of
# this helper; `sklearn.model_selection` is the module the rest of this
# notebook already imports from.
from sklearn.model_selection import train_test_split

In [12]:
# Load five of the 20-newsgroups categories, shuffled with a fixed seed.
# No `subset` is passed here (unlike the test-set fetch near the end of the
# notebook), so the library default applies -- presumably the training split;
# confirm against the scikit-learn docs.
# `remove` asks for headers, footers and quoted replies to be stripped.
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             categories=('rec.sport.hockey', 'soc.religion.christian',
                                         'rec.motorcycles', 'rec.sport.baseball', 'sci.crypt'),
                             remove=('headers', 'footers', 'quotes'))

In [13]:
# docs: the documents that get fed to the pipelines; y: their class labels.
docs, y = dataset['data'], dataset['target']

## Instead of doing just one train/validation step, we will do cross-validation

`sklearn.model_selection.cross_val_score` and `sklearn.model_selection.*SearchCV` do everything for us!

However, we need to pass them an "estimator"...

## Layering transformers and classifiers: The Pipeline

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [15]:
import re

# Token pattern: runs of two or more word characters, Unicode-aware.
default_tokenizer = re.compile(r"(?u)\b\w\w+\b")


def number_aware_tokenizer(doc):
    """Split ``doc`` into tokens, collapsing number-like tokens into one marker.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in their original order. Tokens that are not purely
        alphanumeric (for example ones containing an underscore) are
        discarded, and every token whose first character is a digit is
        replaced by the literal string ``"#NBR"``.
    """
    tokens = []
    for candidate in default_tokenizer.findall(doc):
        # Skip anything that is not purely alphanumeric.
        if not candidate.isalnum():
            continue
        # All digit-leading tokens share one vocabulary entry.
        tokens.append("#NBR" if candidate[0].isdigit() else candidate)
    return tokens

In [16]:
# Pipeline([(name_1, object_1), (name_2, object_2), ...])

# Bag-of-words pipeline: vectorizer followed by a classifier.
# The step names ('vect', 'clf') are the prefixes used by the
# `vect__...` / `clf__...` keys in the parameter searches further down.
pipe = Pipeline([
    ('vect', CountVectorizer(min_df=5,
                             max_df=0.11,
                             tokenizer=number_aware_tokenizer)),
    ('clf', LogisticRegression(multi_class='multinomial',
                               solver='lbfgs'))
])

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
# 3-fold cross-validation of the *whole* pipeline, scored with macro-averaged F1.
# Passing the pipeline (rather than pre-vectorized features) means the
# vectorizer is part of what gets fit inside each fold.
cv_scores = cross_val_score(pipe, docs, y, scoring='f1_macro', cv=3)

# One score per fold, then their mean.
print(cv_scores)
print("Mean CV F1: {:.2f}".format(cv_scores.mean()))

[ 0.85766247  0.82267921  0.83422012]
Mean CV F1: 0.84


## Tweaking hyperparameters

In [19]:
from sklearn.model_selection import RandomizedSearchCV

In [20]:
# Randomized search over a small discrete grid.
# Keys follow the `<step name>__<parameter>` convention of the pipeline.
# The grid has 3 * 3 * 5 = 45 combinations; `n_iter=15` candidates are tried,
# each evaluated with 3-fold CV on macro F1.
search = RandomizedSearchCV(
    pipe,
    {
        'vect__min_df': (1, 5, 10),
        'vect__max_df': (0.10, 0.11, 0.12),
        'clf__C': (0.01, 0.1, 1, 10, 100)
    },
    scoring='f1_macro',
    cv=3,
    n_jobs=2,
    n_iter=15,
    random_state=0  # fixed seed so the sampled candidates are repeatable
)

In [21]:
# Run the search: 15 candidates x 3 folds. The repr below shows refit=True.
search.fit(docs, y)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.11, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=15, n_jobs=2,
          param_distributions={'clf__C': (0.01, 0.1, 1, 10, 100), 'vect__max_df': (0.1, 0.11, 0.12), 'vect__min_df': (1, 5, 10)},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='f1_macro', verbose=0)

In [22]:
# Cross-validated score (scoring='f1_macro') of the best candidate found.
search.best_score_

0.85600500561149673

In [23]:
# Parameter combination that achieved `best_score_`.
search.best_params_

{'clf__C': 0.1, 'vect__max_df': 0.1, 'vect__min_df': 1}

In [24]:
# Per-candidate mean/std CV score together with the sampled parameters.
# NOTE(review): `grid_scores_` is deprecated in favour of `cv_results_` in
# newer scikit-learn releases -- confirm against the installed version
# before re-running this cell.
search.grid_scores_

[mean: 0.80194, std: 0.01618, params: {'clf__C': 100, 'vect__max_df': 0.12, 'vect__min_df': 10},
 mean: 0.83715, std: 0.01644, params: {'clf__C': 0.01, 'vect__max_df': 0.1, 'vect__min_df': 1},
 mean: 0.83489, std: 0.01454, params: {'clf__C': 0.01, 'vect__max_df': 0.11, 'vect__min_df': 1},
 mean: 0.83150, std: 0.01084, params: {'clf__C': 100, 'vect__max_df': 0.11, 'vect__min_df': 1},
 mean: 0.85601, std: 0.01921, params: {'clf__C': 0.1, 'vect__max_df': 0.1, 'vect__min_df': 1},
 mean: 0.83933, std: 0.01757, params: {'clf__C': 1, 'vect__max_df': 0.1, 'vect__min_df': 5},
 mean: 0.84664, std: 0.01487, params: {'clf__C': 1, 'vect__max_df': 0.11, 'vect__min_df': 1},
 mean: 0.82898, std: 0.01522, params: {'clf__C': 100, 'vect__max_df': 0.1, 'vect__min_df': 1},
 mean: 0.82374, std: 0.01546, params: {'clf__C': 1, 'vect__max_df': 0.11, 'vect__min_df': 10},
 mean: 0.83044, std: 0.01619, params: {'clf__C': 0.01, 'vect__max_df': 0.12, 'vect__min_df': 1},
 mean: 0.84719, std: 0.01323, params: {'clf__

**Protip:** when using `RandomizedSearchCV`, you can (and **should**) specify random distributions instead of fixed parameter values.

In [25]:
from scipy.stats import randint, expon, uniform

# Same search as before, but sampling from continuous/integer distributions
# instead of a handful of fixed values.
better_search = RandomizedSearchCV(
    pipe,
    {
        # scipy's randint(low, high) excludes `high`: integers 1..10
        'vect__min_df': randint(1, 11),
        # NOTE: scipy's uniform(loc, scale) spans [loc, loc + scale], so this
        # samples max_df from [0.05, 0.20] -- not [0.05, 0.15]
        'vect__max_df': uniform(0.05, 0.15),
        # exponential distribution with scipy's default loc/scale
        'clf__C': expon()
    },
    scoring='f1_macro',
    cv=3,
    n_jobs=2,
    n_iter=15,
    random_state=0
)

In [26]:
# `random_state=0` is already set on the search itself; the global NumPy seed
# is set as well -- presumably as a safeguard in case the scipy distributions
# draw from NumPy's global state (TODO confirm this is still needed).
np.random.seed(42)
better_search.fit(docs, y)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.11, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=15, n_jobs=2,
          param_distributions={'clf__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f180f53f4e0>, 'vect__max_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f180f44db00>, 'vect__min_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f180f442c88>},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='f1_macro', verbose=0)

In [27]:
# Best cross-validated f1_macro found by the distribution-based search.
better_search.best_score_

0.85170190648496569

For 15 iterations, this does not make much difference. But in the long run, it allows you to explore much more. *(And you **should** run this much more!)*

In [28]:
#better_search.set_params(n_iter=100)
#better_search.fit(docs, y)

In [29]:
# Same value as before: the 100-iteration re-run in the previous cell is
# commented out, so `better_search` has not been re-fit.
better_search.best_score_

0.85170190648496569

In [30]:
# Sampled parameter values of the best candidate from the 15-iteration search.
better_search.best_params_

{'clf__C': 1.0214318863920728,
 'vect__max_df': 0.07150299311135697,
 'vect__min_df': 1}

## Adding a topic model to the mix

In [31]:
from sklearn.decomposition import LatentDirichletAllocation

# Same vectorizer and classifier as `pipe`, with an LDA step ('topic')
# inserted between them.
# NOTE(review): `n_topics` was renamed `n_components` in later scikit-learn
# releases -- confirm against the installed version before re-running; the
# 'topic__n_topics' search key further down would need the same rename.
pipe_topic = Pipeline([
    ('vect', CountVectorizer(min_df=5,
                             max_df=0.11,
                             tokenizer=number_aware_tokenizer)),
    ('topic', LatentDirichletAllocation(n_topics=20, max_iter=20, random_state=0)),
    ('clf', LogisticRegression(multi_class='multinomial',
                               solver='lbfgs'))
])

In [32]:
# without optimizing the hyperparameters

# Baseline for the topic-model pipeline with the hand-picked settings above:
# 3-fold CV, macro-averaged F1 (same protocol as for `pipe`).
cv_scores_topic = cross_val_score(pipe_topic, docs, y, scoring='f1_macro', cv=3)

# One score per fold, then their mean.
print(cv_scores_topic)
print("Mean CV F1: {:.2f}".format(cv_scores_topic.mean()))

[ 0.8238608   0.81820247  0.79068165]
Mean CV F1: 0.81


In [33]:
# Randomized search for the topic-model pipeline. Only 3 candidates are
# sampled (n_iter=3) and the search runs in a single process (n_jobs=1).
search_topic = RandomizedSearchCV(
    pipe_topic,
    {
        # scipy's randint(low, high) excludes `high`: integers 1..10
        'vect__min_df': randint(1, 11),
        # NOTE: scipy's uniform(loc, scale) spans [loc, loc + scale],
        # i.e. [0.05, 0.20] here
        'vect__max_df': uniform(0.05, 0.15),
        # number of topics sampled from the integers 20..39
        'topic__n_topics': randint(20, 40),
        # exponential distribution with scipy's default loc/scale
        'clf__C': expon()
    },
    scoring='f1_macro',
    cv=3,
    n_jobs=1,
    n_iter=3,
    random_state=0
)

In [34]:
# Global NumPy seed set in addition to the search's own random_state=0 --
# presumably a safeguard for the scipy distributions (TODO confirm).
np.random.seed(42)
# 3 candidates x 3 folds = 9 pipeline fits.
search_topic.fit(docs, y)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  3.1min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.11, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=3, n_jobs=1,
          param_distributions={'topic__n_topics': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f180df585f8>, 'clf__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f180df58fd0>, 'vect__max_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f180de1f5c0>, 'vect__min_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f180ddef358>},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='f

In [35]:
# Best cross-validated f1_macro among the 3 sampled topic-model candidates.
search_topic.best_score_

0.82924341532224755

In [36]:
# Sampled parameter values of the best topic-model candidate.
search_topic.best_params_

{'clf__C': 1.8789640641973517,
 'topic__n_topics': 39,
 'vect__max_df': 0.14688411695999842,
 'vect__min_df': 5}

In [37]:
# search_topic.set_params(n_iter=30)
# search_topic.fit(docs, y)

In [38]:
# search_topic.best_params_

In [39]:
# Same value as before: the 30-iteration re-run above is commented out,
# so `search_topic` has not been re-fit.
search_topic.best_score_

0.82924341532224755

## When satisfied (and no sooner!), we can evaluate on the test set

In [40]:
# Held-out test split: same categories and same `remove` settings as the
# data used for cross-validation, but with subset='test'.
test_dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                  subset='test',
                                  categories=('rec.sport.hockey', 'soc.religion.christian',
                                              'rec.motorcycles', 'rec.sport.baseball', 'sci.crypt'),
                                  remove=('headers', 'footers', 'quotes'))

In [41]:
# Test documents and their labels (attribute access here; the training data
# above was unpacked with dict-style access on the same kind of object).
test_docs, test_y = test_dataset.data, test_dataset.target

In [42]:
# First and only look at the test split. Both searches were built with
# scoring='f1_macro' and their reprs show refit=True.
# NOTE(review): `.score` is expected to apply that metric using the refit
# best pipeline -- confirm for the installed scikit-learn version.
print("Bag-of-words test F1: {:.2f}".format(better_search.score(test_docs, test_y)))
print("Topic model test F1: {:.2f}".format(search_topic.score(test_docs, test_y)))



Bag-of-words test F1: 0.85
Topic model test F1: 0.84
