In [1]:
# Heterogeneous models

In [2]:
from sklearn.preprocessing import FunctionTransformer
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18 has no `model_selection` module
    from sklearn.cross_validation import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [3]:
import numpy as np

In [4]:
import pandas as pd

# Combining multiple sources (and kinds) of data

Here is a small variation of the newsgroups data that is somewhat closer to what you will have after working for a while on feature engineering and preprocessing.

In [5]:
# Load the pre-tagged newsgroup subject lines into a DataFrame
# (path is relative to the notebook's working directory).
data = pd.read_json("datasets/tagged_newsgroup_subjects.json")

In [6]:
# Peek at the first few rows to see which columns are available.
data.head()

Unnamed: 0,pos,target,token
0,"N , N N N",rec.sport.hockey,Re : Goalie Mask Update
1,"^ N ^ ,",rec.sport.baseball,Tigers pound Mariners !!!!!!!
2,"N , N",rec.sport.baseball,RE : survey
3,"A U N , ^ N",rec.sport.hockey,Dear Montana@pinetree.org Re : Hockey Pool
4,"N , ! , A N , V A , ^",rec.sport.hockey,"Re : Goodbye , good riddance , get lost ' Stars"


Here, the "token" column is already tokenized (tokens are delimited by spaces) so there is no need to use a regular expression tokenizer.  We can just use `str.split`.

The `pos` column contains the part-of-speech tag labels corresponding to the tokens. For example, `N` means noun and `^` means proper noun.

This output was produced off-line using the CMU TweetNLP toolkit, which Vlad finds very robust to web text like this.

But before we start, suppose we do even more feature engineering.

In [7]:
# Two hand-crafted features computed from the tokenized subject line:
#   is_reply -- the subject starts with "re :" (case-insensitive)
#   n_words  -- number of whitespace-delimited tokens
data["is_reply"] = data["token"].map(
    lambda subject: subject.lower().startswith("re :")
)
data["n_words"] = data["token"].map(lambda subject: len(subject.split()))
data.head()

Unnamed: 0,pos,target,token,is_reply,n_words
0,"N , N N N",rec.sport.hockey,Re : Goalie Mask Update,True,5
1,"^ N ^ ,",rec.sport.baseball,Tigers pound Mariners !!!!!!!,False,4
2,"N , N",rec.sport.baseball,RE : survey,True,3
3,"A U N , ^ N",rec.sport.hockey,Dear Montana@pinetree.org Re : Hockey Pool,False,6
4,"N , ! , A N , V A , ^",rec.sport.hockey,"Re : Goodbye , good riddance , get lost ' Stars",True,11


In [8]:
from sklearn.pipeline import FeatureUnion, Pipeline

Let's build a pipe to extract the subject body.

In [9]:
def get_tokenized(x):
    """Select the pre-tokenized subject text.

    Parameters
    ----------
    x : mapping-like (e.g. a pandas DataFrame)
        Any object that can be indexed with the key ``"token"``.

    Returns
    -------
    Whatever is stored under ``x["token"]`` (a Series when ``x`` is a
    DataFrame).
    """
    subject_tokens = x["token"]
    return subject_tokens


# Bag-of-words over the subject tokens: project the "token" column, then count
# tokens obtained with `str.split`, keeping only those that appear in at least
# 5 subjects (min_df=5).
token_pipe = Pipeline(steps=[
    ("proj", FunctionTransformer(func=get_tokenized, validate=False)),
    ("vect", CountVectorizer(analyzer="word", tokenizer=str.split, min_df=5)),
])

And check that it works.

In [10]:
# Fit the pipeline on the full frame and show the resulting token-count matrix.
token_pipe.fit_transform(data)

<1197x319 sparse matrix of type '<class 'numpy.int64'>'
	with 6345 stored elements in Compressed Sparse Row format>

In [11]:
# Look up the fitted vectorizer by its step name rather than by position.
token_pipe.named_steps["vect"].vocabulary_

{'!': 0,
 '!!': 1,
 '!!!': 2,
 '!!!!': 3,
 '!!!!!': 4,
 '"': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 ',': 10,
 '-': 11,
 '.': 12,
 '...': 13,
 '....': 14,
 '1': 15,
 '18': 16,
 '19': 17,
 '1964': 18,
 '1988-1992': 19,
 '1992': 20,
 '1993': 21,
 '2': 22,
 '3': 23,
 '93': 24,
 ':': 25,
 ';': 26,
 '?': 27,
 '???': 28,
 '????': 29,
 'a': 30,
 'aargh': 31,
 'abc': 32,
 'again': 33,
 'ahl': 34,
 'al': 35,
 'all': 36,
 'all-time': 37,
 "america's": 38,
 'an': 39,
 'and': 40,
 'another': 41,
 'apr': 42,
 'april': 43,
 'are': 44,
 'area': 45,
 'astros': 46,
 'at': 47,
 'atlanta': 48,
 'attendance': 49,
 'averages': 50,
 'back': 51,
 'base': 52,
 'baseball': 53,
 'bay': 54,
 'bbddd': 55,
 'be': 56,
 'beat': 57,
 'best': 58,
 'biggest': 59,
 'bob': 60,
 'bosox': 61,
 'braves': 62,
 'breaker': 63,
 'bruins': 64,
 'burns': 65,
 'but': 66,
 'canada': 67,
 'canadian': 68,
 'canadiens': 69,
 'captains': 70,
 'catchers': 71,
 'cherry': 72,
 "coach's": 73,
 'community': 74,
 'conf': 75,
 'corner': 76,
 

Let's build a pipe to extract part-of-speech bigrams.

In [12]:
def get_pos(x):
    """Select the part-of-speech tag strings.

    Parameters
    ----------
    x : mapping-like (e.g. a pandas DataFrame)
        Any object that can be indexed with the key ``"pos"``.

    Returns
    -------
    Whatever is stored under ``x["pos"]`` (a Series when ``x`` is a
    DataFrame).
    """
    pos_tags = x["pos"]
    return pos_tags


# Counts of POS-tag unigrams and bigrams: project the "pos" column, split the
# tag string with `str.split`, and leave the case untouched (lowercase=False).
pos_pipe = Pipeline(steps=[
    ("proj", FunctionTransformer(func=get_pos, validate=False)),
    ("vect", CountVectorizer(
        analyzer="word",
        lowercase=False,
        tokenizer=str.split,
        ngram_range=(1, 2),
    )),
])

In [13]:
# Fit only, so that the learned POS n-gram vocabulary can be inspected.
pos_pipe.fit(data)

Pipeline(steps=[('proj', FunctionTransformer(accept_sparse=False,
          func=<function get_pos at 0x7f9dacb190d0>, pass_y=False,
          validate=False)), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content...pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<method 'split' of 'str' objects>, vocabulary=None))])

In [14]:
# Look up the fitted vectorizer by its step name rather than by position.
pos_pipe.named_steps["vect"].vocabulary_

{'!': 0,
 '! ,': 1,
 '! ^': 2,
 '$': 3,
 '$ &': 4,
 '$ ,': 5,
 '$ A': 6,
 '$ N': 7,
 '$ P': 8,
 '$ R': 9,
 '$ S': 10,
 '$ V': 11,
 '$ ^': 12,
 '&': 13,
 '& ,': 14,
 '& A': 15,
 '& D': 16,
 '& N': 17,
 '& R': 18,
 '& V': 19,
 '& Z': 20,
 '& ^': 21,
 ',': 22,
 ', !': 23,
 ', $': 24,
 ', &': 25,
 ', ,': 26,
 ', A': 27,
 ', D': 28,
 ', G': 29,
 ', L': 30,
 ', N': 31,
 ', O': 32,
 ', P': 33,
 ', R': 34,
 ', V': 35,
 ', Z': 36,
 ', ^': 37,
 'A': 38,
 'A $': 39,
 'A &': 40,
 'A ,': 41,
 'A A': 42,
 'A N': 43,
 'A P': 44,
 'A R': 45,
 'A U': 46,
 'A V': 47,
 'A ^': 48,
 'D': 49,
 'D $': 50,
 'D ,': 51,
 'D A': 52,
 'D D': 53,
 'D N': 54,
 'D S': 55,
 'D ^': 56,
 'E': 57,
 'E E': 58,
 'G': 59,
 'G ,': 60,
 'G G': 61,
 'G N': 62,
 'G ^': 63,
 'L': 64,
 'L P': 65,
 'L R': 66,
 'L V': 67,
 'N': 68,
 'N $': 69,
 'N &': 70,
 'N ,': 71,
 'N A': 72,
 'N E': 73,
 'N G': 74,
 'N N': 75,
 'N O': 76,
 'N P': 77,
 'N R': 78,
 'N V': 79,
 'N ^': 80,
 'O': 81,
 'O ,': 82,
 'O A': 83,
 'O N': 84,
 'O P': 85,


Finally, let's extract the manually-computed features.

In [15]:
def extract_manual(x):
    """Return the hand-crafted features as a dense float array.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing the ``"n_words"`` and ``"is_reply"`` columns.

    Returns
    -------
    numpy.ndarray
        Array with one row per input row and two columns, in the order
        ``[n_words, is_reply]``, cast to ``np.double``.
    """
    manual_columns = ["n_words", "is_reply"]
    selected = x[manual_columns]
    return selected.values.astype(np.double)


# Wrap the column extractor so it can be used as a transformer step.
manual = FunctionTransformer(extract_manual, validate=False)

In [16]:
# Sanity check: each row should hold [n_words, is_reply] as floats.
manual.fit_transform(data)

array([[ 5.,  1.],
       [ 4.,  0.],
       [ 3.,  1.],
       ..., 
       [ 4.,  1.],
       [ 4.,  0.],
       [ 5.,  0.]])

## Putting them all together, side by side

In [17]:
# Combine the three feature extractors into a single transformer.
union = FeatureUnion(transformer_list=[
    ("words", token_pipe),
    ("pos", pos_pipe),
    ("manual", manual),
])

In [18]:
# Fit all three extractors on the frame and show the combined feature matrix.
union.fit_transform(data)

<1197x481 sparse matrix of type '<class 'numpy.float64'>'
	with 19198 stored elements in Compressed Sparse Row format>

481 is 319 (from tokens) + 160 (POS tag unigrams and bigrams) + 2 (manual features).

FeatureUnion concatenates them left-to-right.

In [19]:
# Feature extraction -> scaling -> classifier.
# Different features can end up on different scales. Some classifiers are not
# impacted much, but it can impact interpretation.
# Puzzle: can we use `with_mean=True`? Why not?
full_pipe = Pipeline(steps=[
    ("union", union),
    ("scale", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression()),
])

In [21]:
# `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer `sklearn.model_selection` and fall back only on old installs.
try:
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import RandomizedSearchCV

from scipy.stats import randint, expon, uniform

In [22]:
# Randomized search over the token vocabulary cut-off and the classifier's C.
# Nested parameters are addressed as <step>__<sub-step>__<parameter>.
search = RandomizedSearchCV(
    estimator=full_pipe,
    param_distributions={
        "union__words__vect__min_df": randint(1, 11),  # That's a mouthful!
        "clf__C": expon(scale=10),
    },
    n_iter=40,
    scoring="accuracy",
    cv=3,
    n_jobs=2,
    random_state=0,
)

In [23]:
# The whole frame is the input (the pipeline picks its own columns);
# the newsgroup name is the label.
search.fit(data, data["target"])

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('words', Pipeline(steps=[('proj', FunctionTransformer(accept_sparse=False,
          func=<function get_tokenized at 0x7f9dacb19158>, pass_y=False,
          validate=False)), ('vect', CountVectorizer(analyzer='word', binary=False, de...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=40, n_jobs=2,
          param_distributions={'clf__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9dac1bb2e8>, 'union__words__vect__min_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9dacb24198>},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='accuracy', verbose=0)

In [24]:
# Best score found by the search (the search was configured with scoring='accuracy').
search.best_score_

0.91395154553049285

In [25]:
# `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20 in
# favour of `cv_results_`. Use whichever attribute this estimator exposes so
# the cell runs on both old and new versions.
if hasattr(search, "cv_results_"):
    search_scores = pd.DataFrame(search.cv_results_)[
        ["mean_test_score", "std_test_score", "params"]
    ]
else:
    search_scores = search.grid_scores_
search_scores

[mean: 0.87469, std: 0.01545, params: {'clf__C': 12.377127928425844, 'union__words__vect__min_df': 5},
 mean: 0.86466, std: 0.00938, params: {'clf__C': 7.440362212259898, 'union__words__vect__min_df': 9},
 mean: 0.86550, std: 0.00719, params: {'clf__C': 9.10738729525131, 'union__words__vect__min_df': 9},
 mean: 0.89307, std: 0.02050, params: {'clf__C': 10.574911418700976, 'union__words__vect__min_df': 3},
 mean: 0.87385, std: 0.02467, params: {'clf__C': 2.7707606872905277, 'union__words__vect__min_df': 6},
 mean: 0.85380, std: 0.01009, params: {'clf__C': 9.614176425640345, 'union__words__vect__min_df': 10},
 mean: 0.87302, std: 0.01127, params: {'clf__C': 23.15001589085309, 'union__words__vect__min_df': 6},
 mean: 0.90226, std: 0.02515, params: {'clf__C': 2.019699468304375, 'union__words__vect__min_df': 3},
 mean: 0.86383, std: 0.02008, params: {'clf__C': 0.5329835938608137, 'union__words__vect__min_df': 8},
 mean: 0.90977, std: 0.01748, params: {'clf__C': 7.54653097889178, 'union__wor