In [1]:
# Heterogeneous models

In [2]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.cross_validation import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [3]:
import numpy as np

In [4]:
import pandas as pd

# Combining multiple sources (and kinds) of data

Here is a small variation of the newsgroups data, that is somewhat closer to what you will have after working for a while on feature engineering and preprocessing.

In [5]:
data = pd.read_json("datasets/tagged_newsgroup_subjects.json")

In [6]:
data.head()

Unnamed: 0,pos,target,token
0,"N , N N N",rec.sport.hockey,Re : Goalie Mask Update
1,"^ N ^ ,",rec.sport.baseball,Tigers pound Mariners !!!!!!!
2,"N , N",rec.sport.baseball,RE : survey
3,"A U N , ^ N",rec.sport.hockey,Dear Montana@pinetree.org Re : Hockey Pool
4,"N , ! , A N , V A , ^",rec.sport.hockey,"Re : Goodbye , good riddance , get lost ' Stars"


Here, the "token" column is already tokenized (tokens are delimited by spaces) so there is no need to use a regular expression tokenizer.  We can just use `str.split`.

The `pos` column contains the part-of-speech tag labels corresponding to the tokens. `N` means noun, `^` means proper noun, for example.

This output was produced off-line using the CMU TweetNLP toolkit, which Vlad finds very robust to web text like this.

But before we start, suppose we do even more feature engineering.

In [7]:
data["is_reply"] = data.token.apply(lambda x: x.lower().startswith("re :"))
data["n_words"] = data.token.apply(lambda x: len(x.split()))
data.head()

Unnamed: 0,pos,target,token,is_reply,n_words
0,"N , N N N",rec.sport.hockey,Re : Goalie Mask Update,True,5
1,"^ N ^ ,",rec.sport.baseball,Tigers pound Mariners !!!!!!!,False,4
2,"N , N",rec.sport.baseball,RE : survey,True,3
3,"A U N , ^ N",rec.sport.hockey,Dear Montana@pinetree.org Re : Hockey Pool,False,6
4,"N , ! , A N , V A , ^",rec.sport.hockey,"Re : Goodbye , good riddance , get lost ' Stars",True,11


In [8]:
from sklearn.pipeline import FeatureUnion, Pipeline

Let's build a pipe to extract the subject body.

In [9]:
def get_tokenized(x):
    return x["token"]


token_pipe = Pipeline([
    ('proj', FunctionTransformer(
                func=get_tokenized,
                validate=False)),
    ('vect', CountVectorizer(analyzer='word',
                             token_pattern=r'(?u)\b\S+\b',
                             #tokenizer=unicode.split,
                             min_df=5))
])

And check that it works.

In [10]:
token_pipe.fit_transform(data)

<1197x299 sparse matrix of type '<type 'numpy.int64'>'
	with 4593 stored elements in Compressed Sparse Row format>

In [11]:
token_pipe.steps[1][1].vocabulary_

{u'1': 0,
 u'18': 1,
 u'19': 2,
 u'1964': 3,
 u'1988-1992': 4,
 u'1992': 5,
 u'1993': 6,
 u'2': 7,
 u'3': 8,
 u'93': 9,
 u'a': 10,
 u'aargh': 11,
 u'abc': 12,
 u'again': 13,
 u'ahl': 14,
 u'al': 15,
 u'all': 16,
 u'all-time': 17,
 u"america's": 18,
 u'an': 19,
 u'and': 20,
 u'another': 21,
 u'apr': 22,
 u'april': 23,
 u'are': 24,
 u'area': 25,
 u'astros': 26,
 u'at': 27,
 u'atlanta': 28,
 u'attendance': 29,
 u'averages': 30,
 u'back': 31,
 u'base': 32,
 u'baseball': 33,
 u'bay': 34,
 u'bbddd': 35,
 u'be': 36,
 u'beat': 37,
 u'best': 38,
 u'biggest': 39,
 u'bob': 40,
 u'bosox': 41,
 u'braves': 42,
 u'breaker': 43,
 u'bruins': 44,
 u'burns': 45,
 u'but': 46,
 u'canada': 47,
 u'canadian': 48,
 u'canadiens': 49,
 u'captains': 50,
 u'catchers': 51,
 u'cherry': 52,
 u"coach's": 53,
 u'community': 54,
 u'conf': 55,
 u'corner': 56,
 u'coverage': 57,
 u'cubs': 58,
 u'cup': 59,
 u'dave': 60,
 u'day': 61,
 u'defensive': 62,
 u'deja': 63,
 u'detroit': 64,
 u'devils': 65,
 u'div': 66,
 u'don': 67,


Let's build a pipe to extract part-of-speech bigrams.

In [12]:
def get_pos(x):
    return x["pos"]


pos_pipe = Pipeline([
    ('proj', FunctionTransformer(
                func=get_pos,
                validate=False)),
    ('vect', CountVectorizer(analyzer='word',
                             lowercase=False,
                             token_pattern=r'(?u)\b\S+\b',
                             ngram_range=(1, 2)))
])

In [13]:
pos_pipe.fit(data)

Pipeline(steps=[('proj', FunctionTransformer(accept_sparse=False,
          func=<function get_pos at 0x7fd9b0edb230>, pass_y=False,
          validate=False)), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'conte...      strip_accents=None, token_pattern='(?u)\\b\\S+\\b', tokenizer=None,
        vocabulary=None))])

In [14]:
pos_pipe.steps[1][1].vocabulary_

{u'A': 0,
 u'A A': 1,
 u'A D': 2,
 u'A N': 3,
 u'A P': 4,
 u'A R': 5,
 u'A U': 6,
 u'A V': 7,
 u'D': 8,
 u'D A': 9,
 u'D D': 10,
 u'D N': 11,
 u'D R': 12,
 u'D S': 13,
 u'D V': 14,
 u'E': 15,
 u'E E': 16,
 u'G': 17,
 u'G G': 18,
 u'G N': 19,
 u'L': 20,
 u'L P': 21,
 u'L R': 22,
 u'L V': 23,
 u'N': 24,
 u'N A': 25,
 u'N D': 26,
 u'N E': 27,
 u'N G': 28,
 u'N L': 29,
 u'N N': 30,
 u'N O': 31,
 u'N P': 32,
 u'N R': 33,
 u'N T': 34,
 u'N V': 35,
 u'N Z': 36,
 u'O': 37,
 u'O A': 38,
 u'O N': 39,
 u'O P': 40,
 u'O R': 41,
 u'O T': 42,
 u'O V': 43,
 u'P': 44,
 u'P A': 45,
 u'P D': 46,
 u'P E': 47,
 u'P G': 48,
 u'P N': 49,
 u'P O': 50,
 u'P P': 51,
 u'P R': 52,
 u'P V': 53,
 u'P X': 54,
 u'R': 55,
 u'R A': 56,
 u'R D': 57,
 u'R N': 58,
 u'R P': 59,
 u'R R': 60,
 u'R V': 61,
 u'S': 62,
 u'S A': 63,
 u'S N': 64,
 u'S V': 65,
 u'T': 66,
 u'T D': 67,
 u'T P': 68,
 u'T R': 69,
 u'T V': 70,
 u'U': 71,
 u'U N': 72,
 u'V': 73,
 u'V A': 74,
 u'V D': 75,
 u'V G': 76,
 u'V N': 77,
 u'V O': 78,
 u'V P': 

Finally, let's extract the manually-computed features.

In [15]:
def extract_manual(x):
    res = x[["n_words", "is_reply"]]  # select cols
    return res.values.astype(np.double)  # convert to numpy


manual = FunctionTransformer(func=extract_manual,
                             validate=False)

In [16]:
manual.fit_transform(data)

array([[ 5.,  1.],
       [ 4.,  0.],
       [ 3.,  1.],
       ..., 
       [ 4.,  1.],
       [ 4.,  0.],
       [ 5.,  0.]])

## Putting them all together, side by side

In [17]:
union = FeatureUnion([
    ("words", token_pipe),
    ("pos", pos_pipe),
    ("manual", manual)
])

In [18]:
union.fit_transform(data)

<1197x393 sparse matrix of type '<type 'numpy.float64'>'
	with 12040 stored elements in Compressed Sparse Row format>

FeatureUnion concatenates them left-to-right.

In [19]:
full_pipe = Pipeline([
    ('union', union),
    ('scale', StandardScaler(with_mean=False)), 
        # Different features can end up on different scales.
        # Some classifiers are not impacted much, but it can impact interpretation.
        # Puzzle: can we use `with_mean=True?` Why not?
    ('clf', LogisticRegression())
])

In [20]:
from sklearn.grid_search import RandomizedSearchCV

from scipy.stats import randint, expon, uniform

In [21]:
search = RandomizedSearchCV(
    full_pipe,
    {
        'union__words__vect__min_df': randint(1, 11),  # That's a mouthful!
        'clf__C': expon(scale=10)
    },
    scoring='accuracy',
    cv=3,
    n_jobs=2,
    n_iter=40,
    random_state=0
)

In [24]:
search.fit(data, data.target)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('words', Pipeline(steps=[('proj', FunctionTransformer(accept_sparse=False,
          func=<function get_tokenized at 0x7fd9b37fbf50>, pass_y=False,
          validate=False)), ('vect', CountVectorizer(analyzer='word', binary=False, de...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=40, n_jobs=2,
          param_distributions={'union__words__vect__min_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd9b0ed2c10>, 'clf__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd9b0edebd0>},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='accuracy', verbose=0)

In [25]:
search.best_score_

0.91478696741854637

In [29]:
search.grid_scores_

[mean: 0.91312, std: 0.00515, params: {'clf__C': 2.944138242601366, 'union__words__vect__min_df': 1},
 mean: 0.85213, std: 0.02156, params: {'clf__C': 10.383496509863521, 'union__words__vect__min_df': 6},
 mean: 0.85464, std: 0.01842, params: {'clf__C': 5.283913558856205, 'union__words__vect__min_df': 6},
 mean: 0.86132, std: 0.01316, params: {'clf__C': 1.5733142152836868, 'union__words__vect__min_df': 6},
 mean: 0.85380, std: 0.01740, params: {'clf__C': 6.192893708175707, 'union__words__vect__min_df': 6},
 mean: 0.86550, std: 0.01654, params: {'clf__C': 0.8301803940456774, 'union__words__vect__min_df': 6},
 mean: 0.84628, std: 0.00313, params: {'clf__C': 22.691879959868714, 'union__words__vect__min_df': 9},
 mean: 0.88053, std: 0.01654, params: {'clf__C': 36.53869157600396, 'union__words__vect__min_df': 5},
 mean: 0.88805, std: 0.01030, params: {'clf__C': 0.9460239418670159, 'union__words__vect__min_df': 5},
 mean: 0.83459, std: 0.02077, params: {'clf__C': 2.0439487093114552, 'union__