In [17]:
import pandas as pd
import numpy as np
from scipy import sparse

from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

A pipeline chains several processing steps together into a single object, typically once the initial exploration is done.

For example, some components transform features — normalising numerical values, turning text into vectors, or filling in missing data; these are **transformers**. Other components predict a target variable by fitting an algorithm, such as a random forest or a support vector machine; these are **estimators**.

A `Pipeline` chains all of these together so that they can be applied to the training data en bloc.

In [11]:
# Read the toxic-comment dataset from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='./data/toxic_comment.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [12]:
# Keep only the leading rows of the frame (presumably to keep the
# cross-validation below fast): the comment text is the model input and
# the `toxic` label column is the target.
n_samples = 5000
x = df['comment_text'].values[:n_samples]
y = df['toxic'].values[:n_samples]

In [13]:
# Shared defaults used by the vectorizer and the cross-validation cells.
max_features = 2500   # vocabulary size cap passed to TfidfVectorizer
scoring = 'roc_auc'   # metric name handed to cross_val_score
cv = 3                # number of cross-validation folds
n_jobs = -1           # -1 asks scikit-learn to use all available processors

Define a transformer and an estimator:

In [18]:
# Transformer: converts raw text into TF-IDF feature vectors, with the
# vocabulary capped at `max_features` terms.
tfidf = TfidfVectorizer(
    max_features=max_features,
)

# The estimators (LogisticRegression and DecisionTreeClassifier) are
# instantiated in the following cell.

In [20]:
# Candidate estimators to compare behind the same TF-IDF transformer.
# DecisionTreeClassifier permutes features at every split and breaks ties
# between equally good splits at random, so its random_state is pinned to
# make the cross-validated score repeatable from one run to the next.
classifiers = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=0),
]

In [22]:
# Cross-validate the shared TF-IDF transformer with each candidate
# classifier and print the mean score over the folds.
for clf in classifiers:
    steps = [('tfidf', tfidf), ('clf', clf)]
    pipeline = Pipeline(steps=steps)

    results = cross_val_score(
        pipeline,
        x,
        y,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
    )
    print(results.mean())

0.9219150528004635
0.7591568823977255


#### Reference:
- Tutorial on Kaggle: https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial