# Pipelines in Sklearn  

In this notebook I am demonstrating the use of **pipelines** in scikit-learn.  

In [1]:
import numpy as np
import pandas as pd

In [2]:
test = pd.read_csv("./data/test.csv")
train = pd.read_csv("./data/train.csv")
train.head()

Unnamed: 0,age,cost_of_ad,device_type,gender,in_initial_launch_location,income,n_drivers,n_vehicles,prior_ins_tenure,outcome
0,56,0.005737,iPhone,M,0,62717,2,1,4,0
1,50,0.004733,desktop,F,0,64328,2,3,2,0
2,54,0.004129,laptop,M,0,83439,1,3,7,0
3,16,0.005117,Android,F,0,30110,2,3,0,0
4,37,0.003635,desktop,M,0,76565,2,1,5,0


In [3]:
# Easy dirty way of creating features
def features(df):
    df['dv'] = df['n_vehicles'] / df['n_drivers']
    df['di'] = df['income'] / df['n_drivers']
    df['ci'] = df['income'] / df['cost_of_ad']
features(train)
features(test)

# Categorizing features 
cat_cols = ['gender', 'in_initial_launch_location', 'device_type']
response = ['outcome']
num_cols = [x for x in train.columns if x not in cat_cols + response]

y = train['outcome']
X = train.drop('outcome', axis=1)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

In [6]:
# Binary categoricals
# Imputing missing gender values
# in_initial_launch_location encoded as (0, 1)
# gender encoded as {0: female, 1: missing value, 2: male}
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='G')),
    ('ordinal', OrdinalEncoder())
])

# One-hot encoding device feature as it has multiple values
device_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse=False))
])

# Numbers are scaled using Z-score
num_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# Combining the transformers of all feature groups
preprocessor = ColumnTransformer([
    ('cat', cat_transformer, ['gender', 'in_initial_launch_location']),
    ('device', device_transformer, ['device_type']),
    ('num', num_transformer, num_cols)
])

# Preprocessing + Model
hgb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', HistGradientBoostingClassifier(random_state=7))
])

# Parameter distribution used for randomized search hyperparameter optimization
param_dist = {
    'classifier__learning_rate': np.logspace(-3, 100, 10),
    'classifier__max_iter': np.logspace(1, 4, 20, dtype='int'),
    'classifier__max_depth': [2, 10, 100, 500, 1000],
    'classifier__max_leaf_nodes': np.logspace(1, 2, 7, dtype='int'),
    'classifier__min_samples_leaf': np.logspace(0, 2, 20, dtype='int'),
    'classifier__l2_regularization': np.linspace(0, 1, 10)
}

# Fit and predict
CV = RandomizedSearchCV(hgb, param_dist, random_state=17)
CV.fit(X_train, y_train)
pred = CV.predict(test)

### AUC Estimation with Validation Set  

In [7]:
y_val_prob = CV.predict_proba(X_val)

In [8]:
roc_auc_score(y_val, y_val_prob[:, 1])

0.877050432146251