In [1]:
# importing libraries

from seaborn import load_dataset # for dataset

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer # for handling NaN values
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler # handling the preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline # for automation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
# load data

columns = ['alive', 'class', 'embarked', 'who','alone', 'adult_male']
data = load_dataset('titanic').drop(columns=columns)
data['deck'] = data['deck'].astype('object')
print(data.shape)
data.head()

(891, 9)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,deck,embark_town
0,0,3,male,22.0,1,0,7.25,,Southampton
1,1,1,female,38.0,1,0,71.2833,C,Cherbourg
2,1,3,female,26.0,0,0,7.925,,Southampton
3,1,1,female,35.0,1,0,53.1,C,Southampton
4,0,3,male,35.0,0,0,8.05,,Southampton


In [5]:
# References
SEED = 101
TARGET = 'survived'
FEATURES = data.columns.drop(TARGET)

NUMERICAL = data[FEATURES].select_dtypes('number').columns
CATEGORICAL = pd.Index(np.setdiff1d(FEATURES, NUMERICAL))

print(NUMERICAL) # numerical values
print(CATEGORICAL) # categorical values

Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')
Index(['deck', 'embark_town', 'sex'], dtype='object')


In [6]:
# split the dataset

X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=TARGET), data[TARGET], test_size=0.2, random_state=SEED, stratify=data[TARGET])

Elegant Approach #1

In [15]:
# using sklearn's Pipeline
numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessors = ColumnTransformer(transformers=[
    ('num', numerical_pipe, NUMERICAL),
    ('cat', categorical_pipe, CATEGORICAL)
])

pipe = Pipeline([
    ('preprocessors', preprocessors),
    ('model', LogisticRegression())
])

pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessors',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('encoder',
                                            

In [16]:
def calculate_roc_auc(model_pipe, X, y) :
    y_proba = model_pipe.predict_proba(X)[:,1]
    return roc_auc_score(y,y_proba)

print(f'Train ROC-AUC : {calculate_roc_auc(pipe, X_train, y_train)}')
print(f'Test ROC-AUC : {calculate_roc_auc(pipe, X_test, y_test)}')

Train ROC-AUC : 0.867230719166938
Test ROC-AUC : 0.8269433465085639


Elegant Approach #2

In [33]:
# making custom transformers

class Imputer(BaseEstimator, TransformerMixin) :
    def __init__(self, features, method='constant', value='missing') :
        self.features = features
        self.method = method
        self.value = value

    def fit(self, X, y=None) :
        if self.method == 'mean' :
            self.value = X[self.features].mean()
        return self
    
    def transform(self, X) :
        X_transformed = X.copy()
        X_transformed[self.features] = X[self.features].fillna(self.value)
        return X_transformed

class Scaler(BaseEstimator, TransformerMixin) :
    def __init__(self, features) :
        self.features = features

    def fit(self, X, y=None) :
        self.min = X[self.features].min()
        self.range = X[self.features].max() - self.min
        return self

    def transform(self, X) :
        X_transformed = X.copy()
        X_transformed[self.features] = (X[self.features] - self.min)/ self.range
        return X_transformed

class Encoder(BaseEstimator, TransformerMixin) :
    def __init__(self, features, drop='first') :
        self.features = features
        self.drop = drop
    
    def fit(self, X, y=None) :
        self.encoder = OneHotEncoder(sparse=False, drop=self.drop)
        self.encoder.fit(X[self.features])
        return self

    def transform(self, X) :
        X_transformed = pd.concat([X.drop(columns=self.features).reset_index(drop=True), 
                                   pd.DataFrame(self.encoder.transform(X[self.features]),columns=self.encoder.get_feature_names(self.features))], axis=1)
    
        return X_transformed


pipe = Pipeline([
    ('num_imputer', Imputer(NUMERICAL, method='mean')),
    ('scaler', Scaler(NUMERICAL)),
    ('cat_imputer', Imputer(CATEGORICAL)),
    ('encoder', Encoder(CATEGORICAL)),
    ('model', LogisticRegression())
])

pipe.fit(X_train, y_train)

Pipeline(steps=[('num_imputer',
                 Imputer(features=Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object'),
                         method='mean',
                         value=pclass     2.308989
age       29.638307
sibsp      0.528090
parch      0.404494
fare      32.633151
dtype: float64)),
                ('scaler',
                 Scaler(features=Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object'))),
                ('cat_imputer',
                 Imputer(features=Index(['deck', 'embark_town', 'sex'], dtype='object'))),
                ('encoder',
                 Encoder(features=Index(['deck', 'embark_town', 'sex'], dtype='object'))),
                ('model', LogisticRegression())])

In [35]:
print(f'Train ROC-AUC : {calculate_roc_auc(pipe, X_train, y_train) :.4f}')
print(f'Test ROC-AUC : {calculate_roc_auc(pipe, X_test, y_test) : .4f}')

Train ROC-AUC : 0.8677
Test ROC-AUC :  0.8289


Source : https://towardsdatascience.com/from-ml-model-to-ml-pipeline-9f95c32c6512