In [1]:
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import get_scorer
from sklearn.feature_selection import SelectKBest, f_classif

import category_encoders as ce

import warnings



In [2]:
df, target = pd.read_csv('datasets/titanic.csv'), 'Survived'
# df, target = pd.read_csv('datasets/attrition.csv'), 'Attrition'
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
X = df.drop(target, axis=1)
y = df[target]

numeric_features = X.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns
categorical_features = X.select_dtypes(include=['object', 'category', 'boolean']).columns
        
y = preprocessing.LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

num_features = len(numeric_features) + len(categorical_features)

In [4]:
encoder_list = [
    ce.backward_difference.BackwardDifferenceEncoder, 
    ce.basen.BaseNEncoder,
    ce.binary.BinaryEncoder,
    ce.cat_boost.CatBoostEncoder,
    ce.hashing.HashingEncoder,
    ce.helmert.HelmertEncoder,
    ce.james_stein.JamesSteinEncoder,
    ce.one_hot.OneHotEncoder,
    ce.leave_one_out.LeaveOneOutEncoder,
    ce.m_estimate.MEstimateEncoder,
    ce.ordinal.OrdinalEncoder,
    ce.sum_coding.SumEncoder,
    ce.target_encoder.TargetEncoder,
    ce.woe.WOEEncoder
]

for encoder in encoder_list:
    
    numeric_transformer = Pipeline(steps=[
        ('num_imputer', SimpleImputer(strategy='median')),
        ('num_scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('cat_encoder', encoder())
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('feature_selection', SelectKBest(f_classif, k=int(0.9*num_features))),
            ('classifier', RandomForestClassifier(n_estimators=250, n_jobs=-1))
        ]
    )
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        model = pipe.fit(X_train, y_train)

        print(f"{encoder.__name__:>30}, {get_scorer('roc_auc')(model, X_test, y_test)}")

     BackwardDifferenceEncoder, 0.8468181557353532
                  BaseNEncoder, 0.8231192976415905
                 BinaryEncoder, 0.8290009754977907
               CatBoostEncoder, 0.8505766913410225
                HashingEncoder, 0.7149251161990016
                HelmertEncoder, 0.8317840133126757
             JamesSteinEncoder, 0.7892924771905663
                 OneHotEncoder, 0.8378091467263442
            LeaveOneOutEncoder, 0.8297756355081196
              MEstimateEncoder, 0.7859356171458083
                OrdinalEncoder, 0.8572043380960579
                    SumEncoder, 0.8355712400298387
                 TargetEncoder, 0.8665289493314972
                    WOEEncoder, 0.8770012050266827
