Find the best encoder for the categorical features across all classifiers:
**We will try**

1. Baseline encoder (all categorical columns encoded with a simple ordinal mapping)
2. Level-2 encoder (every encoder applied to every combination of categorical columns)

**Desired output**: the best encoder overall, chosen by averaging the scores of all classifiers. 

## Import libraries and functions

In [16]:
!pip install category_encoders



In [17]:
# libraries
from sklearn.model_selection import train_test_split , cross_val_score

from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier 
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.basen import BaseNEncoder
from category_encoders.count import CountEncoder
from category_encoders.glmm import GLMMEncoder
from sklearn.preprocessing import MinMaxScaler

In [18]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statistics
from sklearn.metrics import accuracy_score
import warnings
warnings.simplefilter('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [19]:
# Load the filtered dataset and the train/test CSV files from the data/ folder.
data_all_filtered= pd.read_csv('data/data_filtered.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# Cabin_location is dropped before any further processing.
data_all_filtered.drop(columns=['Cabin_location'],inplace=True)
# Notebook display: shapes of the three frames.
data_all_filtered.shape, train.shape, test.shape 

((1309, 13), (891, 12), (418, 11))

In [20]:
data_all_filtered.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
0,0.0,3,0,22.0,1,0,7.25,0,S,cheap,medium,Mr,adult
1,1.0,1,1,38.0,1,0,71.2833,0,C,very_high,medium,Mrs,adult
2,1.0,3,1,26.0,0,0,7.925,1,S,cheap,single,Miss,adult


In [21]:
# Columns that hold categorical values and therefore need an encoder.
categorical_columns = ['Embarked','Fare_bin','Family_size','Title','age_bin']

# Keyword arguments shared by every encoder built in this cell.
encoder_kwargs = {'cols': categorical_columns, 'return_df': True}

# All candidate encoders, each configured for every categorical column.
encoder_list = [
  OrdinalEncoder(**encoder_kwargs),
  WOEEncoder(**encoder_kwargs),
  TargetEncoder(**encoder_kwargs),
  MEstimateEncoder(**encoder_kwargs),
  JamesSteinEncoder(**encoder_kwargs),
  CatBoostEncoder(**encoder_kwargs),
  BaseNEncoder(base=3, **encoder_kwargs),
  BaseNEncoder(base=2, **encoder_kwargs),
  BaseNEncoder(base=4, **encoder_kwargs),
  OneHotEncoder(**encoder_kwargs),
  CountEncoder(handle_unknown=0, **encoder_kwargs),
]

# Encoders that do not depend on the target variable.
non_target_encoder_list = [
  OrdinalEncoder(**encoder_kwargs),
  BaseNEncoder(base=2, **encoder_kwargs),
  BaseNEncoder(base=3, **encoder_kwargs),
  BaseNEncoder(base=4, **encoder_kwargs),
  OneHotEncoder(**encoder_kwargs),
  CountEncoder(handle_unknown=0, **encoder_kwargs),
]

# Classifiers every encoding is scored with; seeded where a seed is passed.
classifiers_list = [
  XGBClassifier(random_state=42),
  RandomForestClassifier(random_state=42),
  LGBMClassifier(random_state=42),
  KNeighborsClassifier(),
  SVC(),
]

In [22]:
data_all_filtered.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
0,0.0,3,0,22.0,1,0,7.25,0,S,cheap,medium,Mr,adult
1,1.0,1,1,38.0,1,0,71.2833,0,C,very_high,medium,Mrs,adult


In [23]:
#train_full and  test_full split
def split_trainANDtest(data_all_filtered):
  """Split the combined frame back into its train and test parts.

  The first ``train.shape[0]`` rows of ``data_all_filtered`` become the
  training part and all remaining rows the test part. The row count is read
  from the module-level ``train`` DataFrame, so this presumably relies on the
  combined frame listing the training rows first -- confirm against the
  notebook that produced ``data_filtered.csv``.

  Independent copies are returned rather than slices of the input, so callers
  can assign columns on the results without pandas chained-assignment
  warnings and without any risk of writing through to ``data_all_filtered``.

  Returns:
    Tuple ``(data_train, data_test)``.
  """
  n_train_rows = train.shape[0]
  data_train = data_all_filtered.iloc[:n_train_rows, :].copy()
  data_test = data_all_filtered.iloc[n_train_rows:, :].copy()
  return data_train, data_test

# classifier CV score and pred score
def train_clasifier(classifier_X,X_train,y_train,X_val,y_val):
  """Score one classifier with 5-fold CV accuracy and hold-out accuracy.

  Returns:
    Tuple ``(scores_cv, scores_pred)``: the per-fold accuracy values returned
    by ``cross_val_score`` on the training data, and the accuracy on
    ``(X_val, y_val)`` after fitting on the full training data.
  """
  cv_accuracies = cross_val_score(classifier_X, X_train, y_train, cv=5, scoring='accuracy')
  # fit() is called on the object that was passed in, so the caller's
  # classifier instance is fitted once this function returns.
  fitted_model = classifier_X.fit(X_train, y_train)
  val_predictions = fitted_model.predict(X_val)
  holdout_accuracy = accuracy_score(y_val, val_predictions)
  return cv_accuracies, holdout_accuracy

# min max scaler
def Min_Max_Scaler_X(train_enc, val_enc):
  """Min-max scale both sets with a scaler fitted on the training set only.

  Returns:
    Tuple ``(train_enc, val_enc)`` holding the scaled outputs of
    ``MinMaxScaler``.
  """
  min_max = MinMaxScaler()
  # Tuple elements are evaluated left to right, so the scaler is fitted on
  # the training data before the validation data is transformed.
  return min_max.fit_transform(train_enc), min_max.transform(val_enc)

# create a nested list of combination of all possible combination of features 
def all_combinations(categorical_columns):
  """Return every non-empty combination of the given columns.

  Combinations are ordered by size (single columns first, the full set last)
  and, within one size, in the order produced by ``itertools.combinations``.

  Args:
    categorical_columns: iterable of column names.

  Returns:
    A list of lists; an empty input yields an empty list.
  """
  # Imported locally so the function works even when the cell that imports
  # itertools at notebook level has not been executed yet.
  import itertools

  columns = list(categorical_columns)
  # Start at size 1: the empty combination is never wanted.
  return [
    list(combo)
    for size in range(1, len(columns) + 1)
    for combo in itertools.combinations(columns, size)
  ]


## Find best Encoding

## 1. Baseline Encoder

In [24]:
data_train, data_test = split_trainANDtest(data_all_filtered)
data_train.shape, data_test.shape

((891, 13), (418, 13))

In [25]:
# Hand-written integer codes for every categorical column (baseline encoding).
baseline_codes = {
  'Fare_bin': {'cheap':0,'medium':1,'high':2,'very_high':3},
  'Family_size': {'single':0,'medium':1,'large':2},
  'age_bin': {'baby':0,'young':1,'adult':2,'elderly':3},
  'Embarked': {'S':0,'C':1,'Q':2},
  'Title': {'Mr':0,'Miss':1,'Mrs':2, 'Master':3,'Rev':4,'Dr':5, 'Other_t':6,'Col':7,'Major':8 },
}
# Replace each categorical column by its integer codes; values missing from a
# mapping become NaN, which is how Series.map treats unmapped keys.
for column_name, codes in baseline_codes.items():
  data_train[column_name] = data_train[column_name].map(codes)
data_train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
0,0.0,3,0,22.0,1,0,7.25,0,0,0,1,0,2
1,1.0,1,1,38.0,1,0,71.2833,0,1,3,1,2,2


In [26]:
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(data_train.drop(columns=['Survived']),data_train['Survived'] , test_size=0.2, random_state=42, shuffle=True)
X_train_1.shape, y_train_1.shape, X_val_1.shape, y_val_1.shape

((712, 12), (712,), (179, 12), (179,))

In [27]:
# iterate through all encoders and all classifiers
def find_best_encoder(classifiers_list,X_train, y_train,X_val,y_val):
  """Score one (already encoded) dataset with every classifier.

  The data is min-max scaled once, then each classifier gets a combined score:
  the average of its mean 5-fold CV accuracy and its hold-out accuracy on
  ``(X_val, y_val)``, rounded to 4 decimals.

  Args:
    classifiers_list: classifiers to evaluate; each one is fitted in place.
    X_train, y_train: training features and labels.
    X_val, y_val: hold-out features and labels.

  Returns:
    Tuple ``(all_scores, best_score, mean_score, std_score)``: the combined
    score of each classifier in input order, the highest of them, their mean
    and their sample standard deviation (both rounded to 4 decimals). With a
    single classifier the standard deviation is reported as 0.0.

  Raises:
    statistics.StatisticsError: if ``classifiers_list`` is empty.
  """
  # Scaling does not depend on the classifier, so it is done once up front.
  train_enc, val_enc = Min_Max_Scaler_X(X_train, X_val)

  statistics_1 = []
  for classifier in classifiers_list:
    score_CV, pred_score = train_clasifier(classifier, train_enc, y_train, val_enc, y_val)
    mean_score_cv = round(score_CV.mean(), 4)
    # Combined score: average of CV accuracy and hold-out accuracy.
    total_score = round((mean_score_cv + pred_score) / 2, 4)
    statistics_1.append(total_score)

  best_score = max(statistics_1, default=0)
  mean_score = round(statistics.mean(statistics_1), 4)
  # statistics.stdev needs at least two values; one classifier has no spread.
  std_score = round(statistics.stdev(statistics_1), 4) if len(statistics_1) > 1 else 0.0
  # all scores, max score, mean score, std of scores
  return statistics_1, best_score, mean_score, std_score

In [28]:
X_train_1.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
331,1,0,46.0,0,0,28.5,1,0,1,0,0,2
733,2,0,23.0,0,0,13.0,1,0,1,0,0,2


In [29]:
X_val_1.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
709,3,0,17.0,1,1,15.2458,0,1,1,1,3,1
439,2,0,31.0,0,0,10.5,1,0,1,0,0,2


In [30]:
result1, best_score_baseline, best_mean_baseline, best_std_baseline = find_best_encoder(classifiers_list, X_train_1, y_train_1,X_val_1,y_val_1)

In [31]:
# for each classifier, result
result1, best_score_baseline, best_mean_baseline, best_std_baseline

([0.8284, 0.8207, 0.8277, 0.813, 0.82], 0.8284, 0.822, 0.0063)

**Our goal**:

Find an encoder with a higher mean score and a lower standard deviation than the baseline.

## Level2 - Encoder
- Apply each encoder to every combination of categorical columns, and apply the simple ordinal mapping to the remaining columns.

In [32]:
import itertools

In [33]:
# Categorical columns to experiment with (same values as the earlier definition).
categorical_columns = ['Embarked','Fare_bin','Family_size','Title','age_bin']
# Every non-empty subset of the categorical columns; each subset is a
# candidate set of columns to hand to the encoder under test.
col_to_apply = all_combinations(categorical_columns)
# Candidate encoders; the search loop below is sized from this list and
# rebuilds the same encoders, in the same order, for each column subset.
encoder_list = [OrdinalEncoder(cols=categorical_columns, return_df=True), WOEEncoder(cols=categorical_columns,return_df=True),
                TargetEncoder(cols=categorical_columns, return_df=True), MEstimateEncoder(cols=categorical_columns,return_df=True), 
                JamesSteinEncoder(cols=categorical_columns, return_df=True),
                CatBoostEncoder(cols=categorical_columns,return_df=True), BaseNEncoder(cols=categorical_columns,return_df=True,base=3),
                BaseNEncoder(cols=categorical_columns,return_df=True,base=2),BaseNEncoder(cols=categorical_columns,return_df=True,base=4),
                OneHotEncoder(cols=categorical_columns,return_df=True),CountEncoder(cols=categorical_columns,return_df=True, handle_unknown=0)]
# Notebook display: the first five subsets (the single-column ones).
col_to_apply[0:5]

[['Embarked'], ['Fare_bin'], ['Family_size'], ['Title'], ['age_bin']]

In [34]:
len(encoder_list)

11

In [35]:
data_train, data_test = split_trainANDtest(data_all_filtered)
data_train.shape, data_test.shape

((891, 13), (418, 13))

In [36]:
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(data_train.drop(columns=['Survived']),data_train['Survived'] , test_size=0.2, random_state=42, shuffle=True)
X_train_2.shape, y_train_2.shape, X_val_2.shape, y_val_2.shape

((712, 12), (712,), (179, 12), (179,))

In [37]:
X_train_2.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
331,1,0,46.0,0,0,28.5,1,S,medium,single,Mr,adult
733,2,0,23.0,0,0,13.0,1,S,medium,single,Mr,adult


In [38]:
X_val_2.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
709,3,0,17.0,1,1,15.2458,0,C,medium,medium,Master,young
439,2,0,31.0,0,0,10.5,1,S,medium,single,Mr,adult


In [39]:
#maximize min
statistics_2 = []
#maximize max value
statistics_3 = []

for try_i in range(len(encoder_list)):
  print(f"{try_i}---------------------------------------------------------------------------------------------------------------------------------------------------")
  final_result = []
  for i, col_enc in enumerate(col_to_apply):
    X_train_df = X_train_2.copy()
    X_val_df = X_val_2.copy()
    other_cols = list(set(categorical_columns) - set(col_enc))
    if 'Fare_bin' in other_cols:
      X_train_df['Fare_bin'] = X_train_df['Fare_bin'].map({'cheap':0,'medium':1,'high':2,'very_high':3})
      X_val_df['Fare_bin'] = X_val_df['Fare_bin'].map({'cheap':0,'medium':1,'high':2,'very_high':3})
    if 'Family_size' in other_cols:
      X_train_df['Family_size'] = X_train_df['Family_size'].map({'single':0,'medium':1,'large':2})
      X_val_df['Family_size'] = X_val_df['Family_size'].map({'single':0,'medium':1,'large':2})
    if 'age_bin' in other_cols:
      X_train_df['age_bin'] = X_train_df['age_bin'].map({'baby':0,'young':1,'adult':2,'elderly':3})
      X_val_df['age_bin'] = X_val_df['age_bin'].map({'baby':0,'young':1,'adult':2,'elderly':3})
    if 'Embarked' in other_cols:
      X_train_df['Embarked'] = X_train_df['Embarked'].map({'S':0,'C':1,'Q':2})
      X_val_df['Embarked'] = X_val_df['Embarked'].map({'S':0,'C':1,'Q':2})
    if 'Title' in other_cols:
      X_train_df['Title'] = X_train_df['Title'].map({'Mr':0,'Miss':1,'Mrs':2, 'Master':3,'Rev':4,'Dr':5, 'Other_t':6,'Col':7,'Major':8 })
      X_val_df['Title'] = X_val_df['Title'].map({'Mr':0,'Miss':1,'Mrs':2, 'Master':3,'Rev':4,'Dr':5, 'Other_t':6,'Col':7,'Major':8 })

    encoder_list = [OrdinalEncoder(cols=col_enc, return_df=True), WOEEncoder(cols=col_enc,return_df=True),
                  TargetEncoder(cols=col_enc, return_df=True), MEstimateEncoder(cols=col_enc,return_df=True), 
                  JamesSteinEncoder(cols=col_enc, return_df=True),
                  CatBoostEncoder(cols=col_enc,return_df=True), BaseNEncoder(cols=col_enc,return_df=True,base=3),
                  BaseNEncoder(cols=col_enc,return_df=True,base=2),BaseNEncoder(cols=col_enc,return_df=True,base=4),
                  OneHotEncoder(cols=col_enc,return_df=True),CountEncoder(cols=col_enc,return_df=True, handle_unknown=0)]
    #encode each combination
    encoder_try = encoder_list[try_i]
    X_train_df_enc = encoder_try.fit_transform(X_train_df,y_train_2)
    X_val_df_enc = encoder_try.transform(X_val_df)


    result2_all_score, result2_best_score, result2_mean, result2_std =  find_best_encoder(classifiers_list, X_train_df_enc,y_train_2,X_val_df_enc,y_val_2)
    final_result.append((result2_mean, result2_std, result2_best_score, try_i, i))
  #print((result2_mean, result2_std, result2_best_score, try_i, i))
  # append best average score, std and i_th iteration over all combination to detect columns
  statistics_2.append([y for y in final_result if y[0]==max([x[0] for x in final_result])] )
  statistics_3.append([y for y in final_result if y[2]==max([x[2] for x in final_result])] )
#X_val_df.head()

0---------------------------------------------------------------------------------------------------------------------------------------------------
1---------------------------------------------------------------------------------------------------------------------------------------------------
2---------------------------------------------------------------------------------------------------------------------------------------------------
3---------------------------------------------------------------------------------------------------------------------------------------------------
4---------------------------------------------------------------------------------------------------------------------------------------------------
5---------------------------------------------------------------------------------------------------------------------------------------------------
6---------------------------------------------------------------------------------------------------------

In [40]:
statistics_2

[[(0.8243, 0.0047, 0.8291, 0, 11),
  (0.8243, 0.0047, 0.8291, 0, 17),
  (0.8243, 0.0047, 0.8291, 0, 22),
  (0.8243, 0.0047, 0.8291, 0, 26)],
 [(0.8217, 0.0078, 0.8312, 1, 6), (0.8217, 0.0106, 0.834, 1, 9)],
 [(0.8228, 0.0081, 0.834, 2, 2)],
 [(0.8228, 0.0082, 0.834, 3, 2)],
 [(0.8228, 0.0081, 0.834, 4, 2)],
 [(0.821, 0.0106, 0.8361, 5, 2)],
 [(0.8234, 0.0081, 0.834, 6, 3), (0.8234, 0.0077, 0.8347, 6, 12)],
 [(0.8227, 0.0084, 0.834, 7, 2)],
 [(0.8235, 0.0083, 0.8347, 8, 8)],
 [(0.8229, 0.0098, 0.8319, 9, 6)],
 [(0.8246, 0.0056, 0.8333, 10, 24)]]

In [41]:
# Pick the overall winner: highest mean score first; ties on the mean are
# broken in favour of the LOWEST standard deviation, matching the stated goal.
candidates = [entry for group in statistics_2 for entry in group]
top_mean = max(entry[0] for entry in candidates)
lowest_std = min(entry[1] for entry in candidates if entry[0] == top_mean)
# Kept as a nested list so that best_search[0][0] is still the winning tuple.
best_search = [[entry for entry in candidates if entry[0] == top_mean and entry[1] == lowest_std]]
best_search

[[(0.8246, 0.0056, 0.8333, 10, 24)]]

In [42]:
print(f"best mean score: {best_search[0][0][0]}\nStd: {best_search[0][0][1]}\nMaxclassifier_score: {best_search[0][0][2]}\nEncoder_idx: {best_search[0][0][3]}\nCols_to_dec_inx: {best_search[0][0][4]}")

best mean score: 0.8246
Std: 0.0056
Maxclassifier_score: 0.8333
Encoder_idx: 10
Cols_to_dec_inx: 24


In [43]:
col_to_apply[24]

['Family_size', 'Title', 'age_bin']

**Result**:
- best encoder CountEncoder(cols=col_enc,return_df=True, handle_unknown=0)
- features to encode: ['Family_size', 'Title', 'age_bin']

## Testing - best encoder

In [44]:
data_train, data_test = split_trainANDtest(data_all_filtered)
data_train.shape, data_test.shape

((891, 13), (418, 13))

In [45]:
#test
X_train_3, X_val_3, y_train_3, y_val_3 = train_test_split(data_train.drop(columns=['Survived']),data_train['Survived'] , test_size=0.2, random_state=42, shuffle=True)
X_train_3.shape, y_train_3.shape, X_val_3.shape, y_val_3.shape

((712, 12), (712,), (179, 12), (179,))

In [46]:
X_train_3.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
331,1,0,46.0,0,0,28.5,1,S,medium,single,Mr,adult
733,2,0,23.0,0,0,13.0,1,S,medium,single,Mr,adult


In [47]:
label_encoder_cols = list(set(categorical_columns) - set(col_to_apply[24]))
counter_encoder_cols = col_to_apply[24]
label_encoder_cols, counter_encoder_cols

(['Fare_bin', 'Embarked'], ['Family_size', 'Title', 'age_bin'])

In [48]:
# X_train_df['Fare_bin'] = X_train_df['Fare_bin'].map({'cheap':0,'medium':1,'high':2,'very_high':3})
# X_train_df['Family_size'] = X_train_df['Family_size'].map({'single':0,'medium':1,'large':2})
# X_train_df['age_bin'] = X_train_df['age_bin'].map({'baby':0,'young':1,'adult':2,'elderly':3})
# X_train_df['Embarked'] = X_train_df['Embarked'].map({'S':0,'C':1,'Q':2})
# X_train_df['Title'] = X_train_df['Title'].map({'Mr':0,'Miss':1,'Mrs':2, 'Master':3,'Rev':4,'Dr':5, 'Other_t':6,'Col':7,'Major':8 })

In [49]:
# Ordinal codes for the two columns that keep the hand-written mapping.
fare_codes = {'cheap':0,'medium':1,'high':2,'very_high':3}
port_codes = {'S':0,'C':1,'Q':2}
for frame in (X_train_3, X_val_3):
  frame['Fare_bin'] = frame['Fare_bin'].map(fare_codes)
  frame['Embarked'] = frame['Embarked'].map(port_codes)

# Count-encode the selected columns: fit on train, reuse the fit on validation.
best_encoder = CountEncoder(cols=counter_encoder_cols,return_df=True, handle_unknown=0)
X_train_3 = best_encoder.fit_transform(X_train_3,y_train_3)
X_val_3 = best_encoder.transform(X_val_3)

In [50]:
X_train_3.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
331,1,0,46.0,0,0,28.5,1,0,1,429,419,502
733,2,0,23.0,0,0,13.0,1,0,1,429,419,502


In [51]:
X_val_3.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,is_Alone,Embarked,Fare_bin,Family_size,Title,age_bin
709,3,0,17.0,1,1,15.2458,0,1,1,206,33,104
439,2,0,31.0,0,0,10.5,1,0,1,429,419,502


In [52]:
all_scores_best, best_score_best, mean_score_best, std_score_best =  find_best_encoder(classifiers_list, X_train_3,y_train_3,X_val_3,y_val_3)

In [53]:
statistics.mean(all_scores_best)

0.82462

In [54]:
#test is correct
print(best_search[0][0][0] ==mean_score_best)
print(mean_score_best)

True
0.8246


## testing 2 - typical categorical feature encoding

In [55]:
data_train, data_test = split_trainANDtest(data_all_filtered)
data_train.shape, data_test.shape
#test
X_train_4, X_val_4, y_train_4, y_val_4 = train_test_split(data_train.drop(columns=['Survived']),data_train['Survived'] , test_size=0.2, random_state=42, shuffle=True)
X_train_4.shape, y_train_4.shape, X_val_4.shape, y_val_4.shape

((712, 12), (712,), (179, 12), (179,))

In [56]:
categorical_columns

['Embarked', 'Fare_bin', 'Family_size', 'Title', 'age_bin']

In [57]:
ordinal_columns = ['Fare_bin','Family_size','age_bin'] # label encoding
nominal_columns = ['Embarked', 'Title'] 

# Integer codes for the ordered categories.
ordinal_codes = {
  'Fare_bin': {'cheap':0,'medium':1,'high':2,'very_high':3},
  'Family_size': {'single':0,'medium':1,'large':2},
  'age_bin': {'baby':0,'young':1,'adult':2,'elderly':3},
}
for frame in (X_train_4, X_val_4):
  for column_name in ordinal_columns:
    frame[column_name] = frame[column_name].map(ordinal_codes[column_name])

# Binary (base-2) encoding for the unordered categories: fit on train only.
enc2 =  BaseNEncoder(cols=nominal_columns,return_df=True,base=2)
X_train_4 = enc2.fit_transform(X_train_4,y_train_4)
X_val_4 = enc2.transform(X_val_4)

In [58]:
test2_1, test2_2, test2_3, test2_4  =  find_best_encoder(classifiers_list, X_train_4,y_train_4,X_val_4,y_val_4)

In [59]:
statistics.mean(test2_1)

0.82114

**result**
- Not an improvement

# Conclusion:
- Best encoder: CountEncoder(cols=counter_encoder_cols,return_df=True, handle_unknown=0)
- Features to encode with CountEncoder: ['Family_size', 'Title', 'age_bin']
- Features to encode with the manual label (ordinal) mapping: ['Fare_bin', 'Embarked']

## Future Work:
- Other encoding methods could be tried for the remaining columns instead of the default label encoding. 