# Tabular Playground Series - May 2022: Feature Boundary

- https://www.kaggle.com/code/ambrosm/tpsmay22-eda-which-makes-sense
- https://www.kaggle.com/code/wti200/analysing-interactions-with-shap

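I was interested in investigating the difference in performance between
interaction features built with pre-determined (explicit) boundaries and
the same interaction features left as plain, unthresholded sums.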


In [None]:
from pathlib import Path
from warnings import simplefilter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Default seed used when building models.
RANDOM_STATE = 42

# Directory the competition CSV files are read from.
input_dir = Path('input')

# Silence every warning for the remainder of the notebook.
simplefilter(action="ignore")

### Load training data

In [None]:
# Read each CSV, use its `id` column as the index, and order rows by that index.
train = pd.read_csv(input_dir / 'train.csv').set_index('id').sort_index()
test = pd.read_csv(input_dir / 'test.csv').set_index('id').sort_index()

# Show the first two training rows as a quick sanity check.
display(train.head(2))

#### Make features

f_27 features and interaction features borrowed from https://www.kaggle.com/code/ambrosm/tpsmay22-eda-which-makes-sense

In [None]:
def make_features(X_in, boundaries=True):
    """
    generate features for incoming dataframe

    X_in: dataframe holding the numeric columns f_00, f_01, f_02, f_05, f_21,
        f_22, f_26 and the string column f_27 (characters 0-9 of every f_27
        value are read); it is not modified
    boundaries: specifies whether interaction features incorporate explicit boundaries
        - True: each interaction is a ternary flag: +1 above its upper
          threshold, -1 below its lower threshold, 0 otherwise
        - False: each interaction is the plain sum of its source columns

    returns: new dataframe with the float64/int64 columns of X_in followed by
        f_27_0 .. f_27_9, f_27_count and the three interaction features
    """

    # start with float and int features
    # (explicit copy: the columns added below must never write into X_in,
    # and must not trigger chained-assignment warnings)
    X = X_in.select_dtypes(['float64', 'int64']).copy()

    # manufacture features from f_27:
    # - feature for each character position, with ordinal-encoding (10 features)
    # - feature with total number of distinct characters
    f_27 = X_in["f_27"]
    for i in range(10):
        X[f"f_27_{i}"] = f_27.str[i].apply(ord) - ord("A")
    # computed once, after the loop: it does not depend on the position i
    X["f_27_count"] = f_27.apply(lambda s: len(set(s)))

    # interaction features: (name, sum of source columns, lower, upper)
    # all sums are taken before any interaction column is added
    interactions = (
        ("f_21_f_02", X["f_02"] + X["f_21"], -5.3, 5.2),
        ("f_26_f_00_f_01", X["f_01"] + X["f_00"] + X["f_26"], -5.0, 5.0),
        ("f_22_f_05", X["f_22"] + X["f_05"], -5.4, 5.1),
    )
    for name, total, lower, upper in interactions:
        if boundaries:
            # ternary feature based on explicit boundaries
            X[name] = (total > upper).astype('int') - (total < lower).astype('int')
        else:
            # raw sum, no thresholding
            X[name] = total

    return X

In [None]:
def make_xgb(random_state=RANDOM_STATE):
    """
    build an unfitted XGBClassifier for binary classification

    random_state: seed forwarded to the classifier

    returns: XGBClassifier configured with 500 estimators, the
        binary:logistic objective, auc as eval metric and n_jobs=4
        (the GPU tree method is left disabled)
    """

    params = dict(
        n_estimators=500,
        objective='binary:logistic',
        eval_metric='auc',
        random_state=random_state,
        n_jobs=4,
    )
    return XGBClassifier(**params)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Target column and the 3-fold stratified splitter shared by both experiments.
y = train["target"]
skf = StratifiedKFold(n_splits=3)

# Experiment 1: interaction features thresholded at explicit boundaries.
X = make_features(train.drop(columns="target"), boundaries=True)
scores1 = cross_val_score(
    estimator=make_xgb(), X=X, y=y, scoring="roc_auc", cv=skf, verbose=2
)

# Experiment 2: interaction features kept as raw sums.
X = make_features(train.drop(columns="target"), boundaries=False)
scores2 = cross_val_score(
    estimator=make_xgb(), X=X, y=y, scoring="roc_auc", cv=skf, verbose=2
)


In [None]:
# Mean ROC AUC across folds: first with boundaries, then without.
print(np.mean(scores1), np.mean(scores2), sep="\n")