# Model Building
In this stage, you will build several machine learning models on the cleaned data set and attempt to train a model that performs better than baseline. Depending on your data set, this may mean different things.
## Imports

In [1]:
import os
import sys
import pandas as pd

In [2]:
src_path = os.path.abspath('../src/')
sys.path.append(src_path)

from ed_data_modeling import *

In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
pd.set_option("display.max_rows", 500)

#For dataset
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler, PowerTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.utils import check_random_state
from sklearn.feature_selection import RFE

from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy.stats import mstats

from xgboost import XGBClassifier

from category_encoders.target_encoder import TargetEncoder

## Functions
For your convenience, we have included a few pre-written functions, which you might find useful in your model building. They are by no means necessary, but feel free to use any or all of them.

In [4]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, col_names=None, col_nums=None):
        self.col_names=col_names
        self.col_nums=col_nums
        self.use = None
        assert (self.col_names is not None) or (self.col_nums is not None), 'Must set either col_names or col_nums'
        
    def fit(self, X, y=None):
        if self.col_names is not None:
            self.use = 'col_names'
        elif self.col_nums is not None:
            self.use = 'col_nums'
        return self
    
    def transform(self, X, y=None):
        if self.use == 'col_names':
            _X = X[self.col_names].copy()
        elif self.use == 'col_nums':
            _X = X[:, self.col_nums]
        return(_X)

In [5]:
def encode_dataset(X_train, y_train, X_test, cat_columns, encoder):
    noncat_train = X_train.drop(columns=cat_columns)
    noncat_test = X_test.drop(columns=cat_columns)
    cat_train = X_train[cat_columns]
    cat_test = X_test[cat_columns]
    
    cat_train_encoded = encoder.fit_transform(cat_train, y_train)
    cat_test_encoded = encoder.transform(cat_test)
    
    X_train_encoded = noncat_train.join(cat_train_encoded)
    X_test_encoded = noncat_test.join(cat_test_encoded)
    return(X_train_encoded, X_test_encoded)

In [6]:
class Winsorizer(BaseEstimator, TransformerMixin):
    def __init__(self, limits=(0.01, 0.01)):
        self.limits=limits

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return mstats.winsorize(X, limits=self.limits)

In [7]:
# Custom Transformer: Temporary Imputation + Variance Threshold
class TemporarySimpleImputerVarianceThreshold(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='median', fill_value=None, threshold=0.001, constant_fill_value=0):
        self.strategy = strategy
        self.fill_value = fill_value
        self.threshold = threshold
        self.constant_fill_value = constant_fill_value
        self.imputer = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
        self.selector = VarianceThreshold(threshold=self.threshold)
        self.selected_features = None  # Store selected feature names

    def fit(self, X, y=None):
        X = pd.DataFrame(X)

        # Identify columns that are entirely NaN
        all_nan_columns = X.columns[X.isna().all()]

        # Fill entirely NaN columns with a constant value
        X[all_nan_columns] = X[all_nan_columns].fillna(self.constant_fill_value)

        # Temporarily impute remaining missing values with the median value of each column
        X_imputed = pd.DataFrame(self.imputer.fit_transform(X), columns=X.columns, index=X.index)

        # Apply VarianceThreshold on the imputed data
        self.selector.fit(X_imputed)

        # Store the names of selected features
        self.selected_features = X.columns[self.selector.get_support()]
        return self

    def transform(self, X):
        X = pd.DataFrame(X)

        # Keep only selected features, but retain original NaNs
        return X[self.selected_features]

In [8]:
class VIFSelector(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=10):
        self.threshold = threshold
        self.selected_features_ = None

    def fit(self, X, y=None):
        X = X if isinstance(X, np.ndarray) else X.to_numpy()
        vif = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
        
        self.selected_features_ = np.where(np.array(vif) < self.threshold)[0]
        
        return self

    def transform(self, X):
        if self.selected_features_ is None:
            raise ValueError("VIFSelector has not been fitted yet.")

        X = X if isinstance(X, np.ndarray) else X.to_numpy()

        if len(self.selected_features_) == 0:
            raise ValueError("No features were selected after VIF filtering. Try increasing the threshold.")
        
        return X[:, self.selected_features_].reshape(X.shape[0], -1)

### score_classification
score_classification takes the predicted results from a model and scores them on every classification metric ever. It also gives the confusion matrix.

Parameters:
- y_train: (1d array-like) The correct y values for the training data set
- y_train_pred: (1d array-like) The predicted y values from the training data set
- y_test: (1d array-like) The correct y values for the test data set
- y_test_pred: (1d array-like) The predicted y values from the test data set

This function uses [sklearn](https://scikit-learn.org/stable/modules/classes.html).metrics to calculate each score. The required functions are imported inside the function.

### downsample
Takes a dataframe and the name (string) of its target column and [downsamples](https://developers.google.com/machine-learning/data-prep/construct/sampling-splitting/imbalanced-data) the majority class to equal the minority class.

Parameters:
- df: a Pandas DataFrame containing the data to be downsampled
- target: string. The name of the target variable.

This function uses the Python libraries [Pandas](https://pandas.pydata.org/docs/reference/index.html) (pd), which has been imported above, and [resample](https://scikit-learn.org/stable/modules/generated/sklearn.utils.resample.html) from the [sklearn](https://scikit-learn.org/stable/modules/classes.html) library, which is imported inside the function.

### scaled_model_search 
Takes a list of scalers and models, along with test-train split data, and runs a search over every possible combination of scaler and model. It prints out the best result. Currently the metric used is accuracy, but it would be simple enough to change depending on the situation.

Parameters:
- scalers: a list of initialized scaler functions (ex: scalers = [StandardScaler(), RobustScaler(), QuantileTransformer(random_state = 42)]
- models: a list of initialized model function (ex: models = [LogisticRegression(), ExtraTreesClassifier(random_state = 42), RandomForestClassifier(random_state = 42)]
- X_train: DataFrame containing the training data set without the target variable
- y_train: DataFrame containing the target variable for the training data.
- X_test: DataFrame containing the test data set without the target variable
- y_test: DataFrame containing the target variable for the test data.

This function uses the [sklearn](https://scikit-learn.org/stable/modules/classes.html) function [accuracy_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html) as a metric to compare the models, and it has been imported inside the function. It also uses [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) from [sklearn](https://scikit-learn.org/stable/modules/classes.html), which has been imported inside the function.

## Data
Read in the clean data set from your data_preparation notebook. It should be ready for some preliminary model-building by now, but you should consider your variables and decide if you want to use all of them to train a model. You should have a clear reason for excluding any variables. Also consider time-series data (if applicable to your set). If you have data from multiple years, should you train and test on each year individually? Train on one year and test on another?

In [9]:
dtype_dict = {
    'c3':'object',
    'v1':'object',
    'v2':'object',
    'v3':'object',
    'v4':'object',
    'v5':'object',
    'v6':'object',
    'v7':'object',
    'v8':'object',
    'v9':'object',
    'v10':'object',
    'v11':'object',
    'v12':'object',
    'v13':'object',
    'v14':'object',
    'v15':'object',
    'v17':'object',
    'v18':'object',
    'v19':'object',
    'v21':'object',
    'v22':'object',
    'v23':'object',
    'v24':'object',
    'v29':'object',
    'v31':'object',
    'v34':'object',
    'v35':'object',
    'v36':'object',
    'v39':'object',
    'v41':'object',
    'v42':'object',
    'v43':'object',
    'v44':'object',
    'v45':'object',
    'v46':'object',
    'v47':'object',
    'v48':'object',
    'v49':'object',
    'v50':'object',
    'v51':'object',
    'v52':'object',
    'v53':'object',
    'v54':'object',
    'v55':'object',
    'v57':'object',
    'v58':'object',
    'v59':'object',
    'v60':'object',
    'v61':'object',
    'v62':'object',
    'v63':'object',
    'v64':'object',
    'v65':'object',
    'v66':'object',
    'v67':'object',
    'v68':'object',
    'v69':'object',
    'v70':'object',
    'v71':'object',
    'v72':'object',
    'v73':'object',
    'v74':'object',
    'v75':'object',
    'v76':'object',
    'v77':'object',
    'v78':'object',
    'v79':'object',
    'v80':'object',
    'v81':'object',
    'v82':'object',
    'v83':'object',
    'v84':'object',
    'v85':'object',
    'v86':'object',
    'v87':'object',
    'v88':'object',
    'v89':'object',
    'v90':'object',
    'v91':'object',
    'v92':'object',
    'v93':'object',
    'v94':'object',
    'v98':'object',
    'v100':'object',
    'v104':'object',
    'v107':'object',
    'v108':'object',
    'v109':'object',
    'v110':'object',
    'v111':'object',
    'v112':'object',
    'v113':'object',
    'v114':'object',
    'v115':'object',
    'v116':'object',
    'v117':'object',
    'v118':'object',
    'v119':'object',
    'v120':'object',
    'v121':'object',
    'v122':'object',
    'v123':'object',
    'v124':'object',
    'v125':'object',
    'v138':'object',
    'v139':'object',
    'v140':'object',
    'v141':'object',
    'v142':'object',
    'v146':'object',
    'v147':'object',
    'v148':'object',
    'v149':'object',
    'v153':'object',
    'v154':'object',
    'v155':'object',
    'v156':'object',
    'v157':'object',
    'v158':'object',
    'v169':'object',
    'v170':'object',
    'v172':'object',
    'v173':'object',
    'v174':'object',
    'v175':'object',
    'v176':'object',
    'v181':'object',
    'v183':'object',
    'v184':'object',
    'v185':'object',
    'v186':'object',
    'v188':'object',
    'v189':'object',
    'v190':'object',
    'v191':'object',
    'v192':'object',
    'v193':'object',
    'v194':'object',
    'v195':'object',
    'v196':'object',
    'v197':'object',
    'v198':'object',
    'v199':'object',
    'v200':'object',
    'v220':'object',
    'v223':'object',
    'v235':'object',
    'v236':'object',
    'v237':'object',
    'v238':'object',
    'v239':'object',
    'v241':'object',
    'v242':'object',
    'v244':'object',
    'v246':'object',
    'v247':'object',
    'v249':'object',
    'v250':'object',
    'v251':'object',
    'v252':'object',
    'v257':'object',
    'v260':'object',
    'v262':'object',
    'v281':'object',
    'v282':'object',
    'v284':'object',
    'v286':'object',
    'v287':'object',
    'v288':'object',
    'v289':'object',
    'v297':'object',
    'v300':'object',
    'v301':'object',
    'v302':'object',
    'v303':'object',
    'v304':'object',
    'v305':'object',
    'v325':'object',
    'v326':'object',
    'v327':'object',
    'v328':'object',
    'id_32':'object'
}
    
    

In [10]:
df = pd.read_csv('../data/interim/df_train.csv', index_col='transactionid', dtype=dtype_dict)
df.head()

Unnamed: 0_level_0,isfraud,transactiondt,transactionamt,productcd,card1,card2,card3,card4,card5,card6,...,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38
transactionid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T


## Data Splitting
Once you have an idea of how you plan to use the data, split your data into train and test groups or, if you prefer a more complicated approach, multiple folds. 

In [11]:
df.shape

(590537, 434)

In [12]:
X = df.drop(columns='isfraud')
y = df['isfraud']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Baseline Model
Before anything else, let's build a baseline model. This will serve as a "sanity check" for everything that comes after. Choose a simplistic model and, without any preprocessing or tuning, train a model on the training set. How well does it perform on the test set?

In [14]:
categorical_columns = X.select_dtypes(include='object').columns.to_list()
numeric_columns = [c for c in X.columns if c not in categorical_columns]

#### Baseline Logistic Regression Model

In [15]:
lr_baseline_pipe = Pipeline([
    ('column selector', ColumnSelector(col_names=numeric_columns)),
    ('simple imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('variance threshold', VarianceThreshold(threshold=0.01)),
    ('winsorize', Winsorizer(limits=(0.01, 0.01))),
    ('power_transform', PowerTransformer(method='yeo-johnson')),
    ('robust scaler', RobustScaler()),
    ('logistic_regression', LogisticRegression(max_iter=1000, random_state=42)),
])

In [16]:
# Evaluate with cross-validation
scores = cross_val_score(lr_baseline_pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.85497351 0.85349982 0.85588585 0.85850373 0.85732817]
Mean cross-validation score: 0.8560382156946511


In [17]:
lr_baseline_pipe.fit(X_train, y_train)

In [18]:
lr_pipe_y_preds_proba = lr_baseline_pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, lr_pipe_y_preds_proba)

np.float64(0.8524419838601197)

In [19]:
lr_pipe_y_preds = lr_baseline_pipe.predict(X_test)
test_score_classification(y_test, lr_pipe_y_preds, lr_pipe_y_preds_proba)

                       Test
Accuracy           0.970840
Balanced_Accuracy  0.620342
Precision          0.800375
Recall             0.242925
f1                 0.372723
ROC_AUC            0.852442
Brier_Loss         0.025755
Log_Loss           0.108741
Jaccard            0.229047
[[142051    319]
 [  3986   1279]]


#### Baseline Random Forest Model

In [20]:
rf_baseline_pipe = Pipeline([
    ('column selector', ColumnSelector(col_names=numeric_columns)),
    ('simple imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('random_forest', RandomForestClassifier(random_state=42)),
])

In [21]:
# Evaluate with cross-validation
scores = cross_val_score(rf_baseline_pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.90685953 0.90809343 0.90937031 0.9126309  0.91057612]
Mean cross-validation score: 0.9095060587885959


In [22]:
rf_baseline_pipe.fit(X_train, y_train)

In [23]:
rf_pipe_y_preds_proba = rf_baseline_pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, rf_pipe_y_preds_proba)

np.float64(0.9203362364999883)

In [24]:
rf_pipe_y_preds = rf_baseline_pipe.predict(X_test)
test_score_classification(y_test, rf_pipe_y_preds, rf_pipe_y_preds_proba)

                       Test
Accuracy           0.978528
Balanced_Accuracy  0.711942
Precision          0.940311
Recall             0.424881
f1                 0.585296
ROC_AUC            0.920336
Brier_Loss         0.018103
Log_Loss           0.126876
Jaccard            0.413723
[[142228    142]
 [  3028   2237]]


#### Baseline XGBoost Model

In [25]:
xgb_baseline_pipe = Pipeline([
    ('column selector', ColumnSelector(col_names=numeric_columns)),
    #('simple imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('xgboost', XGBClassifier(random_state=42)),
])

In [26]:
# Evaluate with cross-validation
scores = cross_val_score(xgb_baseline_pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.93129726 0.9326791  0.93288734 0.93529311 0.93319912]
Mean cross-validation score: 0.9330711842127523


In [27]:
xgb_baseline_pipe.fit(X_train, y_train)

In [28]:
xgb_pipe_y_preds_proba = xgb_baseline_pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, xgb_pipe_y_preds_proba)

np.float64(0.9373520616031913)

In [29]:
xgb_pipe_y_preds = xgb_baseline_pipe.predict(X_test)
test_score_classification(y_test, xgb_pipe_y_preds, xgb_pipe_y_preds_proba)

                       Test
Accuracy           0.979876
Balanced_Accuracy  0.742272
Precision          0.905587
Recall             0.486420
f1                 0.632893
ROC_AUC            0.937352
Brier_Loss         0.016804
Log_Loss           0.070785
Jaccard            0.462943
[[142103    267]
 [  2704   2561]]


## Model Improvement
Now you can work on improving on the baseline. There's no linear approach to this process and the steps you take will depend on the data. Below are some steps that are commonly used in building robust models. You can use any, all, or only some of them, and you are encouraged to add your own steps for your specific data set.

As you go through this process, keep in mind all that you learned during the data understanding phase and consider the following questions:
- What sort of model should you train? (ie, classification, regression? Neural network?)
- Given the distribution of your data, the presence or absence of missing data, and various other factors, is there a particular model (or ensemble) that you think will work well? (ie, RandomForest, ExtraTrees, SVM...?)
- Depending on what sort of model you train and what your data look like, you may find different evaluation metrics useful. How can you certain that you have the most well-rounded view of how well your model is performing? What metric or metrics will best capture your model priorities (and what are your model priorities)?

#### Categorical Encoding w/ Target Encoding

In [30]:
# Categorical pipeline
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, categorical_columns),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])

In [31]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.93728813 0.93920939 0.9396116  0.93700107 0.93863961]
Mean cross-validation score: 0.9383499602643909


In [32]:
pipe.fit(X_train, y_train)

In [33]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9404142770722809)

In [34]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980838
Balanced_Accuracy  0.753014
Precision          0.918557
Recall             0.507692
f1                 0.653945
ROC_AUC            0.940414
Brier_Loss         0.016202
Log_Loss           0.068737
Jaccard            0.485823
[[142133    237]
 [  2592   2673]]


#### Categorical Encoding w/ OneHotEncoding

In [35]:
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, categorical_columns),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [36]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.93241193 0.93665394 0.93520814 0.93752416 0.93503623]
Mean cross-validation score: 0.9353668802656125


In [38]:
pipe.fit(X_train, y_train)

In [39]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9370398032599806)

In [40]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.979917
Balanced_Accuracy  0.741470
Precision          0.910128
Recall             0.484710
f1                 0.632544
ROC_AUC            0.937040
Brier_Loss         0.016910
Log_Loss           0.071252
Jaccard            0.462570
[[142118    252]
 [  2713   2552]]


#### Categorical Encoding w/ OneHotEncoding Variables Having Less Than 10 Unique Values and Target Encoding The Rest

In [41]:
categorical_columns_ohe = [c for c in categorical_columns if X[c].nunique() <= 10]
categorical_columns_te = [c for c in categorical_columns if X[c].nunique() >= 10]

In [42]:
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])


In [43]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.93895764 0.93966116 0.94138777 0.93555543 0.93774502]
Mean cross-validation score: 0.9386614030448776


In [44]:
pipe.fit(X_train, y_train)

In [45]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9418659771320678)

In [46]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980702
Balanced_Accuracy  0.752395
Precision          0.913982
Recall             0.506553
f1                 0.651839
ROC_AUC            0.941866
Brier_Loss         0.016155
Log_Loss           0.068078
Jaccard            0.483503
[[142119    251]
 [  2598   2667]]


#### Categorical Encoding w/ OneHotEncoding Variables Having Less Than 5 Unique Values and Target Encoding The Rest

In [47]:
categorical_columns_ohe = [c for c in categorical_columns if X[c].nunique() <= 5]
categorical_columns_te = [c for c in categorical_columns if X[c].nunique() >= 5]

In [48]:
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [49]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.9370522  0.93770065 0.93669726 0.9371378  0.93747928]
Mean cross-validation score: 0.937213437093182


In [50]:
pipe.fit(X_train, y_train)

In [51]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9435952066632689)

In [52]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980967
Balanced_Accuracy  0.754818
Precision          0.919085
Recall             0.511301
f1                 0.657066
ROC_AUC            0.943595
Brier_Loss         0.015927
Log_Loss           0.067212
Jaccard            0.489277
[[142133    237]
 [  2573   2692]]


#### Best Categorical Encoding w/ Imputing 0

In [53]:
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [54]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.93564596 0.93940633 0.9391285  0.93533272 0.94060926]
Mean cross-validation score: 0.9380245525228064


In [55]:
pipe.fit(X_train, y_train)

In [56]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9406061857067454)

In [57]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980818
Balanced_Accuracy  0.754101
Precision          0.914198
Recall             0.509972
f1                 0.654718
ROC_AUC            0.940606
Brier_Loss         0.016102
Log_Loss           0.068235
Jaccard            0.486678
[[142118    252]
 [  2580   2685]]


#### Best Categorical Encoding w/ Imputing -1

In [58]:
# Categorical pipeline
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [59]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.93886481 0.93859614 0.94025981 0.93764753 0.93932582]
Mean cross-validation score: 0.9389388204758461


In [60]:
pipe.fit(X_train, y_train)

In [61]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9408693844490243)

In [62]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980824
Balanced_Accuracy  0.752184
Precision          0.920525
Recall             0.505983
f1                 0.653021
ROC_AUC            0.940869
Brier_Loss         0.016200
Log_Loss           0.068475
Jaccard            0.484804
[[142140    230]
 [  2601   2664]]


#### Best Categorical Encoding w/ Imputing Mean

In [63]:
# Categorical pipeline
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [64]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))



Cross-validation scores: [0.93766448 0.9401515  0.94002295 0.93741638 0.93589845]
Mean cross-validation score: 0.9382307513131772


In [65]:
pipe.fit(X_train, y_train)



In [66]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)



np.float64(0.9394586394305435)

In [67]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)



                       Test
Accuracy           0.980540
Balanced_Accuracy  0.750664
Precision          0.911562
Recall             0.503134
f1                 0.648391
ROC_AUC            0.939459
Brier_Loss         0.016248
Log_Loss           0.068939
Jaccard            0.479717
[[142113    257]
 [  2616   2649]]


#### Best Categorical Encoding w/ imputing median

In [68]:
# Categorical pipeline
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [69]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))



Cross-validation scores: [0.93901714 0.94067972 0.93803928 0.93584211 0.93527537]
Mean cross-validation score: 0.9377707209016867


In [70]:
pipe.fit(X_train, y_train)



In [71]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)



np.float64(0.9409384099494377)

In [72]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)



                       Test
Accuracy           0.980811
Balanced_Accuracy  0.751811
Precision          0.921053
Recall             0.505223
f1                 0.652521
ROC_AUC            0.940938
Brier_Loss         0.016236
Log_Loss           0.068615
Jaccard            0.484253
[[142142    228]
 [  2605   2660]]


### Scaling
Some models assume data have a normal distribution and performance will suffer when they do not. Most models will suffer if different variables have vastly differing scales. Do you need to scale your data? If so, how should you go about doing so?

#### Robust Scaler w/ Best Categorical Encoding and No Imputing

In [73]:
# Categorical pipelines
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('scaler', RobustScaler())
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [74]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

  return fnb._ureduce(a, func=_nanmedian, keepdims=keepdims,
  return _nanquantile_unchecked(
  return fnb._ureduce(a, func=_nanmedian, keepdims=keepdims,
  return _nanquantile_unchecked(
  return fnb._ureduce(a, func=_nanmedian, keepdims=keepdims,
  return _nanquantile_unchecked(
  return fnb._ureduce(a, func=_nanmedian, keepdims=keepdims,
  return _nanquantile_unchecked(
  return fnb._ureduce(a, func=_nanmedian, keepdims=keepdims,
  return _nanquantile_unchecked(


Cross-validation scores: [0.9370522  0.93770065 0.93669726 0.9371378  0.93747928]
Mean cross-validation score: 0.937213437093182


In [75]:
pipe.fit(X_train, y_train)

  return fnb._ureduce(a, func=_nanmedian, keepdims=keepdims,
  return _nanquantile_unchecked(


In [76]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9435952066632689)

In [77]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980967
Balanced_Accuracy  0.754818
Precision          0.919085
Recall             0.511301
f1                 0.657066
ROC_AUC            0.943595
Brier_Loss         0.015927
Log_Loss           0.067212
Jaccard            0.489277
[[142133    237]
 [  2573   2692]]


### Feature Selection and Engineering
Are all of your variables necessary, or do you have a lot of them taking up time and computing power without assing much to model building? Can some variables be combined to make a better model? Are variables linearly related to your target variable, or would it be worthwhile to include some polynomial features? 

#### Variance Threshold using Temporarily Imputed Median, Best Categorical Encoding, and No Imputing

In [78]:
# Categorical pipelines
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('variance threshold', TemporarySimpleImputerVarianceThreshold(strategy='median', threshold=0.01))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [79]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.9370522  0.93770065 0.93669726 0.9371378  0.93747928]
Mean cross-validation score: 0.937213437093182


In [80]:
pipe.fit(X_train, y_train)

In [81]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9435952066632689)

In [82]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980967
Balanced_Accuracy  0.754818
Precision          0.919085
Recall             0.511301
f1                 0.657066
ROC_AUC            0.943595
Brier_Loss         0.015927
Log_Loss           0.067212
Jaccard            0.489277
[[142133    237]
 [  2573   2692]]


#### Variance Threshold using Temporarily Imputed -1, Best Categorical Encoding, and No Imputing

In [83]:
# Categorical pipelines
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('variance threshold', TemporarySimpleImputerVarianceThreshold(strategy='constant', fill_value=-1, threshold=0.01))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [84]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.9370522  0.93770065 0.93669726 0.9371378  0.93747928]
Mean cross-validation score: 0.937213437093182


In [85]:
pipe.fit(X_train, y_train)

In [86]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9435952066632689)

In [87]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980967
Balanced_Accuracy  0.754818
Precision          0.919085
Recall             0.511301
f1                 0.657066
ROC_AUC            0.943595
Brier_Loss         0.015927
Log_Loss           0.067212
Jaccard            0.489277
[[142133    237]
 [  2573   2692]]


#### RFE for 99.9% of features, Best Categorical Encoding, and No Imputing

In [92]:
# Categorical pipelines
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rfe', RFE(estimator=XGBClassifier(random_state=42), n_features_to_select=0.999, step=3)),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [93]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.9370522  0.93770065 0.93669726 0.9371378  0.93747928]
Mean cross-validation score: 0.937213437093182


In [94]:
pipe.fit(X_train, y_train)

In [95]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9435952066632689)

In [96]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980967
Balanced_Accuracy  0.754818
Precision          0.919085
Recall             0.511301
f1                 0.657066
ROC_AUC            0.943595
Brier_Loss         0.015927
Log_Loss           0.067212
Jaccard            0.489277
[[142133    237]
 [  2573   2692]]


#### RFE for 99% of features, Best Categorical Encoding, and No Imputing

In [98]:
# Categorical pipelines
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rfe', RFE(estimator=XGBClassifier(random_state=42), n_features_to_select=0.99, step=3)),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [99]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.9370522  0.93770065 0.93669726 0.9371378  0.93747928]
Mean cross-validation score: 0.937213437093182


In [100]:
pipe.fit(X_train, y_train)

In [101]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9435952066632689)

In [102]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980967
Balanced_Accuracy  0.754818
Precision          0.919085
Recall             0.511301
f1                 0.657066
ROC_AUC            0.943595
Brier_Loss         0.015927
Log_Loss           0.067212
Jaccard            0.489277
[[142133    237]
 [  2573   2692]]


#### RFE for 95% of features, Best Categorical Encoding, and No Imputing

In [103]:
# Categorical pipelines
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rfe', RFE(estimator=XGBClassifier(random_state=42), n_features_to_select=0.95, step=3)),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [104]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.9370522  0.93770065 0.93669726 0.9371378  0.93747928]
Mean cross-validation score: 0.937213437093182


In [105]:
pipe.fit(X_train, y_train)

In [106]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9435952066632689)

In [107]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980967
Balanced_Accuracy  0.754818
Precision          0.919085
Recall             0.511301
f1                 0.657066
ROC_AUC            0.943595
Brier_Loss         0.015927
Log_Loss           0.067212
Jaccard            0.489277
[[142133    237]
 [  2573   2692]]


#### RFE for 90% features, Best Categorical Encoding, and No Imputing

In [108]:
# Categorical pipelines
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rfe', RFE(estimator=XGBClassifier(random_state=42), n_features_to_select=0.90, step=3)),
    ('xgb_model', XGBClassifier(random_state=42))
])



In [109]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.9370522  0.93770065 0.93669726 0.9371378  0.93747928]
Mean cross-validation score: 0.937213437093182


In [110]:
pipe.fit(X_train, y_train)

In [111]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9435952066632689)

In [112]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980967
Balanced_Accuracy  0.754818
Precision          0.919085
Recall             0.511301
f1                 0.657066
ROC_AUC            0.943595
Brier_Loss         0.015927
Log_Loss           0.067212
Jaccard            0.489277
[[142133    237]
 [  2573   2692]]


#### RFE for 75% features, Best Categorical Encoding, and No Imputing

In [113]:
# Categorical pipelines
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rfe', RFE(estimator=XGBClassifier(random_state=42), n_features_to_select=0.75, step=3)),
    ('xgb_model', XGBClassifier(random_state=42))
])

In [114]:
# Evaluate with cross-validation
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.9370522  0.93770065 0.93669726 0.9371378  0.93747928]
Mean cross-validation score: 0.937213437093182


In [115]:
pipe.fit(X_train, y_train)

In [116]:
y_preds_proba = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9435952066632689)

In [117]:
y_preds = pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.980967
Balanced_Accuracy  0.754818
Precision          0.919085
Recall             0.511301
f1                 0.657066
ROC_AUC            0.943595
Brier_Loss         0.015927
Log_Loss           0.067212
Jaccard            0.489277
[[142133    237]
 [  2573   2692]]


### Hyperparameter Tuning
Once you have a model that is performing decently well, you'll want to adjust the hyperparameters to improve performance.

#### RandomizedSearchCV on core hyperparameters w/ pipeline using just best categorical encoding

In [15]:
categorical_columns_ohe = [c for c in categorical_columns if X[c].nunique() <= 5]
categorical_columns_te = [c for c in categorical_columns if X[c].nunique() >= 5]

In [16]:
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])

In [28]:
core_param_grid = {
    'xgb_model__n_estimators' : [50, 100, 200, 500],
    'xgb_model__max_depth' : [3, 6, 9, 12, 15],
    'xgb_model__learning_rate' : [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]
}

random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=core_param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    random_state=42
)

In [29]:
random_search.fit(X_train, y_train)
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

Best parameters: {'xgb_model__n_estimators': 500, 'xgb_model__max_depth': 12, 'xgb_model__learning_rate': 0.1}
Best cross-validation score: 0.9659710394715614


In [30]:
# Evaluate with cross-validation
best_rs_xgb = random_search.best_estimator_
scores = cross_val_score(best_rs_xgb, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.96480752 0.96586172 0.96801302 0.964342   0.96683093]
Mean cross-validation score: 0.9659710394715614


In [31]:
best_rs_xgb.fit(X_train, y_train)

In [32]:
y_preds_proba = best_rs_xgb.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.971358300846723)

In [33]:
y_preds = best_rs_xgb.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.986555
Balanced_Accuracy  0.824295
Precision          0.960674
Recall             0.649573
f1                 0.775071
ROC_AUC            0.971358
Brier_Loss         0.011142
Log_Loss           0.047886
Jaccard            0.632747
[[142230    140]
 [  1845   3420]]


#### GridSearchCV on core hyperparameters w/ pipeline using just best categorical encoding

In [34]:
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(random_state=42))
])

In [36]:
core_param_grid = {
    'xgb_model__n_estimators' : [50, 100, 200, 500],
    'xgb_model__max_depth' : [3, 6, 9, 12, 15],
    'xgb_model__learning_rate' : [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]
}

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=core_param_grid,
    scoring='roc_auc',
    cv=5
)

In [37]:
grid_search.fit(X_train, y_train)
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

Best parameters: {'xgb_model__n_estimators': 500, 'xgb_model__max_depth': 12, 'xgb_model__learning_rate': 0.1}
Best cross-validation score: 0.9659710394715614


In [38]:
# Evaluate with cross-validation
best_gs_xgb = random_search.best_estimator_
scores = cross_val_score(best_gs_xgb, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.96480752 0.96586172 0.96801302 0.964342   0.96683093]
Mean cross-validation score: 0.9659710394715614


In [39]:
best_gs_xgb.fit(X_train, y_train)

In [40]:
y_preds_proba = best_gs_xgb.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.971358300846723)

In [41]:
y_preds = best_gs_xgb.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.986555
Balanced_Accuracy  0.824295
Precision          0.960674
Recall             0.649573
f1                 0.775071
ROC_AUC            0.971358
Brier_Loss         0.011142
Log_Loss           0.047886
Jaccard            0.632747
[[142230    140]
 [  1845   3420]]


#### RandomizedSearchCV on extended hyperparameters w/ pipeline using just best categorical encoding

In [46]:
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(n_estimators=500, max_depth=12, learning_rate=0.1, random_state=42))
])

In [47]:
from scipy.stats import uniform, loguniform

ext_param_dist = {
    'xgb_model__colsample_bytree' : uniform(0.5, 0.5),
    'xgb_model__min_child_weight' : loguniform(0.01, 10),
    'xgb_model__gamma' : loguniform(1e-4, 10),
    'xgb_model__reg_alpha' : loguniform(1e-4, 10)
}

ext_random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=ext_param_dist,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    random_state=42
)

In [48]:
ext_random_search.fit(X_train, y_train)
print("Best parameters:", ext_random_search.best_params_)
print("Best cross-validation score:", ext_random_search.best_score_)

Best parameters: {'xgb_model__colsample_bytree': np.float64(0.5780093202212182), 'xgb_model__gamma': np.float64(0.000602521573620386), 'xgb_model__min_child_weight': np.float64(0.014936568554617643), 'xgb_model__reg_alpha': np.float64(2.1423021757741068)}
Best cross-validation score: 0.9667465305179904


In [49]:
# Evaluate with cross-validation
ext_rs_xgb = ext_random_search.best_estimator_
ext_scores = cross_val_score(ext_rs_xgb, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", ext_scores)
print("Mean cross-validation score:", np.mean(ext_scores))

Cross-validation scores: [0.96562065 0.96749051 0.96803075 0.96468926 0.96790149]
Mean cross-validation score: 0.9667465305179904


In [50]:
ext_rs_xgb.fit(X_train, y_train)

In [51]:
y_preds_proba = ext_rs_xgb.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9723393894738513)

In [52]:
y_preds = ext_rs_xgb.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.986473
Balanced_Accuracy  0.823521
Precision          0.959505
Recall             0.648053
f1                 0.773608
ROC_AUC            0.972339
Brier_Loss         0.011193
Log_Loss           0.048456
Jaccard            0.630801
[[142226    144]
 [  1853   3412]]


### Additional Tuning, Processing, or Model-Improvement
What else can you do to improve your model from the baseline?

#### Best Core Hyperparameters w/ 1000 n_estimators and best categorical encoding

In [53]:
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
core_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(n_estimators=1000, 
                                max_depth=12, 
                                learning_rate=0.1, 
                                random_state=42))
])

In [54]:
# Evaluate with cross-validation
scores = cross_val_score(core_pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.96565615 0.96633245 0.96898786 0.96542161 0.96824729]
Mean cross-validation score: 0.9669290714953604


In [55]:
core_pipe.fit(X_train, y_train)

In [56]:
y_preds_proba = core_pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.9717537153068982)

In [57]:
y_preds = core_pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.987496
Balanced_Accuracy  0.836763
Precision          0.964160
Recall             0.674454
f1                 0.793697
ROC_AUC            0.971754
Brier_Loss         0.010617
Log_Loss           0.049863
Jaccard            0.657958
[[142238    132]
 [  1714   3551]]


#### Best Extended Hyperparameters w/ 1000 n_estimators and best categorical encoding

In [58]:
cat_pipe_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

cat_pipe_te = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder())
])

# Numeric Pipeline
num_pipe = Pipeline([
    ('selector', ColumnSelector(col_names=numeric_columns))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat_ohe', cat_pipe_ohe, categorical_columns_ohe),
    ('cat_te', cat_pipe_te, categorical_columns_te),
    ('num', num_pipe, numeric_columns)
])

# Fit pipeline with transformers and an estimator to the training data
ext_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_model', XGBClassifier(n_estimators=1000, 
                                max_depth=12, 
                                learning_rate=0.1, 
                                colsample_bytree=0.5780093202212182,
                                gamma=0.000602521573620386,
                                min_child_weight=0.014936568554617643,
                                reg_alpha=2.1423021757741068,
                                random_state=42))
])

In [59]:
# Evaluate with cross-validation
scores = cross_val_score(ext_pipe, X_train, y_train, cv=5, scoring='roc_auc')

print("Cross-validation scores:", scores)
print("Mean cross-validation score:", np.mean(scores))

Cross-validation scores: [0.96583462 0.96710581 0.96770204 0.96430539 0.96734058]
Mean cross-validation score: 0.9664576893203758


In [60]:
ext_pipe.fit(X_train, y_train)

In [61]:
y_preds_proba = ext_pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_preds_proba)

np.float64(0.972020773553868)

In [62]:
y_preds = ext_pipe.predict(X_test)
test_score_classification(y_test, y_preds, y_preds_proba)

                       Test
Accuracy           0.986934
Balanced_Accuracy  0.829887
Precision          0.960519
Recall             0.660779
f1                 0.782941
ROC_AUC            0.972021
Brier_Loss         0.010952
Log_Loss           0.049259
Jaccard            0.643306
[[142227    143]
 [  1786   3479]]


## Outcome
At the end of this notebook, you should have a model that is performing better than the baseline model. You should be able to explain what steps you took to train this model and why each one was chosen.