### Modeling Application Data

Step 3.1: Using our optimized model, take a look at each of our preprocessing steps independently to identify any impacts on the prediction score.

### Imports

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import visuals as vs

from IPython.display import display # Allows the use of display() for DataFrames
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split  # use model_selection to suppress deprecation warning

from sklearn.metrics import roc_auc_score, roc_curve, auc

# Pretty display for notebooks
%matplotlib inline

# Show all of the data in a dataframe
pd.set_option('display.max_columns', None)

In [None]:
def get_prediction_performance(dataFile): 
    
    # Read the datafile
    train = pd.read_csv(dataFile, index_col=0)

    # Drop y and the shared key for application data
    target = application_train['TARGET']
    train_no_target = train.drop('TARGET', axis = 1)
    train_noID = application_train_no_target.drop('SK_ID_CURR', axis=1)

    # Make a test and train set
    X_train, X_test, y_train, y_test = train_test_split(train_noID, target, test_size=0.2, random_state=42)

    dtc = DecisionTreeClassifier(criterion='entropy', max_depth=1, max_features='auto', random_state=42)
    clf = AdaBoostClassifier(base_estimator=dtc, algorithm='SAMME.R', n_estimators = 500, random_state=42)
    clf.fit(X_train, y_train) 
    
    predictions = clf.predict(X_test)
    score = roc_auc_score(y_test, predictions)
    
    return score    

# Minimal Preprocessing:  

- Data has outliers
- No scaling has been performed
- No Log Transformations on Data

In [28]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_outliers_no_scaling_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5112736305620937


# Imputation (with Mean):  

- Data has outliers
- NaNs in numeric fields are replaced with imputed mean values instead of 0
- No scaling has been performed
- No Log Transformations on Data

In [51]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_no_scaling_with_outliers_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.511243328911791


# Imputation with Median:  

- Data has outliers
- NaNs in numeric fields are replaced with imputed mean values instead of 0
- No scaling has been performed
- No Log Transformations on Data
- Imputer(strategy="median")

In [63]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_median_no_scaling_with_outliers_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5109440157056088


# Imputation with Most Frequent:  

- Data has outliers
- NaNs in numeric fields are replaced with imputed mean values instead of 0
- No scaling has been performed
- No Log Transformations on Data
- Imputer(strategy="most_frequent")

In [64]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_most_frequentno_scaling_with_outliers_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5119669402528273


# Outliers Removed

- Outliers are removed using Tukey's method (1.5x IQR)
- No scaling has been performed
- No Log Transformations on Data

In [52]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_no_scaling_outliers_replaced_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5107293352481052


# Outliers Removed with Imputation (Mean)

- Outliers are removed using Tukey's method (1.5x IQR)
- No scaling has been performed
- No Log Transformations on Data

In [53]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_no_scaling_outliers_replaced_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.511897495844608


# Robust Scaler with Outliers

- Outliers are included
- Scaled using RobustScaler
- No Log Transformations on Data

In [54]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_robust_scaler_with_outliers_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5115539771003558


# Standard Scaler with Outliers

- Outliers are included
- Scaled using StandardScaler
- No Log Transformations on Data

In [55]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_standard_scaler_with_outliers_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5109970623512925


# MinMax Scaler with Outliers

- Outliers are included
- Scaled using MinMaxScaler
- No Log Transformations on Data

In [57]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_minmax_scaler_with_outliers_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5102847859926386


# Robust Scaler with Outliers Replaced (Tukey's Method, All Numeric Fields)

- Outliers are removed
- Scaled using RobustScaler
- No Log Transformations on Data

In [38]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_robust_scaler_outliers_removed_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5116032454185788


# Standard Scaler with Outliers Replaced (Tukey's Method, All Numeric Fields)

- Outliers are removed
- Scaled using StandardtScaler
- No Log Transformations on Data

In [39]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_standard_scaler_outliers_removed_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5129381026070544


# MinMax Scaler with Outliers Replaced (Tukey's Method, All Numeric Fields)

- Outliers are removed
- Scaled using MinMaxScaler
- No Log Transformations on Data

In [60]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_minmax_scaler_outliers_removed_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5121386996249535


# Robust Scaler, Imputed NaNs (Mean), Outliers Replaced

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using RobustScaler
- No Log Transformations on Data

In [86]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_mean_robust_scaler_outliers_replaced_log_transform_non_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5102544843423358


# Robust Scaler, Imputed NaNs (Most Frequent), Outliers Replaced

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Imputer(strategy="most_frequent")
- Outliers are removed
- Scaled using RobustScaler
- No Log Transformations on Data

In [58]:
score = get_prediction_performance(' data/tmp/application_train_preprocessed_imputation_robust_scaler_outliers_replaced_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5119808441405944


# Standard Scaler, Imputed NaNs, Outliers Replaced

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using StandardScaler
- No Log Transformations on Data

In [62]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_standard_scaler_outliers_replaced_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5105133703379091


# MinMax Scaler, Imputed NaNs, Outliers Replaced

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using MinMaxScaler
- No Log Transformations on Data

In [61]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_minmax_scaler_outliers_replaced_no_log.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5097228084634221


# Log Transformations (Non-Monetary) - Minimal Preprocessing

- NaNs in numeric fields are replaced with np.nan_to_num()
- Outliers are included
- No Scaling
- Log Transformations on Non-Monetary Datapoints

In [70]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_no_scaling_with_outliers_log_transform_non_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5115628182079698


# Log Transformations (Non-Monetary) - Imputed NaNs

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are included
- No Scaling
- Log Transformations on Non-Monetary Datapoints

In [68]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_most_frequentno_scaling_outliers_replaced_log_transform_non_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5112294250240239


# Log Transformations (Non-Monetary) - Imputed NaNs, Outliers Replaced

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are replaced
- No Scaling
- Log Transformations on Non-Monetary Datapoints

In [74]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_most_frequentno_scaling_outliers_replaced_log_transform_non_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5105184331180624


# Robust Scaler, Log (Non-Monetary), Imputed NaNs (Most Frequent), Outliers Removed

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using RobustScaler
- Log Transformations on non-monetary data

In [67]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_most_frequentrobust_scaler_outliers_replaced_log_transform_non_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5118318297638493


# Standard Scaler, Log (Non-Monetary), Imputed NaNs (Most Frequent), Outliers Removed

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using StandardScaler
- Log Transformations on non-monetary data

In [71]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_most_frequentstandard_scaler_outliers_replaced_log_transform_non_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5119151780598359


# MinMax Scaler, Log (Non-Monetary), Imputed NaNs (Most Frequent), Outliers Removed

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using MinMaxScaler
- Log Transformations on non-monetary data

In [84]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_most_frequentminmax_scaler_outliers_replaced_log_transform_non_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5103075309880196


# Standard Scaler, Log (Non-Monetary), Imputed NaNs (Mean), Outliers Removed

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using StandardScaler
- Log Transformations on non-monetary data

In [75]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_standard_scaler_outliers_replaced_log_transform_non_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5114314860464527


# Robust Scaler, Log (Monetary), Imputed NaNs (Most Frequent), Outliers Removed

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using RobustScaler
- Log Transformations on monetary data

In [80]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_most_frequentrobust_scaler_outliers_replaced_log_transform_with_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5105840991988209


# Standard Scaler, Log (Monetary), Imputed NaNs (Most Frequent), Outliers Removed

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using StandardScaler
- Log Transformations on monetary data

In [76]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_standard_scaler_outliers_replaced_log_transform_with_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.511559039880509


# MinMax Scaler, Log (Monetary), Imputed NaNs (Most Frequent), Outliers Removed

- NaNs in numeric fields are replaced through Imputer.fit_transform instead of np.nan_to_num()
- Outliers are removed
- Scaled using StandardScaler
- Log Transformations on monetary data

In [82]:
score = get_prediction_performance('data/tmp/application_train_preprocessed_imputation_most_frequentminmax_scaler_outliers_replaced_log_transform_with_monetary_.csv')
print("Prediction Score (ROC AUC): {0}".format(score))

Prediction Score (ROC AUC): 0.5118217042035431
