In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
pd.options.display.max_columns = None

X_train = pd.read_csv("train_values.csv")
y_train = pd.read_csv("train_labels.csv")

In [2]:
replace_dict = {
'msa_md': -1,
'state_code': -1,
'county_code': -1,
'occupancy': 3,
'preapproval': 3,
'applicant_ethnicity': [3, 4, 5],
"applicant_race": [6, 7, 8],
"applicant_sex": [3, 4, 5]
}
    
to_log = ["loan_amount", "applicant_income", "number_of_owner-occupied_units",
          "number_of_1_to_4_family_units", "minority_population"]

to_drop = ["row_id", "number_of_1_to_4_family_units",
           "occupancy", "county_code", "preapproval"]

num_cols = ["loan_amount", "applicant_income", "population", "minority_population_pct",
            "ffiecmedian_family_income", "tract_to_msa_md_income_pct",
            "number_of_owner-occupied_units"]

cat_cols_few = ["loan_type", "property_type", "loan_purpose",
            "applicant_ethnicity", "applicant_race",
            "applicant_sex", "co_applicant"]

def prepare_data(df):
    
    df["co_applicant"] = df["co_applicant"].astype("int8")
    
    df.replace(replace_dict, np.nan, inplace = True)
    
    for col in num_cols:
        df[col].fillna(df[col].median(), inplace=True)
        
    for col in cat_cols_few:
        df[col].fillna(df[col].mode()[0], inplace=True)
          
    df["minority_population"] = (df["minority_population_pct"] / 100) * (df["population"])
    df["tract_family_income"] = (df["tract_to_msa_md_income_pct"] / 100) * (df["ffiecmedian_family_income"])

    df[to_log] = df[to_log].applymap(math.log)
    
    to_drop.extend(["minority_population_pct", "population",
                    "ffiecmedian_family_income", "tract_to_msa_md_income_pct"])
    df.drop(to_drop, axis=1, inplace=True)
    
    df = pd.get_dummies(df, columns = cat_cols_few)
    
    return df

In [3]:
X_train = prepare_data(X_train)

ce_target = ce.TargetEncoder(cols = ["lender", "msa_md", "state_code"], smoothing = 5, return_df = True)
X_train = ce_target.fit_transform(X_train, y_train["accepted"])

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   loan_amount                     500000 non-null  float64
 1   msa_md                          500000 non-null  float64
 2   state_code                      500000 non-null  float64
 3   applicant_income                500000 non-null  float64
 4   number_of_owner-occupied_units  500000 non-null  float64
 5   lender                          500000 non-null  float64
 6   minority_population             500000 non-null  float64
 7   tract_family_income             500000 non-null  float64
 8   loan_type_1                     500000 non-null  uint8  
 9   loan_type_2                     500000 non-null  uint8  
 10  loan_type_3                     500000 non-null  uint8  
 11  loan_type_4                     500000 non-null  uint8  
 12  property_type_1 

In [4]:
X = X_train.values
y = y_train["accepted"].values

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [5]:
params = {
        'max_depth': [5, 6, 7, 8],
        'n_estimators': [200, 300, 400, 500, 600],
        'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4],
        'subsample': [0.6, 0.7, 0.8, 1],
        'colsample_bytree': [0.6, 0.8, 1],
        'min_child_weight': [1, 4, 5, 6, 8],
        'learning_rate': [0.01, 0.02, 0.1]
    
        }

model = XGBClassifier(objective = 'binary:logistic', silent=True, n_jobs=-1)

In [9]:
folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True)
cv = skf.split(X, y)

random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=param_comb, return_train_score=True,
                                   scoring='accuracy', n_jobs=4, cv=cv, verbose=3)

random_search.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 31.0min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x000002688E1035C8>,
                   error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=-1, nthread=None,
                                           objective='binary...
                   iid='deprecated', n_iter=5, n_jobs=4,
                   param_distributions={'colsample_bytree': [0.6, 0.8, 1],
                                        'learning_rate': [0.01, 0.02, 0.1],
                                    

In [14]:
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ )
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.sort_values("rank_test_score")


 Best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=5, missing=None, n_estimators=500, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0.1, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=0.7, verbosity=1)

 Best score for 5-fold search with 5 parameter combinations:
0.7261019999999999

 Best hyperparameters:
{'subsample': 0.7, 'reg_alpha': 0.1, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_reg_alpha,param_n_estimators,param_min_child_weight,param_max_depth,param_learning_rate,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
2,286.502708,2.516713,1.3323,0.088961,0.7,0.1,500,5,7,0.1,0.8,"{'subsample': 0.7, 'reg_alpha': 0.1, 'n_estima...",0.72759,0.72337,0.72748,0.72748,0.72459,0.726102,0.001775,1,0.754683,0.754715,0.754472,0.754495,0.755755,0.754824,0.000475
0,346.089515,6.044303,1.787042,0.133804,0.8,0.1,600,4,7,0.02,0.8,"{'subsample': 0.8, 'reg_alpha': 0.1, 'n_estima...",0.72765,0.7233,0.72713,0.7269,0.72436,0.725868,0.001715,2,0.733393,0.734565,0.733888,0.734003,0.734542,0.734078,0.000439
3,318.330126,7.843158,1.36251,0.172356,0.7,0.0,400,8,8,0.02,1.0,"{'subsample': 0.7, 'reg_alpha': 0, 'n_estimato...",0.72789,0.7236,0.72687,0.7264,0.72374,0.7257,0.001727,3,0.733785,0.73506,0.734375,0.734348,0.734955,0.734505,0.000463
4,210.022114,38.634568,0.767775,0.106037,0.7,0.1,300,6,8,0.02,1.0,"{'subsample': 0.7, 'reg_alpha': 0.1, 'n_estima...",0.72647,0.72205,0.72544,0.72537,0.72302,0.72447,0.001656,4,0.73155,0.732525,0.73122,0.73154,0.732005,0.731768,0.000454
1,245.384254,4.176276,0.849997,0.059375,0.8,0.3,500,6,5,0.01,1.0,"{'subsample': 0.8, 'reg_alpha': 0.3, 'n_estima...",0.72,0.71531,0.71903,0.71906,0.71541,0.717762,0.001992,5,0.718845,0.720178,0.719135,0.719065,0.719993,0.719443,0.000536
