**Hist_Gradient_Boost model functions**

# Imports

In [23]:
# packages
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score, plot_roc_curve

# import functions
from hcdr.data.data import Data
from hcdr.modeling.preproc import preproc_pipeline
from hcdr.data.merged_data import merge_dfs

# import model
from sklearn.experimental import enable_hist_gradient_boosting   
from sklearn.ensemble import HistGradientBoostingClassifier

# Data

## Train and test data

In [36]:
def get_train_test_data():
    """Load, clean and preprocess the train and test application tables.

    The preprocessing pipeline is fitted on the training features only and
    then applied unchanged to the test features.

    Returns
    -------
    tuple
        ``(X_transformed, y, df_test, X_test_transformed)`` where

        * ``X_transformed`` is the preprocessed training feature matrix,
        * ``y`` is the ``TARGET`` column of the merged training frame,
        * ``df_test`` is the merged, inf-cleaned test frame (it still
          contains ``SK_ID_CURR``),
        * ``X_test_transformed`` is the preprocessed test feature matrix
          (features, not labels).
    """

    def _load_merged(app_table):
        # Merge the application table with the auxiliary tables and turn
        # +/- infinity into NaN.
        merged = merge_dfs(df_app=app_table, verbose=True)
        return merged.replace(to_replace=[np.inf, -np.inf], value=np.nan)

    # Build the preprocessor first so its log output precedes the merge logs.
    preproc = preproc_pipeline(scaler_type=None)

    # --- training set -----------------------------------------------------
    train_frame = _load_merged("application_train")
    train_features = train_frame.drop(columns=["SK_ID_CURR", "TARGET"])
    X_transformed = preproc.fit_transform(train_features)
    y = train_frame["TARGET"]

    # --- test set ---------------------------------------------------------
    df_test = _load_merged("application_test")
    test_features = df_test.drop(columns=["SK_ID_CURR"])
    # transform only: the pipeline was fitted on the training features above
    X_test_transformed = preproc.transform(test_features)

    return X_transformed, y, df_test, X_test_transformed

# HGB model 

In [40]:
def hgb_model():
    """Train a HistGradientBoostingClassifier and write a submission file.

    Steps:

    1. Fetch the preprocessed train/test data from ``get_train_test_data``.
    2. Fit the classifier on the training data.
    3. Pickle the fitted model to ``finalized_hgb_model.sav``.
    4. Predict the positive-class probability for every test row.
    5. Write ``SK_ID_CURR`` / ``TARGET`` to ``hgb_hcdr.csv``.

    Returns
    -------
    pandas.DataFrame
        The first rows (``head()``) of the submission frame.
    """
    # get_train_test_data returns
    # (train features, train target, merged test frame, test features)
    X_transformed, y, df_test, X_test_transformed = get_train_test_data()

    # instantiate and fit the model; verbose=1 prints per-iteration progress
    hgb = HistGradientBoostingClassifier(
        max_depth=15,
        max_iter=20_000,
        scoring='roc_auc',
        verbose=1,
    )
    model_hgb = hgb.fit(X_transformed, y)

    # save final model to pickle file; the context manager guarantees the
    # handle is flushed and closed even if pickling raises
    filename = 'finalized_hgb_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(model_hgb, model_file)

    # probability of the positive class (column 1) for each test row
    hgb_pred = model_hgb.predict_proba(X_test_transformed)[:, 1]

    # create submission file; take an explicit copy so that adding the TARGET
    # column does not trigger pandas' SettingWithCopyWarning
    submit = df_test[['SK_ID_CURR']].copy()
    submit['TARGET'] = hgb_pred

    # save submission file
    submit.to_csv('hgb_hcdr.csv', index=False)

    return submit.head()

In [41]:
%%time
# Train the model, write the pickle and the submission CSV, and display the
# first rows of the submission; %%time reports how long the whole run takes.
hgb_model()

Running preprocessor...
scaler_type=None
Aggregating non-application dataframes...
application_train:       122 --> 72. Dropped 50 columns
application_test:       121 --> 71. Dropped 50 columns
previous_application:       37 --> 26. Dropped 11 columns
Loading credit card balance data table...
23 --> 23. Dropped 0 columns
optimized size by 26.0 % | 0.0103558 GB
Loading 'bureau' AND 'bureau_balance' data tables...
Running get_bureau_final function...
Running get_bureau_balance_final function...
optimized size by 65.99999999999999 % | 0.156472346 GB
Loading isntallments payments data table...
8 --> 8. Dropped 0 columns
optimized size by 19.999999999999996 % | 0.010866784 GB
Loading POS_CASH_balance data table...
8 --> 8. Dropped 0 columns
optimized size by 32.99999999999999 % | 0.010792064 GB
application_train:       72 --> 72. Dropped 0 columns
application_test:       71 --> 71. Dropped 0 columns
previous_application:       26 --> 26. Dropped 0 columns
credit_card_balance:       23 --> 1

[52/20000] 1 tree, 31 leaves, max depth = 10, train score: 0.78506, val score: 0.76368, in 0.456s
[53/20000] 1 tree, 31 leaves, max depth = 12, train score: 0.78601, val score: 0.76379, in 0.416s
[54/20000] 1 tree, 31 leaves, max depth = 11, train score: 0.78639, val score: 0.76373, in 0.416s
[55/20000] 1 tree, 31 leaves, max depth = 14, train score: 0.78683, val score: 0.76402, in 0.396s
[56/20000] 1 tree, 31 leaves, max depth = 14, train score: 0.78739, val score: 0.76409, in 0.372s
[57/20000] 1 tree, 31 leaves, max depth = 14, train score: 0.78824, val score: 0.76469, in 0.404s
[58/20000] 1 tree, 31 leaves, max depth = 12, train score: 0.78908, val score: 0.76492, in 0.408s
[59/20000] 1 tree, 31 leaves, max depth = 11, train score: 0.78939, val score: 0.76504, in 0.435s
[60/20000] 1 tree, 31 leaves, max depth = 13, train score: 0.79032, val score: 0.76529, in 0.456s
[61/20000] 1 tree, 31 leaves, max depth = 12, train score: 0.79051, val score: 0.76533, in 0.452s
[62/20000] 1 tree, 3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit['TARGET'] = hgb_pred


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.056121
1,100005,0.276784
2,100013,0.036606
3,100028,0.055429
4,100038,0.167401
