In [1]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
import pickle

In [2]:
X = pd.read_pickle('/kaggle/input/amex-imputed-aggregate-data/X_train_agg.pkl', compression='gzip') 
y = pd.read_pickle('/kaggle/input/amex-imputed-aggregate-data/y_train_agg.pkl', compression='gzip')             

In [3]:
#kaggle competition metric

def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [4]:
xgb_classifier = xgb.XGBClassifier() #create shell XGB Classifier model with default parameters

In [5]:
xgb_classifier = xgb_classifier.fit(X,y) #fit model with aggregate training data

In [6]:
predict = xgb_classifier.predict_proba(X) #predict probably of credit default

In [7]:
pd.DataFrame(predict).head(5) #convert predictions to dataframe and display first 5 rows

Unnamed: 0,0,1
0,0.999412,0.000588
1,0.999002,0.000998
2,0.998344,0.001656
3,0.990682,0.009318
4,0.997468,0.002532


In [8]:
y_predict =  predict[:, 1] #select predictions from column 0 to evaluate performance

In [9]:
acc = accuracy_score(y, y_predict.round()) #round predictions to calculate accuracy score
print(acc*100) #print accuracy score

92.27457056130464


In [10]:
y_predict_round = y_predict.round() #round predictions to prep data for model evaluation 

In [11]:
#print kaggle competition metric for model
print(f'M = {amex_metric(pd.DataFrame(y), pd.DataFrame(y_predict_round, index=y.index, columns=["prediction"]))}')

M = 0.6600017053943578


In [12]:
print(confusion_matrix(y, y_predict_round)) #print confusion matrix for predictions

[[322953  17132]
 [ 18321 100507]]


In [13]:
print(classification_report(y, y_predict_round)) #print classification report for predictions

              precision    recall  f1-score   support

           0       0.95      0.95      0.95    340085
           1       0.85      0.85      0.85    118828

    accuracy                           0.92    458913
   macro avg       0.90      0.90      0.90    458913
weighted avg       0.92      0.92      0.92    458913



In [14]:
pickle.dump(xgb_classifier, open('xgb_classifier_model_agg.sav', 'wb'))