# AMEX Default Prediction

## Set up

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import pyarrow
from sklearn import preprocessing

## Preprocessing - Supervised

In [2]:
from sklearn.model_selection import train_test_split


In [3]:
# load parquet data
data = pd.read_parquet('dataset/train.parquet')


In [4]:
raw_labels = pd.read_csv('dataset/train_labels.csv')

In [5]:
# use a smaller subset for fast pipeline during testing
sample = raw_labels['customer_ID'][:10000]
data = data[data['customer_ID'].isin(sample)]

In [6]:


# replace -1's with NA's (to make it simpler to replace nas)
data.replace(-1, np.nan)

fill_values = {}
pbar = tqdm(range(2,len(data.columns)))

# get means for missing values
for i in pbar:
    # add to dictionary of means
    fill_values[data.columns[i]] = 0 #np.mean(data[data.columns[i]])

# fill missing values
data.fillna(value = fill_values, inplace= True)


# normalize with mean and std scaling


100%|██████████| 188/188 [00:00<00:00, 179578.49it/s]


In [7]:
# aggregate across customer_ids
data_agg = data.drop('S_2', axis = 1).groupby('customer_ID', as_index = False).sum()

# normalize again and split into X and y
X = preprocessing.scale(data_agg.iloc[:,1:])
y = data_agg[['customer_ID']].merge(raw_labels)['target']

In [8]:
# train test split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)


## Supervised method

In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

### Evaluation Methods
We want to recreate the evaluation metric from: https://www.kaggle.com/competitions/amex-default-prediction/overview/evaluation

In [10]:
# recreates evaluation metric from: 
def evaluate_model(y_true, y_score):
    AUC = roc_auc_score(y_true, y_score[:,1])
    print(f"AUC Score: {AUC}")

    # GINI from AUC formula: https://yassineelkhal.medium.com/confusion-matrix-auc-and-roc-curve-and-gini-clearly-explained-221788618eb2
    Gini = (AUC*2)-1
    print(f"GINI Score: {Gini}")

    # combine arrays
    combined = np.stack([y_score[:,1], np.array(y_true)], axis = 1)
    #sort in descending order
    combined  = combined[combined[:,0].argsort(),:][::-1]
    #get top 4% ratio of positive predictions
    n = int(len(combined)*0.04)
    D = np.sum(combined[:n,1])/n
    print(f"Default rate at 4%: {D}")
    
    M = 0.5*(Gini+D)
    print(f"M: {M}")

    return M


# TODO: Plot Confusion Matrix

# TODO: Plot GINI (OR AUC) curve

### Model fitting

In [11]:
xgb = GradientBoostingClassifier()
xgb.fit(X_train, y_train)
pred = xgb.predict_proba(X_test)

In [12]:
evaluate_model(y_test, pred)

AUC Score: 0.9467307822570981
GINI Score: 0.8934615645141961
Default rate at 4%: 0.9625
M: 0.927980782257098


0.927980782257098