In [None]:
pip install optuna

In [None]:
pip install xgboost

In [1]:
## Importing libraries
import optuna
import pandas as pd
import numpy as np
from Amex_Metric import amex_metric
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Reading the data
## Both csv files live in the same folder, so the folder is defined once 
## and can be changed in a single place
DATA_DIR = '/home/ec2-user/SageMaker/Analytics_Data_Science/American_Express/Evan'
train = pd.read_csv(DATA_DIR + '/amex_train_payment_spend_final.csv')
test = pd.read_csv(DATA_DIR + '/amex_test_payment_spend_final.csv')

In [2]:
## Previewing the first rows of the training data-frame
train.head()

Unnamed: 0,customer_ID,P_2_mean,P_2_median,P_2_correlation,P_3_mean,P_2_sum,S_25_mean,S_25_sum,S_25_std,S_25_mad,S_25_data_range,S_25_iqr,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.9336,0.9385,-0.438767,0.68,12.14,0.9746,12.67,0.002504,0.001802,0.00928,0.001465,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.9,0.905,-0.854416,0.567,11.695,0.9756,12.68,0.002622,0.001802,0.00879,0.001465,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.8784,0.885,-0.109422,0.618,11.42,0.974,12.664,0.002858,0.002329,0.009766,0.003418,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.599,0.598,0.953176,0.611,7.785,0.9746,12.67,0.002941,0.002403,0.00879,0.004395,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.8916,0.8794,-0.597527,0.5273,11.59,0.974,12.664,0.003314,0.002817,0.00879,0.005371,0


In [3]:
## Checking the (rows, columns) dimensions of the training data-frame
train.shape

(458913, 13)

In [4]:
## Previewing the first rows of the test data-frame
test.head()

Unnamed: 0,customer_ID,P_2_mean,P_2_median,P_2_sum,P_2_correlation,P_3_mean,S_25_mean,S_25_sum,S_25_std,S_25_mad,S_25_data_range,S_25_iqr
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.6016,0.597,5.414,-0.484413,0.5737,0.974,8.766,0.003436,0.003038,0.00879,0.005859
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.8623,0.861,11.21,-0.459726,0.553,0.974,12.664,0.003073,0.002554,0.0083,0.005371
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.749,0.7437,9.734,-0.398301,0.671,0.973,12.65,0.003055,0.002628,0.0083,0.005371
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.4746,0.474,6.17,0.511864,0.611,0.973,12.65,0.003233,0.002855,0.00879,0.005859
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.324,0.3162,4.215,-0.495118,0.635,0.5396,7.016,0.363472,0.3347,0.8164,0.712158


In [5]:
## Checking the (rows, columns) dimensions of the test data-frame
test.shape

(589833, 12)

## Modeling Processes:

#### Splitting the data

In [6]:
## Splitting the train data-frame into training (80%) and validation (20%)

## Defining the input and target variables
X_train = train.drop(columns = ['customer_ID', 'target'])
Y_train = train['target']

## The test csv lists its columns in a different order than the train csv
## (P_2_sum comes before P_2_correlation and P_3_mean), so the test inputs 
## are re-ordered to match the columns the models are fitted on
X_test = test.drop(columns = ['customer_ID'])[X_train.columns]

## Splitting the data (stratified on the target; the seed makes the split reproducible)
X_training, X_validation, Y_training, Y_validation = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train, random_state = 42)

#### Hyper-parameter tuning with Optuna

In [7]:
## XGBoost Classifier
def objective_amex_xgb(trial):
    """Optuna objective: fit an XGBClassifier and score it on the validation set.

    Parameters
    ----------
    trial : Optuna trial object
        Used to sample one set of hyper-parameters.

    Returns
    -------
    The value returned by amex_metric for the validation predictions.
    """
    
    ## Defining the XGB hyper-parameter search space
    ## (step is passed by keyword, as in the suggest_float calls)
    XGB_param_grid = {'n_estimators': trial.suggest_int('n_estimators', 100, 500, step = 100),
                     'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.951, step = 0.05),
                     'min_split_loss': trial.suggest_int('min_split_loss', 0, 5, step = 1),
                     'max_depth' : trial.suggest_int('max_depth', 3, 7, step = 1),
                     'min_child_weight' : trial.suggest_int('min_child_weight', 5, 9, step = 1),
                     'subsample' : trial.suggest_float('subsample', 0.6, 1, step = 0.1),
                     'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.6, 1, step = 0.1)}
    
    ## Building the XGBClassifier model (fixed seed so that trials are comparable across runs)
    model = XGBClassifier(**XGB_param_grid, n_jobs = -1, random_state = 42).fit(X_training, Y_training)
        
    ## Predicting on the validation data-frame (probability of the positive class)
    xgb_val_preds = model.predict_proba(X_validation)[:, 1]
    
    ## Evaluating model performance on the validation set
    ## NOTE(review): the recorded trials score around 1e-4; confirm that amex_metric 
    ## handles a pandas Series together with a NumPy array as intended
    amex_score = amex_metric(Y_validation, xgb_val_preds)
    
    ## Returning the validation score of this trial
    return amex_score


## RandomForest Classifier
def objective_amex_rf(trial):
    """Optuna objective: fit a RandomForestClassifier and score it on the validation set.

    Parameters
    ----------
    trial : Optuna trial object
        Used to sample one set of hyper-parameters.

    Returns
    -------
    The value returned by amex_metric for the validation predictions.
    """
    
    ## Defining the Random Forest hyper-parameter search space
    ## (step is passed by keyword rather than by position)
    rf_param_grid = {'n_estimators': trial.suggest_int('n_estimators', 100, 500, step = 100), 
                     'max_depth' : trial.suggest_int('max_depth', 3, 7, step = 1),
                     'min_samples_split' : trial.suggest_int('min_samples_split', 5, 15, step = 1), 
                     'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 5, 15, step = 1)}
    
    ## Building the RandomForestClassifier model (fixed seed so that trials are comparable across runs)
    model = RandomForestClassifier(**rf_param_grid, n_jobs = -1, random_state = 42).fit(X_training, Y_training)
        
    ## Predicting on the validation data-frame (probability of the positive class)
    rf_val_preds = model.predict_proba(X_validation)[:, 1]
    
    ## Evaluating model performance on the validation set
    ## NOTE(review): the recorded trials score around 1e-4; confirm that amex_metric 
    ## handles a pandas Series together with a NumPy array as intended
    amex_score = amex_metric(Y_validation, rf_val_preds)
    
    ## Returning the validation score of this trial
    return amex_score

In [8]:
## Calling Optuna objective function for XGBoost
## (the sampler is seeded so that the same hyper-parameters are suggested on every run)
xgb_study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = 42))
xgb_study.optimize(objective_amex_xgb, n_trials = 5)

## Extracting the hyper-parameters of the best trial
xgb_best_params = xgb_study.best_trial.params

[32m[I 2022-07-24 19:16:38,185][0m A new study created in memory with name: no-name-d1bc5d7b-0ddf-46dd-92ff-2fd97598f7cc[0m
[32m[I 2022-07-24 19:17:20,932][0m Trial 0 finished with value: 9.124367584192536e-05 and parameters: {'n_estimators': 300, 'learning_rate': 0.101, 'min_split_loss': 2, 'max_depth': 5, 'min_child_weight': 7, 'subsample': 0.8, 'colsample_bytree': 0.6}. Best is trial 0 with value: 9.124367584192536e-05.[0m
[32m[I 2022-07-24 19:17:33,246][0m Trial 1 finished with value: 0.00027461626555567997 and parameters: {'n_estimators': 100, 'learning_rate': 0.30100000000000005, 'min_split_loss': 5, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.6, 'colsample_bytree': 0.9}. Best is trial 1 with value: 0.00027461626555567997.[0m
[32m[I 2022-07-24 19:18:01,368][0m Trial 2 finished with value: 0.0004714649529968494 and parameters: {'n_estimators': 300, 'learning_rate': 0.7010000000000001, 'min_split_loss': 4, 'max_depth': 3, 'min_child_weight': 6, 'subsample': 0.9

In [9]:
## Calling Optuna objective function for Random Forest
## (the sampler is seeded so that the same hyper-parameters are suggested on every run)
rf_study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = 42))
rf_study.optimize(objective_amex_rf, n_trials = 5)

## Extracting the hyper-parameters of the best trial
rf_best_params = rf_study.best_trial.params

[32m[I 2022-07-24 19:19:36,854][0m A new study created in memory with name: no-name-8efbd714-701c-4b80-b66b-7a5d1fe38088[0m
[32m[I 2022-07-24 19:19:50,920][0m Trial 0 finished with value: 0.00047763268668644596 and parameters: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.00047763268668644596.[0m
[32m[I 2022-07-24 19:20:47,148][0m Trial 1 finished with value: 0.0003671679426219543 and parameters: {'n_estimators': 400, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.00047763268668644596.[0m
[32m[I 2022-07-24 19:21:18,361][0m Trial 2 finished with value: 0.00039576080194453703 and parameters: {'n_estimators': 200, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.00047763268668644596.[0m
[32m[I 2022-07-24 19:22:11,143][0m Trial 3 finished with value: 0.00019163317026102758 and parameters: {'n_estimators': 300, 'max_

#### Modeling

In [None]:
## Building a Random Forest model with the optimal set of hyper-parameters
## (fixed seed so that the submission can be reproduced)
rf_md = RandomForestClassifier(**rf_best_params, n_jobs = -1, random_state = 42).fit(X_training, Y_training)

## Predicting on the test data-frame
## (columns are selected in the order used for fitting, because the test csv orders them differently)
X_test_preds = rf_md.predict_proba(X_test[X_training.columns])[:, 1]

## Creating the final output data-frame
data_out = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': X_test_preds})

## Exporting as a csv file for submission
data_out.to_csv('amex_rf_payment_spend_preds.csv', index = False)

In [None]:
## Building a XGBoost model with the optimal set of hyper-parameters
## (fixed seed so that the submission can be reproduced)
xgb_md = XGBClassifier(**xgb_best_params, n_jobs = -1, random_state = 42).fit(X_training, Y_training)

## Predicting on the test data-frame
## (columns are selected in the order used for fitting, because the test csv orders them differently)
X_test_preds = xgb_md.predict_proba(X_test[X_training.columns])[:, 1]

## Creating the final output data-frame
data_out = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': X_test_preds})

## Exporting as a csv file for submission
data_out.to_csv('amex_xgb_payment_spend_preds.csv', index = False)