## Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, f1_score, roc_curve, auc, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from xgboost import XGBRegressor
import optuna

## Set file paths for train and predict datasets

In [2]:
train_dataset = "Dataset/Train.csv"
predict_dataset = "Dataset/Test.csv"

## Exploratory Data Analysis

#### Get train dataset info

In [None]:
train_df = pd.read_csv(train_dataset)
train_df.head()

In [None]:
train_df.info(verbose=True)

#### Get predict dataset info

In [None]:
predict_df = pd.read_csv(predict_dataset)
predict_df.head()

In [None]:
predict_df.info(verbose=True)

#### Count plot of target variable

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x="IsUnderRisk", data=train_df).set_title('Count plot of "IsUnderRisk" data')
plt.grid()

#### Examine class imbalance

In [None]:
neg, pos = np.bincount(train_df['IsUnderRisk'])
total = neg + pos
print('Total: {}\n  Positive: {} ({:.2f}% of total)\n  Negative: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total, neg, 100 * neg / total))

#### Get correlation between different features

In [None]:
plt.figure(figsize=(10,6))
ax = sns.heatmap(train_df.corr(), annot=True, linewidth = 0.5, cmap='coolwarm')
plt.show()

#### Examine data distribution in different fields

In [None]:
fig, ax = plt.subplots()
train_df['Internal_Audit_Score'].hist(color='#A9C5D3', edgecolor='black', grid=True)
ax.set_title('Internal_Audit_Score Histogram', fontsize=12)
ax.set_xlabel('Internal_Audit_Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

In [None]:
fig, ax = plt.subplots()
train_df['External_Audit_Score'].hist(color='#A9C5D3', edgecolor='black', grid=True)
ax.set_title('External_Audit_Score Histogram', fontsize=12)
ax.set_xlabel('External_Audit_Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

In [None]:
x = np.log1p(train_df['Loss_score'])
sns.distplot(x)
plt.grid()

In [None]:
fig, ax = plt.subplots()
train_df['Fin_Score'].hist(color='#A9C5D3', edgecolor='black', grid=True)
ax.set_title('Fin_Score Histogram', fontsize=12)
ax.set_xlabel('Fin_Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

In [None]:
fig, ax = plt.subplots()
train_df['Loss_score'].hist(color='#A9C5D3', edgecolor='black', grid=True)
ax.set_title('Loss_score Histogram', fontsize=12)
ax.set_xlabel('Loss_score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

In [None]:
fig, ax = plt.subplots()
train_df['Past_Results'].hist(color='#A9C5D3', edgecolor='black', grid=True)
ax.set_title('Past_Results Histogram', fontsize=12)
ax.set_xlabel('Past_Results', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

In [None]:
fig, ax = plt.subplots()
np.log1p(train_df['Location_Score']).hist(color='#A9C5D3', edgecolor='black', grid=True)
ax.set_title('Location_Score Histogram', fontsize=12)
ax.set_xlabel('Location_Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

#### Trying out other data transformations

In [None]:
l, opt_lambda = boxcox(train_df["Location_Score"])
print('Optimal lambda value:', opt_lambda)

train_df['Location_boxcox_lambda_0'] = boxcox((1+train_df['Location_Score']), lmbda=0)
train_df['Location_boxcox_lambda_opt'] = boxcox(train_df['Location_Score'], lmbda=opt_lambda)

location_boxcox_mean = np.round(np.mean(train_df['Location_boxcox_lambda_opt']),2)
fig, ax = plt.subplots()
train_df['Location_boxcox_lambda_opt'].hist(bins=30, color='#A9C5D3', edgecolor='black', grid=True)
plt.axvline(location_boxcox_mean, color='r')
ax.set_title('Location_Score Histogram after Box–Cox Transform', fontsize=12)
ax.set_xlabel('Location_Score (Box–Cox transform)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

In [None]:
l, opt_lambda = boxcox(train_df["Internal_Audit_Score"])
print('Optimal lambda value:', opt_lambda)

train_df['IAS_boxcox_lambda_0'] = boxcox((1+train_df['Internal_Audit_Score']), lmbda=0)
train_df['IAS_boxcox_lambda_opt'] = boxcox(train_df['Internal_Audit_Score'], lmbda=opt_lambda)

IAS_boxcox_mean = np.round(np.mean(train_df['IAS_boxcox_lambda_opt']),2)
fig, ax = plt.subplots()
train_df['IAS_boxcox_lambda_opt'].hist(bins=15, color='#A9C5D3', edgecolor='black', grid=True)
plt.axvline(IAS_boxcox_mean, color='r')
ax.set_title('Internal_Audit_Score Histogram after Box–Cox Transform', fontsize=12)
ax.set_xlabel('Internal_Audit_Score (Box–Cox transform)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

In [None]:
l, opt_lambda = boxcox(train_df["External_Audit_Score"])
print('Optimal lambda value:', opt_lambda)

train_df['EAS_boxcox_lambda_0'] = boxcox((1+train_df['External_Audit_Score']), lmbda=0)
train_df['EAS_boxcox_lambda_opt'] = boxcox(train_df['External_Audit_Score'], lmbda=opt_lambda)

EAS_boxcox_mean = np.round(np.mean(train_df['EAS_boxcox_lambda_opt']),2)
fig, ax = plt.subplots()
train_df['EAS_boxcox_lambda_opt'].hist(bins=15, color='#A9C5D3', edgecolor='black', grid=True)
plt.axvline(EAS_boxcox_mean, color='r')
ax.set_title('External_Audit_Score Histogram after Box–Cox Transform', fontsize=12)
ax.set_xlabel('External_Audit_Score (Box–Cox transform)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

In [None]:
l, opt_lambda = boxcox(train_df["Fin_Score"])
print('Optimal lambda value:', opt_lambda)

train_df['Fin_boxcox_lambda_0'] = boxcox((1+train_df['Fin_Score']), lmbda=0)
train_df['Fin_boxcox_lambda_opt'] = boxcox(train_df['Fin_Score'], lmbda=opt_lambda)

Fin_boxcox_mean = np.round(np.mean(train_df['Fin_boxcox_lambda_opt']),2)
fig, ax = plt.subplots()
#train_df['Fin_boxcox_lambda_opt'].hist(bins=15, color='#A9C5D3', edgecolor='black', grid=True)
train_df['Fin_boxcox_lambda_0'].hist(bins=15, color='#A9C5D3', edgecolor='black', grid=True)
plt.axvline(Fin_boxcox_mean, color='r')
ax.set_title('Fin_Score Histogram after Box–Cox Transform', fontsize=12)
ax.set_xlabel('Fin_Score (Box–Cox transform)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

## Data Preprocessing

#### Read train and predict datasets

In [3]:
train_df = pd.read_csv(train_dataset)
predict_df = pd.read_csv(predict_dataset)
print("train_df: {}".format(train_df.shape))
print("predict_df: {}".format(predict_df.shape))

train_df: (543, 8)
predict_df: (233, 7)


#### Extract "IsUnderRisk" field from train_df into NumPy array

In [4]:
train_y = np.array([train_df['IsUnderRisk'].values]).T
train_df.drop(['IsUnderRisk'], inplace=True, axis=1)
print("train_y: {}".format(train_y.shape))

train_y: (543, 1)


#### Feature Engineering

In [5]:
# Combine train and predict dataframes
combined_df = train_df.append(predict_df, sort=False, ignore_index=True)
print(combined_df.shape)

(776, 7)


In [6]:
# Standard Deviation of all score values
combined_df['Location_Score_STD'] = np.std(combined_df['Location_Score'])
combined_df['Internal_Audit_Score_STD'] = np.std(combined_df['Internal_Audit_Score'])
combined_df['External_Audit_Score_STD'] = np.std(combined_df['External_Audit_Score'])
combined_df['Fin_Score_STD'] = np.std(combined_df['Fin_Score'])
combined_df['Loss_score_STD'] = np.std(combined_df['Loss_score'])
combined_df['Past_Results_STD'] = np.std(combined_df['Past_Results'])

# Binning
combined_df['Past_Results_bin'] = combined_df['Past_Results'].apply(lambda x: "[0-2]" if 0<=x<2 else "[2-4]" if 2<=x<4 else "[4-10]")
combined_df['Loss_Score_bin'] = combined_df['Loss_score'].apply(lambda x: "[0-5]" if 0<=x<5 else "[5-9]" if 5<=x<9 else "[9-15]")

# One-hot encoding
one_hot = pd.get_dummies(combined_df['City'])
combined_df.drop('City', axis = 1, inplace=True)
combined_df = combined_df.join(one_hot)

one_hot = pd.get_dummies(combined_df['Past_Results_bin'])
combined_df.drop('Past_Results_bin', axis = 1, inplace=True)
combined_df = combined_df.join(one_hot)

one_hot = pd.get_dummies(combined_df['Loss_Score_bin'])
combined_df.drop('Loss_Score_bin', axis = 1, inplace=True)
combined_df = combined_df.join(one_hot)

# Convert all scores to log scale
combined_df['Location_Score'] = np.log1p(combined_df['Location_Score'])
combined_df['Internal_Audit_Score'] = np.log1p(combined_df['Internal_Audit_Score'])
combined_df['External_Audit_Score'] = np.log1p(combined_df['External_Audit_Score'])
combined_df['Fin_Score'] = np.log1p(combined_df['Fin_Score'])
combined_df['Loss_score'] = np.log1p(combined_df['Loss_score'])
combined_df['Past_Results'] = np.log1p(combined_df['Past_Results'])

# Get difference and mean of different scores
combined_df["Audit_Score_Diff"] = np.power(combined_df["Internal_Audit_Score"] - combined_df["External_Audit_Score"], 2)
combined_df["Average_Audit_Score"] = (combined_df["Internal_Audit_Score"] + combined_df["External_Audit_Score"]) / 2
combined_df["Score_diff1"] = np.power(combined_df["Fin_Score"] - combined_df["Loss_score"], 2)
combined_df["Score_diff2"] = np.power(combined_df["Fin_Score"] - combined_df["Average_Audit_Score"], 2)
combined_df["Score_diff3"] = np.power(combined_df["Loss_score"] - combined_df["Average_Audit_Score"], 2)
combined_df["Cumulative_Score"] = (combined_df['Location_Score'] + combined_df["Internal_Audit_Score"] + combined_df["External_Audit_Score"] + combined_df["Fin_Score"] + combined_df["Loss_score"]) / 5

# Calculate different score ratios
combined_df["Fin_Score/Average_Audit_Score"] = combined_df["Fin_Score"] / combined_df["Average_Audit_Score"]
combined_df["Loss_score/Average_Audit_Score"] = combined_df["Loss_score"] / combined_df["Average_Audit_Score"]
combined_df["Loss_score/Fin_Score"] = combined_df["Loss_score"] / combined_df["Fin_Score"]
combined_df["Fin_Score/Location_Score"] = combined_df["Fin_Score"] / combined_df["Location_Score"]
combined_df["Loss_score/Location_Score"] = combined_df["Loss_score"] / combined_df["Location_Score"]
combined_df["Average_Audit_Score/Location_Score"] = combined_df["Average_Audit_Score"] / combined_df["Location_Score"]
combined_df["Past_Results/Location_Score"] = combined_df["Past_Results"] / combined_df["Location_Score"]

# Ratio of score value with corresponding max value
max_loc_score = max(combined_df['Location_Score'])
combined_df['Location_Score/max_loc_score'] = combined_df['Location_Score']/max_loc_score
max_ias_score = max(combined_df['Internal_Audit_Score'])
combined_df['Internal_Audit_Score/max_ias_score'] = combined_df['Internal_Audit_Score']/max_ias_score
max_eas_score = max(combined_df['External_Audit_Score'])
combined_df['External_Audit_Score/max_eas_score'] = combined_df['External_Audit_Score']/max_eas_score
max_fin_score = max(combined_df['Fin_Score'])
combined_df['Fin_Score/max_fin_score'] = combined_df['Fin_Score']/max_fin_score
max_loss_score = max(combined_df['Loss_score'])
combined_df['Loss_score/max_loss_score'] = combined_df['Loss_score']/max_loss_score
max_past_results = max(combined_df['Past_Results'])
combined_df['Past_Results/max_past_results'] = combined_df['Past_Results']/max_past_results

# Ratio of score value with corresponding mean value
mean_loc_score = np.mean(combined_df['Location_Score'])
combined_df['Location_Score/mean_loc_score'] = combined_df['Location_Score']/mean_loc_score
mean_ias_score = np.mean(combined_df['Internal_Audit_Score'])
combined_df['Internal_Audit_Score/mean_ias_score'] = combined_df['Internal_Audit_Score']/mean_ias_score
mean_eas_score = np.mean(combined_df['External_Audit_Score'])
combined_df['External_Audit_Score/mean_eas_score'] = combined_df['External_Audit_Score']/mean_eas_score
mean_fin_score = np.mean(combined_df['Fin_Score'])
combined_df['Fin_Score/mean_fin_score'] = combined_df['Fin_Score']/mean_fin_score
mean_loss_score = np.mean(combined_df['Loss_score'])
combined_df['Loss_score/mean_loss_score'] = combined_df['Loss_score']/mean_loss_score
mean_past_results = np.mean(combined_df['Past_Results'])
combined_df['Past_Results/mean_past_results'] = combined_df['Past_Results']/mean_past_results

# Box-cox transformation
_, opt_lambda = boxcox(combined_df["Location_Score"])
combined_df['Location_boxcox_lambda_0'] = boxcox((1+combined_df['Location_Score']), lmbda=0)
combined_df['Location_boxcox_lambda_opt'] = boxcox(combined_df['Location_Score'], lmbda=opt_lambda)

_, opt_lambda = boxcox(combined_df["Internal_Audit_Score"])
combined_df['IAS_boxcox_lambda_0'] = boxcox((1+combined_df['Internal_Audit_Score']), lmbda=0)
combined_df['IAS_boxcox_lambda_opt'] = boxcox(combined_df['Internal_Audit_Score'], lmbda=opt_lambda)

_, opt_lambda = boxcox(combined_df["External_Audit_Score"])
combined_df['EAS_boxcox_lambda_0'] = boxcox((1+combined_df['External_Audit_Score']), lmbda=0)
combined_df['EAS_boxcox_lambda_opt'] = boxcox(combined_df['External_Audit_Score'], lmbda=opt_lambda)

_, opt_lambda = boxcox(combined_df["Fin_Score"])
combined_df['Fin_boxcox_lambda_0'] = boxcox((1+combined_df['Fin_Score']), lmbda=0)
combined_df['Fin_boxcox_lambda_opt'] = boxcox(combined_df['Fin_Score'], lmbda=opt_lambda)

#### Create train and predict Numpy arrays and scale them

In [7]:
# Segregate combined_df into train/predict datasets
train_x = combined_df[:543]
predict_x = combined_df[543:]

print("train_x: {}".format(train_x.shape))
print("predict_x: {}".format(predict_x.shape))

train_x: (543, 96)
predict_x: (233, 96)


In [8]:
# Scale the train_x/predict_x arrays
scaler = MinMaxScaler().fit(train_x)
train_x = scaler.transform(train_x)
predict_x = scaler.transform(predict_x)

In [9]:
train_x_full = train_x.copy()
train_y_full = train_y.copy()

#### Split training data into train/test datasets

In [10]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.095, random_state=1)
for train_index, test_index in sss.split(train_x, train_y):
    train_x, test_x = train_x[train_index], train_x[test_index]
    train_y, test_y = train_y[train_index], train_y[test_index]

print("------------------------- Training Dataset -------------------------")
print("train_x shape: {}".format(train_x.shape))
print("train_y shape: {}".format(train_y.shape))

print("\n------------------------- Test Dataset -------------------------")
print("test_x shape: {}".format(test_x.shape))
print("test_y shape: {}".format(test_y.shape))

------------------------- Training Dataset -------------------------
train_x shape: (491, 96)
train_y shape: (491, 1)

------------------------- Test Dataset -------------------------
test_x shape: (52, 96)
test_y shape: (52, 1)


In [11]:
Xtrain_full, Ytrain_full = train_x_full.copy(), train_y_full.copy()
Xtrain, Ytrain = train_x.copy(), train_y.copy()
Xtest, Ytest = test_x.copy(), test_y.copy()
Xpredict = predict_x.copy()

print("------------------------- Training Dataset -------------------------")
print("Xtrain_full shape: {}".format(Xtrain_full.shape))
print("Ytrain_full shape: {}".format(Ytrain_full.shape))
print("Xtrain shape: {}".format(Xtrain.shape))
print("Ytrain shape: {}".format(Ytrain.shape))

print("\n------------------------- Test Dataset -------------------------")
print("Xtest shape: {}".format(Xtest.shape))
print("Ytest shape: {}".format(Ytest.shape))

print("\n------------------------- Prediction Dataset -------------------------")
print("Xpredict shape: {}".format(Xpredict.shape))

------------------------- Training Dataset -------------------------
Xtrain_full shape: (543, 96)
Ytrain_full shape: (543, 1)
Xtrain shape: (491, 96)
Ytrain shape: (491, 1)

------------------------- Test Dataset -------------------------
Xtest shape: (52, 96)
Ytest shape: (52, 1)

------------------------- Prediction Dataset -------------------------
Xpredict shape: (233, 96)


## Hyperparameter search using Optuna

In [12]:
# Convert Ytest to one-hot encoding
df = pd.DataFrame(Ytest, columns=["IsUnderRisk"])
test_y = pd.get_dummies(df['IsUnderRisk']).values

In [13]:
# Define 5-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

In [16]:
def objective(trial):
       
    y_pred = 0
    counter = 0
    
    for train, val in kfold.split(Xtrain, Ytrain):
        counter += 1
    
        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]
        df = pd.DataFrame(train_y, columns=['IsUnderRisk'])
        neg, pos = np.bincount(df['IsUnderRisk'])
        
        model = XGBRegressor(
            objective='multi:softprob',
            eval_metric='mlogloss',
            booster='gbtree',
            sample_type='uniform',
            tree_method='hist',
            scale_pos_weight=neg/pos,
            grow_policy='lossguide',
            num_round=5000,
            num_class=2,
            rate_drop=trial.suggest_uniform("rate_drop", 0.1, 1.0),
            learning_rate=trial.suggest_loguniform("learning_rate", 1e-2, 1e-1),
            min_split_loss=trial.suggest_uniform("min_split_loss", 0.1, 1.0),
            max_depth=trial.suggest_int("max_depth", 8, 25),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
            subsample=trial.suggest_uniform("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
            reg_lambda=trial.suggest_loguniform("reg_lambda", 1e-4, 1.0),
            max_leaves=trial.suggest_int("max_leaves", 40, 200)
        )


        model.fit(train_x, train_y, eval_set=[(val_x, val_y)], early_stopping_rounds=200, verbose=False)
        pred = model.predict(Xtest, ntree_limit=model.best_ntree_limit)
        y_pred += pred

    y_pred /= float(counter)
    loss = log_loss(test_y, y_pred)
    return loss

In [18]:
study = optuna.create_study()
study.optimize(objective, n_trials=150)

[I 2020-05-17 17:54:49,770] Finished trial#0 with value: 0.3505540769547224 with parameters: {'rate_drop': 0.14919289838323793, 'learning_rate': 0.021177329857984546, 'min_split_loss': 0.4516864622415613, 'max_depth': 12, 'min_child_weight': 10, 'subsample': 0.5261579596080811, 'colsample_bytree': 0.8668669660519669, 'reg_lambda': 0.008472595221768804, 'max_leaves': 197}. Best is trial#0 with value: 0.3505540769547224.
[I 2020-05-17 17:54:52,610] Finished trial#1 with value: 0.32299451916836774 with parameters: {'rate_drop': 0.8183056541911972, 'learning_rate': 0.016075103061531017, 'min_split_loss': 0.9635924639878313, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.9564032427535452, 'colsample_bytree': 0.5935418252656961, 'reg_lambda': 0.0006299862124012033, 'max_leaves': 62}. Best is trial#1 with value: 0.32299451916836774.
[I 2020-05-17 17:54:54,817] Finished trial#2 with value: 0.2832826720192455 with parameters: {'rate_drop': 0.49681923764583036, 'learning_rate': 0.07009527

[I 2020-05-17 17:57:00,995] Finished trial#38 with value: 0.24200540621621677 with parameters: {'rate_drop': 0.6691810285938375, 'learning_rate': 0.04919076595229832, 'min_split_loss': 0.8275315386246478, 'max_depth': 12, 'min_child_weight': 2, 'subsample': 0.9366093973412823, 'colsample_bytree': 0.7805047216030403, 'reg_lambda': 0.0004980527821383388, 'max_leaves': 65}. Best is trial#32 with value: 0.2336807178799063.
[I 2020-05-17 17:57:07,365] Finished trial#39 with value: 0.2428020303710722 with parameters: {'rate_drop': 0.645151003286583, 'learning_rate': 0.05031880689785131, 'min_split_loss': 0.8130635476879012, 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.9222533506037981, 'colsample_bytree': 0.774411903499565, 'reg_lambda': 0.001488121119376214, 'max_leaves': 72}. Best is trial#32 with value: 0.2336807178799063.
[I 2020-05-17 17:57:10,089] Finished trial#40 with value: 0.2874180633538904 with parameters: {'rate_drop': 0.6542479919412673, 'learning_rate': 0.04899911362

[I 2020-05-17 17:59:50,658] Finished trial#76 with value: 0.24606420703303927 with parameters: {'rate_drop': 0.9247679312172523, 'learning_rate': 0.042024088966813485, 'min_split_loss': 0.8577958272177121, 'max_depth': 12, 'min_child_weight': 1, 'subsample': 0.9127057197370674, 'colsample_bytree': 0.9074660298004875, 'reg_lambda': 0.0008599162613210832, 'max_leaves': 118}. Best is trial#32 with value: 0.2336807178799063.
[I 2020-05-17 17:59:54,444] Finished trial#77 with value: 0.2573921849879508 with parameters: {'rate_drop': 0.9870286726645165, 'learning_rate': 0.034428016564244504, 'min_split_loss': 0.8254564293598482, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.951626643894464, 'colsample_bytree': 0.8801606074449513, 'reg_lambda': 0.00010611170051465685, 'max_leaves': 101}. Best is trial#32 with value: 0.2336807178799063.
[I 2020-05-17 18:00:00,436] Finished trial#78 with value: 0.25298721264474666 with parameters: {'rate_drop': 0.6307815186900375, 'learning_rate': 0.0308

[I 2020-05-17 18:02:44,680] Finished trial#114 with value: 0.238750842328255 with parameters: {'rate_drop': 0.8491364682656789, 'learning_rate': 0.0770375293063097, 'min_split_loss': 0.5186298491331421, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.9840074016547521, 'colsample_bytree': 0.8887733870783623, 'reg_lambda': 0.004042845575970693, 'max_leaves': 105}. Best is trial#110 with value: 0.2328482777453386.
[I 2020-05-17 18:02:48,414] Finished trial#115 with value: 0.23673801475132888 with parameters: {'rate_drop': 0.8433600899190027, 'learning_rate': 0.07616214011546839, 'min_split_loss': 0.5172541127544699, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.999810791315374, 'colsample_bytree': 0.8855032703108465, 'reg_lambda': 0.003392105701194897, 'max_leaves': 104}. Best is trial#110 with value: 0.2328482777453386.
[I 2020-05-17 18:02:52,442] Finished trial#116 with value: 0.24567071452307013 with parameters: {'rate_drop': 0.8474734019278819, 'learning_rate': 0.0767838

In [19]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))

Number of finished trials: 150
Best trial:
Value: 0.22509114333213523
Params: 
 rate_drop: 0.7998554790473605
 learning_rate: 0.08254592411822248
 min_split_loss: 0.453358104006338
 max_depth: 8
 min_child_weight: 1
 subsample: 0.9995250735503225
 colsample_bytree: 0.9008138801912015
 reg_lambda: 0.0033177806819007093
 max_leaves: 101


## Build the model

In [None]:
# Define 5-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
y_pred = 0

In [None]:
# Train the model using K-fold
counter = 0

for train, val in kfold.split(Xtrain, Ytrain):
    counter += 1

    train_x, train_y = Xtrain[train], Ytrain[train]
    val_x, val_y = Xtrain[val], Ytrain[val]
    df = pd.DataFrame(train_y, columns=['IsUnderRisk'])
    neg, pos = np.bincount(df['IsUnderRisk'])
    
    model = XGBRegressor(
        objective='multi:softprob',
        eval_metric='mlogloss',
        booster='gbtree',
        sample_type='uniform',
        tree_method='hist',
        scale_pos_weight=neg/pos,
        grow_policy='lossguide',
        num_round=5000,
        num_class=2,
        rate_drop=0.14,
        learning_rate=0.07,
        min_split_loss=0.8,
        max_depth=16,
        min_child_weight=2,
        subsample=0.96,
        colsample_bytree=0.88,
        reg_lambda=0.13,
        max_leaves=94
    )

    model.fit(train_x, train_y, eval_set=[(val_x, val_y)], early_stopping_rounds=200, verbose=True)
    pred = model.predict(Xtest, ntree_limit=model.best_ntree_limit)

    y_pred += pred

y_pred /= float(counter)

## Validate the model

In [None]:
#Print log_loss
loss = log_loss(test_y, y_pred)
print('Overall log_loss of model:', loss)

In [None]:
#Print accuracy
y_pred_binary = np.array([np.argmax(y_pred, axis=1)]).T
acc_score = accuracy_score(Ytest, y_pred_binary)
f1 = f1_score(Ytest, y_pred_binary)
print('Overall accuracy:', acc_score)
print('Overall F1-Score:', f1)

In [None]:
#Print Area Under Curve
plt.figure()
false_positive_rate, recall, thresholds = roc_curve(Ytest, y_pred_binary)
roc_auc = auc(false_positive_rate, recall)
plt.title('Receiver Operating Characteristic (ROC)')
plt.plot(false_positive_rate, recall, 'b', label = 'AUC = %0.3f' %roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out (1-Specificity)')
plt.show()

print('AUC score:', roc_auc)

In [None]:
#Print Confusion Matrix
cm = confusion_matrix(Ytest, y_pred_binary)
labels = ['0', '1']
sns.heatmap(cm, xticklabels = labels, yticklabels = labels, annot = True, fmt='d', cmap="Blues", vmin = 0.5);
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()

## Train model on entire data

In [None]:
# Define 5-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
y_pred = 0

In [None]:
# Train the model using K-fold
counter = 0

for train, val in kfold.split(Xtrain_full, Ytrain_full):
    counter += 1

    train_x, train_y = Xtrain_full[train], Ytrain_full[train]
    val_x, val_y = Xtrain_full[val], Ytrain_full[val]
    df = pd.DataFrame(train_y, columns=['IsUnderRisk'])
    neg, pos = np.bincount(df['IsUnderRisk'])
    
    model = XGBRegressor(
        objective='multi:softprob',
        eval_metric='mlogloss',
        booster='gbtree',
        sample_type='uniform',
        tree_method='hist',
        scale_pos_weight=neg/pos,
        grow_policy='lossguide',
        num_round=5000,
        num_class=2,
        rate_drop=0.6,
        learning_rate=0.06,
        min_split_loss=0.7,
        max_depth=11,
        min_child_weight=1,
        subsample=0.88,
        colsample_bytree=0.87,
        reg_lambda=0.0006,
        max_leaves=132
    )

    model.fit(train_x, train_y, eval_set=[(val_x, val_y)], early_stopping_rounds=200, verbose=True)
    pred = model.predict(Xpredict, ntree_limit=model.best_ntree_limit)

    y_pred += pred

y_pred /= float(counter)

## Create submission file

In [None]:
submit_df = pd.DataFrame(y_pred, columns=['0','1'])
submit_df.head()

In [None]:
submit_df.to_excel("Predictions/predictions_v12.xlsx", index=False)