## Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, f1_score, roc_curve, auc, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
import lightgbm as lgb
import optuna

## Set file paths for train and predict datasets

In [2]:
train_dataset = "Dataset/Train.csv"
predict_dataset = "Dataset/Test.csv"

## Exploratory Data Analysis

#### Get train dataset info

In [None]:
train_df = pd.read_csv(train_dataset)
train_df.head()

In [None]:
train_df.info(verbose=True)

#### Get predict dataset info

In [None]:
predict_df = pd.read_csv(predict_dataset)
predict_df.head()

In [None]:
predict_df.info(verbose=True)

#### Count plot of target variable

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x="IsUnderRisk", data=train_df).set_title('Count plot of "IsUnderRisk" data')
plt.grid()

#### Examine class imbalance

In [None]:
neg, pos = np.bincount(train_df['IsUnderRisk'])
total = neg + pos
print('Total: {}\n  Positive: {} ({:.2f}% of total)\n  Negative: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total, neg, 100 * neg / total))

#### Get correlation between different features

In [None]:
plt.figure(figsize=(30,15))
ax = sns.heatmap(train_df.corr(), annot=True, linewidth = 0.5, cmap='coolwarm', 
                 fmt='.1g', vmin=-1, vmax=1, center= 0, square=True)
plt.show()
#figure = ax.get_figure()    
#figure.savefig('correlation_heatmap.png', dpi=500)

## Data Preprocessing

#### Read train and predict datasets

In [23]:
train_df = pd.read_csv(train_dataset)
predict_df = pd.read_csv(predict_dataset)
print("train_df: {}".format(train_df.shape))
print("predict_df: {}".format(predict_df.shape))

train_df: (543, 8)
predict_df: (233, 7)


#### Extract "IsUnderRisk" field from train_df into NumPy array

In [24]:
train_y = np.array([train_df['IsUnderRisk'].values]).T
train_df.drop(['IsUnderRisk'], inplace=True, axis=1)
print("train_y: {}".format(train_y.shape))

train_y: (543, 1)


#### Feature Engineering

In [25]:
# Combine train and predict dataframes
combined_df = train_df.append(predict_df, sort=False, ignore_index=True)
print(combined_df.shape)

(776, 7)


In [26]:
# Standard Deviation of all score values
combined_df['Location_Score_STD'] = np.std(combined_df['Location_Score'])
combined_df['Internal_Audit_Score_STD'] = np.std(combined_df['Internal_Audit_Score'])
combined_df['External_Audit_Score_STD'] = np.std(combined_df['External_Audit_Score'])
combined_df['Fin_Score_STD'] = np.std(combined_df['Fin_Score'])
combined_df['Loss_score_STD'] = np.std(combined_df['Loss_score'])
combined_df['Past_Results_STD'] = np.std(combined_df['Past_Results'])

# Binning
combined_df['Past_Results_bin'] = combined_df['Past_Results'].apply(lambda x: "[0-2]" if 0<=x<2 else "[2-4]" if 2<=x<4 else "[4-10]")
combined_df['Loss_Score_bin'] = combined_df['Loss_score'].apply(lambda x: "[0-5]" if 0<=x<5 else "[5-9]" if 5<=x<9 else "[9-15]")

# One-hot encoding
one_hot = pd.get_dummies(combined_df['City'])
combined_df.drop('City', axis = 1, inplace=True)
combined_df = combined_df.join(one_hot)

one_hot = pd.get_dummies(combined_df['Past_Results_bin'])
combined_df.drop('Past_Results_bin', axis = 1, inplace=True)
combined_df = combined_df.join(one_hot)

one_hot = pd.get_dummies(combined_df['Loss_Score_bin'])
combined_df.drop('Loss_Score_bin', axis = 1, inplace=True)
combined_df = combined_df.join(one_hot)

# Convert all scores to log scale
combined_df['Location_Score'] = np.log1p(combined_df['Location_Score'])
combined_df['Internal_Audit_Score'] = np.log1p(combined_df['Internal_Audit_Score'])
combined_df['External_Audit_Score'] = np.log1p(combined_df['External_Audit_Score'])
combined_df['Fin_Score'] = np.log1p(combined_df['Fin_Score'])
combined_df['Loss_score'] = np.log1p(combined_df['Loss_score'])
combined_df['Past_Results'] = np.log1p(combined_df['Past_Results'])

# Get difference and mean of different scores
combined_df["Audit_Score_Diff"] = np.power(combined_df["Internal_Audit_Score"] - combined_df["External_Audit_Score"], 2)
combined_df["Average_Audit_Score"] = (combined_df["Internal_Audit_Score"] + combined_df["External_Audit_Score"]) / 2
combined_df["Score_diff1"] = np.power(combined_df["Fin_Score"] - combined_df["Loss_score"], 2)
combined_df["Score_diff2"] = np.power(combined_df["Fin_Score"] - combined_df["Average_Audit_Score"], 2)
combined_df["Score_diff3"] = np.power(combined_df["Loss_score"] - combined_df["Average_Audit_Score"], 2)
combined_df["Cumulative_Score"] = (combined_df['Location_Score'] + combined_df["Internal_Audit_Score"] + combined_df["External_Audit_Score"] + combined_df["Fin_Score"] + combined_df["Loss_score"]) / 5

# Calculate different score ratios
combined_df["Fin_Score/Average_Audit_Score"] = combined_df["Fin_Score"] / combined_df["Average_Audit_Score"]
combined_df["Loss_score/Average_Audit_Score"] = combined_df["Loss_score"] / combined_df["Average_Audit_Score"]
combined_df["Loss_score/Fin_Score"] = combined_df["Loss_score"] / combined_df["Fin_Score"]
combined_df["Fin_Score/Location_Score"] = combined_df["Fin_Score"] / combined_df["Location_Score"]
combined_df["Loss_score/Location_Score"] = combined_df["Loss_score"] / combined_df["Location_Score"]
combined_df["Average_Audit_Score/Location_Score"] = combined_df["Average_Audit_Score"] / combined_df["Location_Score"]
combined_df["Past_Results/Location_Score"] = combined_df["Past_Results"] / combined_df["Location_Score"]

# Box-cox transformation
_, opt_lambda = boxcox(combined_df["Location_Score"])
combined_df['Location_boxcox_lambda_0'] = boxcox((1+combined_df['Location_Score']), lmbda=0)
combined_df['Location_boxcox_lambda_opt'] = boxcox(combined_df['Location_Score'], lmbda=opt_lambda)

_, opt_lambda = boxcox(combined_df["Internal_Audit_Score"])
combined_df['IAS_boxcox_lambda_0'] = boxcox((1+combined_df['Internal_Audit_Score']), lmbda=0)
combined_df['IAS_boxcox_lambda_opt'] = boxcox(combined_df['Internal_Audit_Score'], lmbda=opt_lambda)

_, opt_lambda = boxcox(combined_df["External_Audit_Score"])
combined_df['EAS_boxcox_lambda_0'] = boxcox((1+combined_df['External_Audit_Score']), lmbda=0)
combined_df['EAS_boxcox_lambda_opt'] = boxcox(combined_df['External_Audit_Score'], lmbda=opt_lambda)

_, opt_lambda = boxcox(combined_df["Fin_Score"])
combined_df['Fin_boxcox_lambda_0'] = boxcox((1+combined_df['Fin_Score']), lmbda=0)
combined_df['Fin_boxcox_lambda_opt'] = boxcox(combined_df['Fin_Score'], lmbda=opt_lambda)

#### Create train and predict Numpy arrays and scale them

In [27]:
# Segregate combined_df into train/predict datasets
train_x = combined_df[:543]
predict_x = combined_df[543:]

print("train_x: {}".format(train_x.shape))
print("predict_x: {}".format(predict_x.shape))

train_x: (543, 84)
predict_x: (233, 84)


In [28]:
# Scale the train_x/predict_x arrays
scaler = MinMaxScaler().fit(train_x)
train_x = scaler.transform(train_x)
predict_x = scaler.transform(predict_x)

In [29]:
train_x_full = train_x.copy()
train_y_full = train_y.copy()

#### Split training data into train/test datasets

In [30]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.095, random_state=1)
for train_index, test_index in sss.split(train_x, train_y):
    train_x, test_x = train_x[train_index], train_x[test_index]
    train_y, test_y = train_y[train_index], train_y[test_index]

print("------------------------- Training Dataset -------------------------")
print("train_x shape: {}".format(train_x.shape))
print("train_y shape: {}".format(train_y.shape))

print("\n------------------------- Test Dataset -------------------------")
print("test_x shape: {}".format(test_x.shape))
print("test_y shape: {}".format(test_y.shape))

------------------------- Training Dataset -------------------------
train_x shape: (491, 84)
train_y shape: (491, 1)

------------------------- Test Dataset -------------------------
test_x shape: (52, 84)
test_y shape: (52, 1)


In [31]:
Xtrain_full, Ytrain_full = train_x_full.copy(), train_y_full.copy()
Xtrain, Ytrain = train_x.copy(), train_y.copy()
Xtest, Ytest = test_x.copy(), test_y.copy()
Xpredict = predict_x.copy()

print("------------------------- Training Dataset -------------------------")
print("Xtrain_full shape: {}".format(Xtrain_full.shape))
print("Ytrain_full shape: {}".format(Ytrain_full.shape))
print("Xtrain shape: {}".format(Xtrain.shape))
print("Ytrain shape: {}".format(Ytrain.shape))

print("\n------------------------- Test Dataset -------------------------")
print("Xtest shape: {}".format(Xtest.shape))
print("Ytest shape: {}".format(Ytest.shape))

print("\n------------------------- Prediction Dataset -------------------------")
print("Xpredict shape: {}".format(Xpredict.shape))

------------------------- Training Dataset -------------------------
Xtrain_full shape: (543, 84)
Ytrain_full shape: (543, 1)
Xtrain shape: (491, 84)
Ytrain shape: (491, 1)

------------------------- Test Dataset -------------------------
Xtest shape: (52, 84)
Ytest shape: (52, 1)

------------------------- Prediction Dataset -------------------------
Xpredict shape: (233, 84)


## Hyperparameter search using Optuna

In [32]:
# Convert Ytest to one-hot encoding
df = pd.DataFrame(Ytest, columns=["IsUnderRisk"])
test_y = pd.get_dummies(df['IsUnderRisk']).values

In [33]:
# Define 5-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

In [34]:
def objective(trial):
    
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 2,
        "verbosity": -1,
        "boosting_type": "gbdt",
        "is_unbalance": True,
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 1e-1),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-4, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 35, 200),
        "max_depth": trial.suggest_int("max_depth", 8, 25),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 5, 20),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 20),
    }
    
    y_pred = 0
    counter = 0
    for train, val in kfold.split(Xtrain, Ytrain):
        counter += 1
    
        train_x, train_y = Xtrain[train], Ytrain[train]
        val_x, val_y = Xtrain[val], Ytrain[val]

        lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
        lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

        model = lgb.train(params, lgtrain, valid_sets=[lgvalidation], 
                          num_boost_round=5000, early_stopping_rounds=200, verbose_eval=False)
        pred = model.predict(Xtest, num_iteration=model.best_iteration)

        y_pred += pred

    y_pred /= float(counter)
    loss = log_loss(test_y, y_pred)
    return loss

In [35]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2020-05-17 18:36:12,998] Finished trial#0 with value: 0.246493482372785 with parameters: {'learning_rate': 0.07195551211067949, 'lambda_l2': 0.24930400666346547, 'num_leaves': 167, 'max_depth': 23, 'feature_fraction': 0.81893454182465, 'bagging_fraction': 0.8067354629805283, 'bagging_freq': 12, 'min_child_samples': 7}. Best is trial#0 with value: 0.246493482372785.
[I 2020-05-17 18:36:15,543] Finished trial#1 with value: 0.3017543228884818 with parameters: {'learning_rate': 0.013501557739583881, 'lambda_l2': 0.0032146154741599328, 'num_leaves': 136, 'max_depth': 8, 'feature_fraction': 0.8247659930439812, 'bagging_fraction': 0.622483821439892, 'bagging_freq': 14, 'min_child_samples': 20}. Best is trial#0 with value: 0.246493482372785.
[I 2020-05-17 18:36:24,543] Finished trial#2 with value: 0.22863513083938264 with parameters: {'learning_rate': 0.020022106734999004, 'lambda_l2': 0.18504421567002546, 'num_leaves': 186, 'max_depth': 11, 'feature_fraction': 0.8458746856011641, 'bagging_

[I 2020-05-17 18:38:29,934] Finished trial#21 with value: 0.21952376732803872 with parameters: {'learning_rate': 0.017455491608048095, 'lambda_l2': 0.0011391506979238364, 'num_leaves': 115, 'max_depth': 14, 'feature_fraction': 0.850097964218496, 'bagging_fraction': 0.7477052820661827, 'bagging_freq': 13, 'min_child_samples': 1}. Best is trial#21 with value: 0.21952376732803872.
[I 2020-05-17 18:38:36,865] Finished trial#22 with value: 0.23309276411321272 with parameters: {'learning_rate': 0.01763728320349444, 'lambda_l2': 0.00047835136717570023, 'num_leaves': 79, 'max_depth': 10, 'feature_fraction': 0.779638624643137, 'bagging_fraction': 0.7889635966652238, 'bagging_freq': 13, 'min_child_samples': 3}. Best is trial#21 with value: 0.21952376732803872.
[I 2020-05-17 18:38:46,625] Finished trial#23 with value: 0.23449818395857713 with parameters: {'learning_rate': 0.023711445750699167, 'lambda_l2': 0.0027188794527594432, 'num_leaves': 104, 'max_depth': 14, 'feature_fraction': 0.8852505256

[I 2020-05-17 18:41:11,543] Finished trial#42 with value: 0.2514428095468876 with parameters: {'learning_rate': 0.013476541999830527, 'lambda_l2': 0.0001455709041724295, 'num_leaves': 121, 'max_depth': 12, 'feature_fraction': 0.8262807181700286, 'bagging_fraction': 0.6980308962313827, 'bagging_freq': 19, 'min_child_samples': 3}. Best is trial#41 with value: 0.21729107501309106.
[I 2020-05-17 18:41:22,937] Finished trial#43 with value: 0.2417858299082517 with parameters: {'learning_rate': 0.016690523400352535, 'lambda_l2': 0.0005694936163665529, 'num_leaves': 97, 'max_depth': 10, 'feature_fraction': 0.8993309920777763, 'bagging_fraction': 0.7350283829742368, 'bagging_freq': 14, 'min_child_samples': 1}. Best is trial#41 with value: 0.21729107501309106.
[I 2020-05-17 18:41:28,389] Finished trial#44 with value: 0.23758063115109113 with parameters: {'learning_rate': 0.021447219185030047, 'lambda_l2': 0.3666210377250154, 'num_leaves': 146, 'max_depth': 9, 'feature_fraction': 0.83407078241877

KeyboardInterrupt: 

In [16]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))

Number of finished trials: 200
Best trial:
Value: 0.20152153538527245
Params: 
 learning_rate: 0.06429437878342074
 lambda_l2: 0.0008997726071095372
 num_leaves: 200
 max_depth: 9
 feature_fraction: 0.6883384481523004
 bagging_fraction: 0.798563706360225
 bagging_freq: 13
 min_child_samples: 1


## Build the model

In [17]:
# Define model hyperparameters
params = {}
params["objective"] = 'multiclass'
params["metric"] = 'multi_logloss'
params["num_class"] = 2
params["is_unbalance"] = True
params["boosting"] = 'gbdt'
params["max_depth"] = 9
params["num_leaves"] = 200
params["learning_rate"] = 0.06
params["bagging_fraction"] = 0.8
params["feature_fraction"] = 0.8
params["bagging_freq"] = 13
params["bagging_seed"] = 10
params["lambda_l2"] = 0.009
params["min_data_in_leaf"] = 1
params["verbosity"] = -1
num_rounds = 5000

In [18]:
# Define 5-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
y_pred = 0

In [19]:
# Train the model using K-fold
counter = 0

for train, val in kfold.split(Xtrain, Ytrain):
    counter += 1

    train_x, train_y = Xtrain[train], Ytrain[train]
    val_x, val_y = Xtrain[val], Ytrain[val]

    lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
    lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

    model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgvalidation], early_stopping_rounds=200, verbose_eval=1000)
    pred = model.predict(Xtest, num_iteration=model.best_iteration)

    y_pred += pred

y_pred /= float(counter)

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[35]	valid_0's multi_logloss: 0.371732
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[38]	valid_0's multi_logloss: 0.317244
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[32]	valid_0's multi_logloss: 0.341795
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[45]	valid_0's multi_logloss: 0.241932
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[52]	valid_0's multi_logloss: 0.32197


## Validate the model

In [20]:
#Print log_loss
loss = log_loss(test_y, y_pred)
print('Overall log_loss of model:', loss)

Overall log_loss of model: 0.2632133063224379


In [21]:
#Print accuracy
y_pred_binary = np.array([np.argmax(y_pred, axis=1)]).T
acc_score = accuracy_score(Ytest, y_pred_binary)
f1 = f1_score(Ytest, y_pred_binary)
print('Overall accuracy:', acc_score)
print('Overall F1-Score:', f1)

Overall accuracy: 0.8846153846153846
Overall F1-Score: 0.90625


In [None]:
#Print Area Under Curve
plt.figure()
false_positive_rate, recall, thresholds = roc_curve(Ytest, y_pred_binary)
roc_auc = auc(false_positive_rate, recall)
plt.title('Receiver Operating Characteristic (ROC)')
plt.plot(false_positive_rate, recall, 'b', label = 'AUC = %0.3f' %roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out (1-Specificity)')
plt.show()

print('AUC score:', roc_auc)

In [None]:
#Print Confusion Matrix
cm = confusion_matrix(Ytest, y_pred_binary)
labels = ['0', '1']
sns.heatmap(cm, xticklabels = labels, yticklabels = labels, annot = True, fmt='d', cmap="Blues", vmin = 0.5);
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()

## Train model on entire data

In [None]:
# Define 5-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
y_pred = 0

In [None]:
# Train the model using K-fold
counter = 0

for train, val in kfold.split(Xtrain_full, Ytrain_full):
    counter += 1

    train_x, train_y = Xtrain_full[train], Ytrain_full[train]
    val_x, val_y = Xtrain_full[val], Ytrain_full[val]

    lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
    lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

    model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgvalidation], early_stopping_rounds=200, verbose_eval=1000)
    pred = model.predict(Xpredict, num_iteration=model.best_iteration)

    y_pred += pred

y_pred /= float(counter)

## Create submission file

In [None]:
submit_df = pd.DataFrame(y_pred, columns=['0','1'])
submit_df.head()

In [None]:
submit_df.to_excel("Predictions/predictions_v13.xlsx", index=False)