In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

def evaluation_metrics(y_pred, y_label, thresh=0.5):
    true_pos = []
    true_neg = []
    false_pos = []
    false_neg = []

    for i in range(len(y_pred)):
        if y_label[i] < thresh and y_pred[i] < thresh:
            true_neg.append(i)
        elif y_label[i] > thresh and y_pred[i] > thresh:
            true_pos.append(i)
        elif y_label[i] < thresh and y_pred[i] > thresh:
            false_pos.append(i)
        elif y_label[i] > thresh and y_pred[i] < thresh:
            false_neg.append(i)

    TP = len(true_pos)
    TN = len(true_neg)
    FP = len(false_pos)
    FN = len(false_neg)

    fpr, tpr, thresholds = roc_curve(y_label, y_pred, pos_label=1)

    eval = \
    {
        'accuracy'  : (TP + TN) / (TP + TN + TN + FN) if (TP + TN + TN + FN) > 0 else 0, 
        'precision' : TP / (TP + FP) if (TP + FP) > 0 else 0,
        'recall'    : TP / (TP + FN) if (TP + FN) > 0 else 0,
        'f1'        : (2*TP) / (2*TP + FP + FN) if (TP + FP + FN) > 0 else 0,
        'roc_auc'   : auc(fpr, tpr),
        'fpr'       : fpr,
        'tpr'       : tpr
    }

    return eval

In [52]:
# *** LOAD DATA *** 
from sklearn.model_selection import train_test_split

nominal_sample = pd.read_csv('2015_sample.csv')
failure_sample = pd.read_csv('2015_failures.csv')

# For further testing on data from the future
future_nominal = pd.read_csv('2018_q1_sample.csv')
future_failure = pd.read_csv('2018_q1_failures.csv')

# nominal_sample = nominal_sample.dropna(axis=0, thresh=20).dropna(axis=1)
# failure_sample = failure_sample.dropna(axis=0, thresh=20).dropna(axis=1)

feature_columns = [ 'smart_1_normalized', 'smart_3_normalized', 'smart_4_normalized', 'smart_5_normalized',
           'smart_7_normalized','smart_12_normalized', 'smart_194_normalized', 
           'smart_197_normalized', 'smart_198_normalized', 'smart_199_normalized']

data_curr = pd.concat([nominal_sample, failure_sample])
y_curr = data_curr['failure']
x_curr = data_curr[feature_columns]

x_train, x_val, y_train, y_val = train_test_split(x_curr, y_curr, train_size=0.8, random_state=1)

data_future = pd.concat([future_nominal, future_failure])
y_future = data_future['failure']
x_future = data_future[feature_columns]

x_train = x_train.values
x_val = x_val.values
y_train = y_train.values
y_val = y_val.values
x_future = x_future.values
y_future = y_future.values

# *** LOAD DATA ***



In [60]:
# *** MODELS ***
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier

lr1 = LogisticRegression(solver='newton-cg')
lr2 = LogisticRegression(solver='lbfgs')
lr3 = LogisticRegression(solver='saga')
lr4 = LogisticRegression(solver='sag')
lr5 = LogisticRegression(solver='liblinear')

svm_linear = svm.SVC(kernel='linear', gamma='scale', probability=True)
svm_rbf = svm.SVC(kernel='rbf', gamma='scale', probability=True)
svm_poly = svm.SVC(kernel='poly', gamma='scale', probability=True)
svm_sig = svm.SVC(kernel='sigmoid', gamma='scale', probability=True)

rfc = RandomForestClassifier()
rfc100 = RandomForestClassifier(n_estimators=100)

boost1 = GradientBoostingClassifier(loss='deviance')
boost2 = GradientBoostingClassifier(loss='exponential')

gnb = GaussianNB()
ada = AdaBoostClassifier(gnb, algorithm="SAMME", n_estimators=200)



models = [lr1, lr2, lr3, lr4, lr5, svm_linear, svm_rbf, svm_poly, svm_sig, rfc, rfc100, boost1, boost2, ada]
# *** MODELS ***

In [None]:
# *** RUN MODELS ***

for model in models:

    model.fit(x_train, y_train)

    pred_train = model.predict_proba(x_train)[:,1]
    pred_val = model.predict_proba(x_val)[:,1]
    
    # pred_future = svm_linear.predict_proba(x_future)[:,1]
    # pred_train
    
    eval = evaluation_metrics(pred_val, y_val)
    print("Trained:  ", model)
    print("Precision:", eval['precision'])
    print("Recall:   ", eval['recall'])
    print("Accuracy: ", eval['accuracy'])
    print("F1-Score: ", eval['f1'])
    print("AuC:      ", eval['roc_auc'])
    print("")
    
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.plot(eval['fpr'], eval['tpr'], label='ROC curve (area = %0.2f)' % eval['roc_auc'])
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

# *** RUN MODELS ***

In [59]:
# *** SPECIAL MODEL: xgboost ***
from xgboost import XGBClassifier

xgb = XGBClassifier(objective ='reg:linear', 
                             max_depth = 10,
                             silent = 1,
                             learning_rate = 0.05,
                             n_estimators = 1000)

eval_set = [(x_val, y_val)]

xgb.fit(x_train, y_train, early_stopping_rounds=100,eval_metric= 'logloss', eval_set=eval_set)

pred_train = model.predict_proba(x_train)[:,1]
pred_val = model.predict_proba(x_val)[:,1]

# pred_future = svm_linear.predict_proba(x_future)[:,1]
# pred_train

eval = evaluation_metrics(pred_val, y_val)
print("Trained:  ", model)
print("Precision:", eval['precision'])
print("Recall:   ", eval['recall'])
print("Accuracy: ", eval['accuracy'])
print("F1-Score: ", eval['f1'])
print("AuC:      ", eval['roc_auc'])
print("")

# *** SPECIAL MODEL: xgboost ***

[0]	validation_0-logloss:0.680333
Will train until validation_0-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.6681
[2]	validation_0-logloss:0.65772
[3]	validation_0-logloss:0.647879
[4]	validation_0-logloss:0.638731
[5]	validation_0-logloss:0.630713
[6]	validation_0-logloss:0.623046
[7]	validation_0-logloss:0.616239
[8]	validation_0-logloss:0.610338
[9]	validation_0-logloss:0.60489
[10]	validation_0-logloss:0.599988
[11]	validation_0-logloss:0.595239
[12]	validation_0-logloss:0.590716
[13]	validation_0-logloss:0.587124
[14]	validation_0-logloss:0.583701
[15]	validation_0-logloss:0.580077
[16]	validation_0-logloss:0.577065
[17]	validation_0-logloss:0.574641
[18]	validation_0-logloss:0.572136
[19]	validation_0-logloss:0.569933
[20]	validation_0-logloss:0.56829
[21]	validation_0-logloss:0.566556
[22]	validation_0-logloss:0.564917
[23]	validation_0-logloss:0.562984
[24]	validation_0-logloss:0.561114
[25]	validation_0-logloss:0.559432
[26]	validation_0-logloss:0.558281
[