## Fit a Logistic Regression and a Gradient Boosted Tree

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score,roc_auc_score, auc, roc_curve
from sklearn.multiclass import OneVsRestClassifier

In [26]:
intro = "/media/eric/nachmanides/programming/projects/post-program_project/"
df = pd.read_pickle(intro+"modeling-eric/data/final_df.pkl")

In [27]:
# df = df[(df.safe == 0.0) | (df.safe == 1.0)]

In [28]:
# Let the sunrise_delta and sunset_delta be, for the day of each timebin, the elapsed 
# time between midnight of that day and sunrise (respectively, sunset) on that day. 

midnight = df['sunrise_time'].apply(lambda x: x.round(freq = 'd'))
df['sunrise_delta'] = (df['sunrise_time'] - midnight)  / np.timedelta64(1,'m')
df['sunset_delta'] = (df['sunset_time'] - midnight)  / np.timedelta64(1,'m')

In [29]:
# We will drop sunrise time and sunset time, since those predictors are hard to 
# compare with the others, and NaTs can't be fed into StandardScaler. We will 
# instead use sunrise_delta and sunset_delta as measures of how early or late 
# sunrise and sunset are on each day. 
df = df.drop(['sunrise_time','sunset_time'],axis=1)

In [30]:
# Get just the predictors
pred = df.drop('safe',axis=1)

# Impute means of each column
pred = pred.fillna(pred.mean())

scaler = StandardScaler()
scaler.fit(pred)
X = scaler.transform(pred)
y = df.safe

# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]

In [31]:
# Define our training and test sets
# NOTE: Because we have time series, we can't split our train and test sets randomly.
tscv = TimeSeriesSplit(max_train_size=None, n_splits=5)

# Call our classifier. Since we're keeping NAs (in the form of 2.0), 
# we'll treat this as a multi-class classification problem. 
clf = OneVsRestClassifier(LogisticRegression(solver = 'lbfgs', 
                                                multi_class = 'ovr', 
                                                random_state = 42))


# Create dicts whose keys are the classes and whose values 
# are the arrays of scores for those classes for our metrics

acc = dict()
prec = dict()
rec = dict()
roc_auc = dict()

for i in range(n_classes):
        # Initialize the score arrays for class i
        acc[i] = []
        prec[i] = []
        rec[i] = []
        roc_auc[i] = []

for train_index, test_index in tscv.split(X):
    
    # Define indices identifying each pair of CV training and test set
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Learn to predict each class against the other
    y_score = clf.fit(X_train, y_train).predict(X_test)
    
    
    # For each response class and each metric, append our classifier's 
    # score for that class w.r.t. that metric, on the train-test-split
    # under consideration:
    for i in range(n_classes):
        
        # Calculate the scores for class i
        accuracy = accuracy_score(y_test[:, i], y_score[:, i])
        precision = precision_score(y_test[:, i], y_score[:, i])
        recall = recall_score(y_test[:, i], y_score[:, i])
        false_pos, true_pos, _ = roc_curve(y_test[:, i], y_score[:, i])
        area_under_curve = auc(false_pos,true_pos)
        
        acc[i].append(accuracy)
        prec[i].append(precision)
        rec[i].append(recall)
        roc_auc[i].append(area_under_curve)    
        
#     # Compute micro-average ROC curve and ROC area
#     fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
#     roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [32]:
roc_auc

{0: [0.5, 0.5, 0.5, 0.5, 0.5],
 1: [0.5, 0.5, 0.5, 0.7486130374479889, 0.5],
 2: [0.5, 0.5, 0.5, 0.5318443443443444, 0.4952076677316294]}

In [37]:
# Replace each array by its mean
for i in range(n_classes):
        acc[i] = np.mean(acc[i])
        prec[i] = np.mean(prec[i])
        rec[i] = np.mean(rec[i])
        roc_auc[i] = np.mean(roc_auc[i])

        print("\nresponse class:\t", i, 
              "\naccuracy:\t",acc[i],
              "\nprecision:\t",prec[i],
              "\nrecall:\t\t",rec[i],
              "\nROC AUC:\t",roc_auc[i])


response class:	 0 
accuracy:	 0.98328173374613 
precision:	 0.0 
recall:		 0.0 
ROC AUC:	 0.5

response class:	 1 
accuracy:	 0.9591331269349845 
precision:	 0.03333333333333333 
recall:		 0.1285714285714286 
ROC AUC:	 0.5497226074895978

response class:	 2 
accuracy:	 0.94984520123839 
precision:	 0.9657198405678061 
recall:		 0.9811911752007598 
ROC AUC:	 0.5054104024151947


## For Later:

In [None]:
# # Plot of a ROC curve for a specific class

# plt.figure()
# lw = 2
# plt.plot(fpr[2], tpr[2], color='darkorange',
#          lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
# plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic example')
# plt.legend(loc="lower right")
# plt.show()

In [None]:
# from sklearn.metrics import roc_auc_score, roc_curve

# def roc_auc(model,model_name, X_test, y_test):
#     # generate a no skill prediction (majority class)
#     ns_probs = [0 for _ in range(len(y_test))]
#     # predict probabilities
#     lr_probs = model.predict_proba(X_test)
#     # # keep probabilities for the positive outcome only
#     lr_probs = lr_probs[:, 1]
#     # calculate scores
#     ns_auc = roc_auc_score(y_test, ns_probs)
#     lr_auc = roc_auc_score(y_test, lr_probs)
#     # summarize scores
#     print('No Skill: ROC AUC=%.3f' % (ns_auc))
#     print(model_name+': ROC AUC=%.3f' % (lr_auc))
#     # calculate roc curves
#     ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
#     lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
#     # plot the roc curve for the model
#     plt.figure()
#     plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
#     plt.plot(lr_fpr, lr_tpr, marker='.', label=model_name)
#     # axis labels
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.legend(); plt.show()
