## Fit and Validate a Logistic Regression

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, label_binarize

In [2]:
# Project root directory. Defaults to the original local checkout; set the
# PROJECT_DIR environment variable to run the notebook from another location.
# os.path.join(..., "") guarantees the trailing separator that the string
# concatenations built on `intro` rely on.
intro = os.path.join(
    os.environ.get(
        "PROJECT_DIR",
        "/media/eric/nachmanides/programming/projects/post-program_project/",
    ),
    "",
)

# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(intro + "modeling-eric/data/final_df.pkl")

In [3]:
# OPTIONAL: keep only the rows whose response `safe` equals 0.0 or 1.0.
# Rows with a missing response fail both comparisons and are dropped.
has_binary_label = df['safe'].eq(0.0) | df['safe'].eq(1.0)
df = df[has_binary_label]

### Preprocessing

#### Find a manageable way to represent the sunrise and sunset times

In [4]:
# Let sunrise_delta and sunset_delta be, for the day of each timebin, the
# elapsed time in minutes between midnight of that day and sunrise
# (respectively, sunset) on that day.

# normalize() truncates every timestamp to 00:00 of its own date. Rounding to
# the nearest day instead would select the *next* midnight for any sunrise at
# or after noon. Missing timestamps (NaT) stay NaT.
midnight = pd.to_datetime(df['sunrise_time']).dt.normalize()

# Dividing a timedelta by one minute yields the delta as a float of minutes.
one_minute = np.timedelta64(1, 'm')
df['sunrise_delta'] = (df['sunrise_time'] - midnight) / one_minute
df['sunset_delta'] = (df['sunset_time'] - midnight) / one_minute

In [5]:
# The raw sunrise/sunset timestamps are removed here: they are hard to compare
# with the other predictors, and NaTs can't be fed into StandardScaler.
# sunrise_delta and sunset_delta take their place as measures of how early or
# late sunrise and sunset are on each day.
df = df.drop(columns=['sunrise_time', 'sunset_time'])

#### Further Preprocessing: 
#### 1. Upsample the minority class (of 0 and 1) to match the majority class size, then resample both classes to a common target size.
#### 2. Differentiate predictors from response
#### 3. Fill predictor NAs with column means
#### 4. Standardize predictors
#### 5. Binarize categorical predictors

In [6]:
# Deal with the unbalanced response by resampling with replacement
from sklearn.utils import resample

# Split the rows by response value: the safe == 0.0 rows are treated as the
# majority class and the safe == 1.0 rows as the minority class.
# NOTE(review): which class is actually larger is not checked here -- confirm.
df_mj = df.loc[df['safe'] == 0.0]
df_mi = df.loc[df['safe'] == 1.0]

# Draw from the minority class, with replacement, until it matches the
# majority class in size
df_mi_up = resample(df_mi, replace=True, n_samples=len(df_mj), random_state=42)

# Draw each class again, with replacement, to a common target size
n_target = 1000
df_mi_re = resample(df_mi_up, replace=True, n_samples=n_target, random_state=42)
df_mj_re = resample(df_mj, replace=True, n_samples=n_target, random_state=42)

# Stack the two resampled classes, then re-sort the index so the timebins are
# ordered again
df_re = pd.concat([df_mj_re, df_mi_re]).sort_index()

# From here on `df` refers to the resampled frame
# NOTE(review): resampling happens before any train/test split, so copies of
# the same row can end up on both sides of a later split.
df = df_re

# Display new class counts
df.safe.value_counts()

0.0    1000
1.0    1000
Name: safe, dtype: int64

In [7]:
# Predictors: every column except the response
pred = df.drop(columns='safe')

# Replace missing predictor values with the mean of their column
column_means = pred.mean()
pred = pred.fillna(column_means)

# Fit the scaler on all predictor rows and standardize them in one step;
# X holds the predictors we will train on.
# NOTE(review): means and scaling statistics are computed on the full frame,
# i.e. before any train/test split.
scaler = StandardScaler()
X = scaler.fit_transform(pred)

# Response column, taken from the frame as-is (no binarizing is applied here)
y = df['safe']

### Split, Train, and Validate our Classifier

In [8]:
# Define our training and test sets
# NOTE: Because we have time series, we can't split our train and test sets randomly.
tscv = TimeSeriesSplit(max_train_size=None, n_splits=5)

clf = LogisticRegression(random_state=42)

# One list per metric; each list receives one entry per CV fold.
# fpr / tpr collect the per-fold ROC curve coordinates.
acc = []
prec = []
rec = []
roc_auc = []
fpr = []
tpr = []

# NOTE(review): if X was imputed/scaled, or the rows were resampled, on the
# full data set before this split, the folds are not independent -- confirm.
for train_index, test_index in tscv.split(X):

    # tscv.split yields *positional* integer indices. y is a pandas Series,
    # where plain [] with integers may be resolved against index labels (and
    # positional fallback is deprecated), so select rows explicitly by
    # position with .iloc.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit on this fold's training rows, then get hard predictions and the
    # predicted probability of the last class for the test rows
    clf_fit = clf.fit(X_train, y_train)
    y_score = clf_fit.predict(X_test)
    y_pred_proba = clf_fit.predict_proba(X_test)[:, -1]

    # Score this fold
    accuracy = accuracy_score(y_test, y_score)
    precision = precision_score(y_test, y_score)
    recall = recall_score(y_test, y_score)
    false_pos, true_pos, _ = roc_curve(y_test, y_pred_proba, pos_label=1.0)
    area_under_curve = auc(false_pos, true_pos)

    acc.append(accuracy)
    prec.append(precision)
    rec.append(recall)
    roc_auc.append(area_under_curve)
    fpr.append(false_pos)
    tpr.append(true_pos)



In [9]:
# Collapse each list of per-fold scores to its mean (that is, the mean CV score)
acc, prec, rec, roc_auc = (
    np.mean(fold_scores) for fold_scores in (acc, prec, rec, roc_auc)
)

def aggregate_across_CVfolds(measure):
    """Average a per-fold measure position by position.

    `measure` is loaded into a DataFrame and the mean of every column is
    returned as a list, in column order. Column means use pandas' default
    NaN-skipping, so positions missing from shorter rows are ignored rather
    than counted as zero.
    """
    per_fold = pd.DataFrame(measure)
    return [column.mean() for _, column in per_fold.items()]

# Average the per-fold ROC curves on a common false-positive-rate grid.
# Each fold's curve has its own thresholds (and number of points), so
# averaging the i-th point of every fold would mix unrelated operating points.
# Instead, interpolate every fold's TPR at the same FPR values and average
# those.
roc_grid = np.linspace(0.0, 1.0, 101)  # FPR grid in steps of 0.01
tpr_on_grid = []
for fold_fpr, fold_tpr in zip(fpr, tpr):
    fold_interp = np.interp(roc_grid, fold_fpr, fold_tpr)
    fold_interp[0] = 0.0  # every ROC curve starts at (0, 0)
    tpr_on_grid.append(fold_interp)

mean_tpr = np.mean(tpr_on_grid, axis=0)
mean_tpr[-1] = 1.0  # every ROC curve ends at (1, 1)

fpr_agg = roc_grid.tolist()
tpr_agg = mean_tpr.tolist()

print("\naccuracy:\t",acc,
      "\nprecision:\t",prec,
      "\nrecall:\t\t",rec,
      "\nROC AUC:\t",roc_auc)


accuracy:	 0.6024024024024024 
precision:	 0.47297923618835747 
recall:		 0.4557786116322702 
ROC AUC:	 0.6847032536145707


#### Plot ROC Curves:

In [10]:
# Plot the fold-aggregated ROC curve, with the mean CV AUC in the legend
import matplotlib.pyplot as plt
plt.style.use("ggplot")

fig, ax = plt.subplots()

# Aggregated ROC curve
ax.plot(fpr_agg, tpr_agg, color='darkorange',
        label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Upsampled & Binary')
ax.legend(loc="lower right")

fig.tight_layout()
fig.savefig(intro + "modeling-eric/visualizations/upsampled_binary/ROC_upsampled_binary.png")
plt.show()

<Figure size 640x480 with 1 Axes>