In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('../Data/imputedWQ.csv', parse_dates=True, header=0, index_col=0)

In [6]:
# Separate input features (X) and target variable (y)
y = df.EVENT
X = df.drop('EVENT', axis=1)
 
# Train model
clf_0 = LogisticRegression().fit(X, y)
 
# Predict on training set
pred_y_0 = clf_0.predict(X)

In [7]:
# How's the accuracy?
print( accuracy_score(pred_y_0, y) )
# 0.9216

0.9908573721393462


In [10]:
# Should we be excited?
print( np.unique( pred_y_0 ) )
# [0]

[False  True]


##  Up-sample Minority Class
Up-sampling is the process of randomly duplicating observations from the minority class in order to reinforce its signal.

There are several heuristics for doing so, but the most common way is to simply resample with replacement.

In [11]:
from sklearn.utils import resample

In [24]:
# Separate majority and minority classes
df_majority = df[df.EVENT==0]
df_minority = df[df.EVENT==1]
df_minority.count()

Tp       1726
Cl       1726
pH       1726
Redox    1726
Leit     1726
Trueb    1726
Cl_2     1726
Fm       1726
Fm_2     1726
EVENT    1726
dtype: int64

In [20]:
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=137000,    # to match majority class
                                 random_state=123) # reproducible results
 


In [21]:
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.EVENT.value_counts()

False    137840
True     137000
Name: EVENT, dtype: int64

In [23]:
# Separate input features (X) and target variable (y)
y = df_upsampled.EVENT
X = df_upsampled.drop('EVENT', axis=1)
 
# Train model
clf_1 = LogisticRegression().fit(X, y)
 
# Predict on training set
pred_y_1 = clf_1.predict(X)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_1 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y, pred_y_1) )
# 0.513888888889

[False  True]
0.8786020957648086


## Down-sample Majority Class
Down-sampling involves randomly removing observations from the majority class to prevent its signal from dominating the learning algorithm.

The most common heuristic for doing so is resampling without replacement.



In [26]:
# Separate majority and minority classes
df_majority = df[df.EVENT==0]
df_minority = df[df.EVENT==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=1726,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled.EVENT.value_counts()
# 1    49
# 0    49
# Name: balance, dtype: int64

True     1726
False    1726
Name: EVENT, dtype: int64

In [27]:
# Separate input features (X) and target variable (y)
y = df_downsampled.EVENT
X = df_downsampled.drop('EVENT', axis=1)
 
# Train model
clf_2 = LogisticRegression().fit(X, y)
 
# Predict on training set
pred_y_2 = clf_2.predict(X)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_2 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y, pred_y_2) )
# 0.581632653061

[False  True]
0.8600811123986095


In [28]:
from sklearn.metrics import roc_auc_score

In [29]:
# Predict class probabilities
prob_y_2 = clf_2.predict_proba(X)
 
# Keep only the positive class
prob_y_2 = [p[1] for p in prob_y_2]
 
prob_y_2[:5]

[0.4966399732186486,
 0.5181934773275377,
 0.29142641137109915,
 0.3635506667245564,
 0.729848563263724]

In [30]:

print( roc_auc_score(y, prob_y_2) )

0.9043089199469903


In [31]:
#AUROC of model trained on imbalanced dataset
prob_y_0 = clf_0.predict_proba(X)
prob_y_0 = [p[1] for p in prob_y_0]
 
print( roc_auc_score(y, prob_y_0) )

0.9029477596409088
