In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )

## Preprocessing

In [2]:
quake_frame = pd.read_csv('data/consolidated_data.csv')

quake_frame['simple_label'] = quake_frame['type'] != 'earthquake'

quake_frame.drop(['id', 'Unnamed: 0', 'place', 'time', 'updated', 'type'], inplace=True, axis=1)

## Undersampling, no imputation

Keeping the Random Forest from before, we undersample by throwing away the NA values and undersample by various strategies to see the different results.

In [3]:
quake_frame.dropna(inplace=True)
quake_frame.isna().sum()

latitude           0
longitude          0
depth              0
mag                0
magType            0
nst                0
gap                0
dmin               0
rms                0
net                0
horizontalError    0
depthError         0
magError           0
magNst             0
status             0
locationSource     0
magSource          0
simple_label       0
dtype: int64

In [4]:
len(quake_frame)

1227408

In [5]:
quake_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,1227408.0,37.361674,4.841731,0.0,35.964167,37.573,38.817,62.030667
longitude,1227408.0,-119.557707,10.027502,-179.098,-122.701333,-120.558833,-118.150167,179.6615
depth,1227408.0,6.016756,7.92288,-3.882,1.816,4.413,7.83,211.0
mag,1227408.0,1.258097,0.694405,-2.5,0.8,1.18,1.67,5.84
nst,1227408.0,17.010182,13.671235,0.0,8.0,13.0,22.0,276.0
gap,1227408.0,121.03215,65.767724,0.0,72.0,105.0,153.0,360.0
dmin,1227408.0,0.078264,0.342578,0.0,0.01712,0.03784,0.07999,141.16
rms,1227408.0,0.097118,0.195847,0.0,0.03,0.06,0.13,64.29
horizontalError,1227408.0,0.801039,2.296862,0.0,0.27,0.41,0.72,194.5841
depthError,1227408.0,2.773763,6.903563,0.0,0.49,0.77,1.46,725.3


Alright, this changes the proportions slightly, but not too bad. If anything, one might suggest that at least the mild increase in proportion of non-earthquakes offsets the reduced dataset a little.  
Okay, so the problematic values are no longer there, that's something.  
Let's try this.  
We'll start by mixing up the data frame, then encoding all the categories numerically and splitting it sklearn style.

In [6]:
quake_frame = quake_frame.sample(frac=1, random_state=42).reset_index(drop=True)

le = LabelEncoder()

cat_columns = ['magType', 'net', 'status', 'locationSource', 'magSource']

for cat in cat_columns:
    quake_frame[cat + '_enc'] = le.fit_transform(quake_frame[cat])

In [7]:
x_cols = ['latitude',
 'longitude',
 'depth',
 'mag',
 'nst',
 'gap',
 'dmin',
 'rms',
 'horizontalError',
 'depthError',
 'magError',
 'magNst',
 'magType_enc',
 'net_enc',
 'status_enc',
 'locationSource_enc',
 'magSource_enc']

y_col = ['simple_label']

In [8]:
train_length = int(np.round(len(quake_frame.index) * 0.8))

In [9]:
train_X = quake_frame.loc[:train_length, x_cols]
train_y = quake_frame.loc[:train_length, y_col]

valid_X = quake_frame.loc[train_length:, x_cols]
valid_y = quake_frame.loc[train_length:, y_col]

## Try RandomUnderSampler (Controlled Undersampling)

In [11]:
from imblearn.under_sampling import RandomUnderSampler

In [12]:
rus = RandomUnderSampler(random_state=42)

In [13]:
train_X_resampled, train_y_resampled = rus.fit_resample(train_X, train_y)

In [14]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [15]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [16]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [17]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.6553233983714857
Recall:  0.9794489092996556
ROC score:  0.9802490943876346
F1 score:  0.7852540500736376


## Try NearMiss Version 1, 2 and 3 (Controlled Undersampling)

In [11]:
from imblearn.under_sampling import NearMiss

### Version 1

In [18]:
nemi1 = NearMiss(version=1)

In [19]:
train_X_resampled, train_y_resampled = nemi1.fit_resample(train_X, train_y)

In [20]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [21]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [22]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [23]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.07178685482118076
Recall:  0.9858783008036739
ROC score:  0.7584709700426729
F1 score:  0.13382893834548967


### Version 2

In [11]:
nemi2 = NearMiss(version=2)

In [None]:
# train_X_resampled, train_y_resampled = nemi2.fit_resample(train_X, train_y)

# n_estim = 100

# rfc = RandomForestClassifier(n_estimators=n_estim,
#                              random_state=42)

# rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

# preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

# prec = precision_score(valid_y, preds)
# reca = recall_score(valid_y, preds)
# roc = roc_auc_score(valid_y, preds)
# f1 = f1_score(valid_y, preds)
# conf_mat = confusion_matrix(valid_y, preds)

# print("Precision: ", prec)
# print("Recall: ", reca)
# print("ROC score: ", roc)
# print("F1 score: ", f1)

Very interesting. The kernel keeps dying for no apparent reason when running this.

### Version 3

In [12]:
nemi3 = NearMiss(version=3)

In [13]:
train_X_resampled, train_y_resampled = nemi3.fit_resample(train_X, train_y)

In [14]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [15]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [16]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [17]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.83841059602649
Recall:  0.9447761194029851
ROC score:  0.9690388461120479
F1 score:  0.8884210526315789


## Try Tomek's Links (Cleaning Undersampling)

In [18]:
from imblearn.under_sampling import TomekLinks

In [19]:
toli = TomekLinks()

In [20]:
train_X_resampled, train_y_resampled = toli.fit_resample(train_X, train_y)

In [21]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [22]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [23]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [24]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.9612611511670537
Recall:  0.9030998851894374
ROC score:  0.9508805222240667
F1 score:  0.9312733084709643


## Try EditedNearestNeighbours (Cleaning Undersampling)

In [10]:
from imblearn.under_sampling import EditedNearestNeighbours

In [11]:
enn = EditedNearestNeighbours()

In [12]:
train_X_resampled, train_y_resampled = enn.fit_resample(train_X, train_y)

In [13]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [14]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [15]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [16]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.9120779515069114
Recall:  0.9242250287026407
ROC score:  0.9604738070717432
F1 score:  0.9181113138686131


## Try RepeatedEditedNearestNeighbours (Cleaning Undersampling)

In [17]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

In [20]:
renn = RepeatedEditedNearestNeighbours()

In [21]:
train_X_resampled, train_y_resampled = renn.fit_resample(train_X, train_y)

In [22]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [23]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [24]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [25]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.8854052873060957
Recall:  0.9305396096440872
ROC score:  0.9630545935639557
F1 score:  0.907411553963278


## Try AllKNN (Cleaning Undersampling)

In [26]:
from imblearn.under_sampling import AllKNN

In [27]:
allknn = AllKNN()

In [28]:
train_X_resampled, train_y_resampled = allknn.fit_resample(train_X, train_y)

In [29]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [30]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [31]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
    )

In [32]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.8995321897972822
Recall:  0.9272101033295063
ROC score:  0.9617002656258635
F1 score:  0.9131614654002714


## Try CondensedNearestNeighbour (Cleaning Undersampling)

In [24]:
from imblearn.under_sampling import CondensedNearestNeighbour

In [25]:
connn = CondensedNearestNeighbour(random_state=42)

In [26]:
train_X_resampled, train_y_resampled = connn.fit_resample(train_X, train_y)

KeyboardInterrupt: 

In [None]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [None]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

In [None]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
    )

In [None]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

## Try OneSidedSelection (Cleaning Undersampling)

In [10]:
from imblearn.under_sampling import OneSidedSelection

In [11]:
oness = OneSidedSelection(random_state=42)

In [12]:
train_X_resampled, train_y_resampled = oness.fit_resample(train_X, train_y)

In [13]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [14]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [15]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
    )

In [16]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.9603960396039604
Recall:  0.9020665901262916
ROC score:  0.9503490925391987
F1 score:  0.9303179207862176


## Try NeighbourhoodCleaningRule (Cleaning Undersampling)

In [17]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

In [18]:
ncr = NeighbourhoodCleaningRule()

In [19]:
train_X_resampled, train_y_resampled = ncr.fit_resample(train_X, train_y)

In [20]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [21]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [22]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
    )

In [23]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.9199220987512888
Recall:  0.9219288174512055
ROC score:  0.9594883051322725
F1 score:  0.9209243649291816
