In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import (
    LabelEncoder, MinMaxScaler
    )
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )

## Preprocessing

In [2]:
quake_frame = pd.read_csv('data/consolidated_data.csv')

quake_frame['simple_label'] = quake_frame['type'] != 'earthquake'

quake_frame.drop(['id', 'Unnamed: 0', 'place', 'time', 'updated', 'type'], inplace=True, axis=1)

## Undersampling, no imputation

Keeping the Random Forest from before, we undersample by throwing away the NA values and undersample by various strategies to see the different results.

In [3]:
quake_frame.dropna(inplace=True)
quake_frame.isna().sum()

latitude           0
longitude          0
depth              0
mag                0
magType            0
nst                0
gap                0
dmin               0
rms                0
net                0
horizontalError    0
depthError         0
magError           0
magNst             0
status             0
locationSource     0
magSource          0
simple_label       0
dtype: int64

In [4]:
len(quake_frame)

1227408

In [5]:
quake_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,1227408.0,37.361674,4.841731,0.0,35.964167,37.573,38.817,62.030667
longitude,1227408.0,-119.557707,10.027502,-179.098,-122.701333,-120.558833,-118.150167,179.6615
depth,1227408.0,6.016756,7.92288,-3.882,1.816,4.413,7.83,211.0
mag,1227408.0,1.258097,0.694405,-2.5,0.8,1.18,1.67,5.84
nst,1227408.0,17.010182,13.671235,0.0,8.0,13.0,22.0,276.0
gap,1227408.0,121.03215,65.767724,0.0,72.0,105.0,153.0,360.0
dmin,1227408.0,0.078264,0.342578,0.0,0.01712,0.03784,0.07999,141.16
rms,1227408.0,0.097118,0.195847,0.0,0.03,0.06,0.13,64.29
horizontalError,1227408.0,0.801039,2.296862,0.0,0.27,0.41,0.72,194.5841
depthError,1227408.0,2.773763,6.903563,0.0,0.49,0.77,1.46,725.3


Alright, this changes the proportions slightly, but not too bad. If anything, one might suggest that at least the mild increase in proportion of non-earthquakes offsets the reduced dataset a little.  
Okay, so the problematic values are no longer there, that's something.  
Let's try this.  
We'll start by mixing up the data frame, then encoding all the categories numerically and splitting it sklearn style.

In [6]:
quake_frame = quake_frame.sample(frac=1, random_state=42).reset_index(drop=True)

cat_columns = ['magType', 'net', 'status', 'locationSource', 'magSource']

for cat in cat_columns:
    quake_frame = pd.concat([quake_frame,
                             pd.get_dummies(quake_frame[cat], prefix=cat)],
                            axis=1)

scale_cols = ['latitude', 'longitude', 'depth', 'mag', 'nst', 'gap', 'dmin', 'rms', 'horizontalError',
 'depthError', 'magError', 'magNst']

scaler = MinMaxScaler()

quake_frame[scale_cols] = scaler.fit_transform(quake_frame[scale_cols])

x_cols = ['latitude', 'longitude', 'depth', 'mag', 'nst', 'gap', 'dmin', 'rms', 'horizontalError', 'depthError',
 'magError', 'magNst', 'magType_Mb', 'magType_Md', 'magType_Ml', 'magType_Unknown', 'magType_ma', 'magType_mb',
 'magType_mc', 'magType_md', 'magType_me', 'magType_mh', 'magType_ml', 'magType_mlg', 'magType_mlr', 'magType_mw',
 'net_av', 'net_ci', 'net_hv', 'net_ismpkansas', 'net_ld', 'net_mb', 'net_nc', 'net_nm', 'net_nn', 'net_pr',
 'net_se', 'net_uu', 'net_uw', 'status_automatic', 'status_manual', 'status_reviewed', 'locationSource_av',
 'locationSource_ci', 'locationSource_hv', 'locationSource_ismp', 'locationSource_ld', 'locationSource_mb',
 'locationSource_nc', 'locationSource_nm', 'locationSource_nn', 'locationSource_pr', 'locationSource_se',
 'locationSource_uu', 'locationSource_uw', 'magSource_av', 'magSource_ci', 'magSource_hv', 'magSource_ismp',
 'magSource_ld', 'magSource_mb', 'magSource_nc', 'magSource_nm', 'magSource_nn', 'magSource_pr', 'magSource_se',
 'magSource_uu', 'magSource_uw']

y_col = ['simple_label']

In [7]:
train_length = int(np.round(len(quake_frame.index) * 0.8))

In [8]:
train_X = quake_frame.loc[:train_length, x_cols]
train_y = quake_frame.loc[:train_length, y_col]

valid_X = quake_frame.loc[train_length:, x_cols]
valid_y = quake_frame.loc[train_length:, y_col]

## Try RandomUnderSampler (Controlled Undersampling)

In [9]:
from imblearn.under_sampling import RandomUnderSampler

In [10]:
rus = RandomUnderSampler(random_state=42)

train_X_resampled, train_y_resampled = rus.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [11]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.6320670722659149
Recall:  0.9780711825487944
ROC score:  0.9785634915328738
F1 score:  0.7678925545339823
Accuracy score:  0.9790208650736103


## Try NearMiss Version 1, 2 and 3 (Controlled Undersampling)

In [9]:
from imblearn.under_sampling import NearMiss

### Version 1

In [13]:
nemi1 = NearMiss(version=1)

train_X_resampled, train_y_resampled = nemi1.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.08719435367390213
Recall:  0.9772675086107921
ROC score:  0.8004590545942816
F1 score:  0.16010382672974016
Accuracy score:  0.6361973586658085


### Version 2

In [14]:
# nemi2 = NearMiss(version=2)

# train_X_resampled, train_y_resampled = nemi2.fit_resample(train_X, train_y)

# n_estim = 100

# rfc = RandomForestClassifier(n_estimators=n_estim,
#                              random_state=42)

# rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

# preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

# prec = precision_score(valid_y, preds)
# reca = recall_score(valid_y, preds)
# roc = roc_auc_score(valid_y, preds)
# f1 = f1_score(valid_y, preds)
# acc = accuracy_score(valid_y, preds)
# conf_mat = confusion_matrix(valid_y, preds)

# print("Precision: ", prec)
# print("Recall: ", reca)
# print("ROC score: ", roc)
# print("F1 score: ", f1)
# print("Accuracy score: ", acc)

Very interesting. The kernel keeps dying for no apparent reason when running this.

### Version 3

In [10]:
nemi3 = NearMiss(version=3)

train_X_resampled, train_y_resampled = nemi3.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8737427776588915
Recall:  0.9375430539609644
ROC score:  0.9662796782821564
F1 score:  0.9045192733717324
Accuracy score:  0.9929770818226998


## Try Tomek's Links (Cleaning Undersampling)

In [11]:
from imblearn.under_sampling import TomekLinks

In [12]:
toli = TomekLinks()

train_X_resampled, train_y_resampled = toli.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.962578222778473
Recall:  0.88300803673938
ROC score:  0.9408726092503685
F1 score:  0.9210778443113773
Accuracy score:  0.9946309709062171


## Try EditedNearestNeighbours (Cleaning Undersampling)

In [13]:
from imblearn.under_sampling import EditedNearestNeighbours

In [14]:
enn = EditedNearestNeighbours()

train_X_resampled, train_y_resampled = enn.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9167342211928199
Recall:  0.9088404133180252
ROC score:  0.9529018683419819
F1 score:  0.9127702507927355
Accuracy score:  0.9938366153119169


## Try RepeatedEditedNearestNeighbours (Cleaning Undersampling)

In [9]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

In [None]:
# renn = RepeatedEditedNearestNeighbours()

# train_X_resampled, train_y_resampled = renn.fit_resample(train_X, train_y)

# n_estim = 100

# rfc = RandomForestClassifier(n_estimators=n_estim,
#                              random_state=42)

# rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

# preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

# prec = precision_score(valid_y, preds)
# reca = recall_score(valid_y, preds)
# roc = roc_auc_score(valid_y, preds)
# f1 = f1_score(valid_y, preds)
# acc = accuracy_score(valid_y, preds)
# conf_mat = confusion_matrix(valid_y, preds)

# print("Precision: ", prec)
# print("Recall: ", reca)
# print("ROC score: ", roc)
# print("F1 score: ", f1)
# print("Accuracy score: ", acc)

This takes very, very long to run. I gave up eventually, I've got things to do and people to see.

## Try AllKNN (Cleaning Undersampling)

In [9]:
from imblearn.under_sampling import AllKNN

Using TensorFlow backend.


In [10]:
allknn = AllKNN()

train_X_resampled, train_y_resampled = allknn.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [11]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9004637484447461
Recall:  0.9140068886337543
ROC score:  0.9551451164740579
F1 score:  0.9071847757962509
Accuracy score:  0.9933640755737692


## Try CondensedNearestNeighbour (Cleaning Undersampling)

In [None]:
from imblearn.under_sampling import CondensedNearestNeighbour

In [None]:
# connn = CondensedNearestNeighbour(random_state=42)

# train_X_resampled, train_y_resampled = connn.fit_resample(train_X, train_y)

# n_estim = 100

# rfc = RandomForestClassifier(n_estimators=n_estim,
#                              random_state=42)

# rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

# preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

# prec = precision_score(valid_y, preds)
# reca = recall_score(valid_y, preds)
# roc = roc_auc_score(valid_y, preds)
# f1 = f1_score(valid_y, preds)
# conf_mat = confusion_matrix(valid_y, preds)

# print("Precision: ", prec)
# print("Recall: ", reca)
# print("ROC score: ", roc)
# print("F1 score: ", f1)

This takes very, very long to run. Yes, I ran out of patience eventually.

## Try OneSidedSelection (Cleaning Undersampling)

In [12]:
from imblearn.under_sampling import OneSidedSelection

In [13]:
oness = OneSidedSelection(random_state=42)

train_X_resampled, train_y_resampled = oness.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9626343414146463
Recall:  0.8843857634902411
ROC score:  0.9415614726257991
F1 score:  0.9218525610339875
Accuracy score:  0.9946798543274049


## Try NeighbourhoodCleaningRule (Cleaning Undersampling)

In [14]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

In [15]:
ncr = NeighbourhoodCleaningRule()

train_X_resampled, train_y_resampled = ncr.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9239588207767899
Recall:  0.9067738231917336
ROC score:  0.9520142830756027
F1 score:  0.9152856646193069
Accuracy score:  0.9940443698519648
