In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import (
    LabelEncoder, MinMaxScaler
    )
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )

## Preprocessing

In [20]:
quake_frame = pd.read_csv('data/consolidated_data.csv')

quake_frame['simple_label'] = quake_frame['type'] != 'earthquake'

quake_frame.drop(['id', 'Unnamed: 0', 'place', 'time', 'updated', 'type'], inplace=True, axis=1)

## Oversampling, no imputation

Keeping the Random Forest from before, we oversample by throwing away the NA values and test various strategies to see the different results.

In [21]:
quake_frame.dropna(inplace=True)
quake_frame.isna().sum()

latitude           0
longitude          0
depth              0
mag                0
magType            0
nst                0
gap                0
dmin               0
rms                0
net                0
horizontalError    0
depthError         0
magError           0
magNst             0
status             0
locationSource     0
magSource          0
simple_label       0
dtype: int64

In [22]:
len(quake_frame)

1227408

In [23]:
quake_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,1227408.0,37.361674,4.841731,0.0,35.964167,37.573,38.817,62.030667
longitude,1227408.0,-119.557707,10.027502,-179.098,-122.701333,-120.558833,-118.150167,179.6615
depth,1227408.0,6.016756,7.92288,-3.882,1.816,4.413,7.83,211.0
mag,1227408.0,1.258097,0.694405,-2.5,0.8,1.18,1.67,5.84
nst,1227408.0,17.010182,13.671235,0.0,8.0,13.0,22.0,276.0
gap,1227408.0,121.03215,65.767724,0.0,72.0,105.0,153.0,360.0
dmin,1227408.0,0.078264,0.342578,0.0,0.01712,0.03784,0.07999,141.16
rms,1227408.0,0.097118,0.195847,0.0,0.03,0.06,0.13,64.29
horizontalError,1227408.0,0.801039,2.296862,0.0,0.27,0.41,0.72,194.5841
depthError,1227408.0,2.773763,6.903563,0.0,0.49,0.77,1.46,725.3


Alright, this changes the proportions slightly, but not too bad. If anything, one might suggest that at least the mild increase in proportion of non-earthquakes offsets the reduced dataset a little.  
Okay, so the problematic values are no longer there, that's something.  
Let's try this.  
We'll start by mixing up the data frame, then encoding all the categories numerically and splitting it sklearn style.

In [24]:
quake_frame = quake_frame.sample(frac=1, random_state=42).reset_index(drop=True)

le = LabelEncoder()

cat_columns = ['magType', 'net', 'status', 'locationSource', 'magSource']

for cat in cat_columns:
    quake_frame = pd.concat([quake_frame,
                             pd.get_dummies(quake_frame[cat], prefix=cat)],
                            axis=1)

scale_cols = ['latitude', 'longitude', 'depth', 'mag', 'nst', 'gap', 'dmin', 'rms', 'horizontalError',
 'depthError', 'magError', 'magNst']

scaler = MinMaxScaler()

quake_frame[scale_cols] = scaler.fit_transform(quake_frame[scale_cols])

x_cols = ['latitude', 'longitude', 'depth', 'mag', 'nst', 'gap', 'dmin', 'rms', 'horizontalError', 'depthError',
 'magError', 'magNst', 'magType_Mb', 'magType_Md', 'magType_Ml', 'magType_Unknown', 'magType_ma', 'magType_mb',
 'magType_mc', 'magType_md', 'magType_me', 'magType_mh', 'magType_ml', 'magType_mlg', 'magType_mlr', 'magType_mw',
 'net_av', 'net_ci', 'net_hv', 'net_ismpkansas', 'net_ld', 'net_mb', 'net_nc', 'net_nm', 'net_nn', 'net_pr',
 'net_se', 'net_uu', 'net_uw', 'status_automatic', 'status_manual', 'status_reviewed', 'locationSource_av',
 'locationSource_ci', 'locationSource_hv', 'locationSource_ismp', 'locationSource_ld', 'locationSource_mb',
 'locationSource_nc', 'locationSource_nm', 'locationSource_nn', 'locationSource_pr', 'locationSource_se',
 'locationSource_uu', 'locationSource_uw', 'magSource_av', 'magSource_ci', 'magSource_hv', 'magSource_ismp',
 'magSource_ld', 'magSource_mb', 'magSource_nc', 'magSource_nm', 'magSource_nn', 'magSource_pr', 'magSource_se',
 'magSource_uu', 'magSource_uw']

y_col = ['simple_label']

In [25]:
train_length = int(np.round(len(quake_frame.index) * 0.8))

train_X = quake_frame.loc[:train_length, x_cols]
train_y = quake_frame.loc[:train_length, y_col]

valid_X = quake_frame.loc[train_length:, x_cols]
valid_y = quake_frame.loc[train_length:, y_col]

## Try RandomOverSampler

In [26]:
from imblearn.over_sampling import RandomOverSampler

In [27]:
ros = RandomOverSampler(random_state=42)

train_X_resampled, train_y_resampled = ros.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9524906071991274
Recall:  0.9022962112514351
ROC score:  0.950320305041189
F1 score:  0.926714226755498
Accuracy score:  0.9949364922886403


## Try AdaSYN

In [28]:
from imblearn.over_sampling import ADASYN

In [29]:
ada = ADASYN(random_state=42)

train_X_resampled, train_y_resampled = ada.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8308323329331733
Recall:  0.9535017221584385
ROC score:  0.9731799151903473
F1 score:  0.8879503902491179
Accuracy score:  0.991461695765881


## Try SMOTE

In [30]:
from imblearn.over_sampling import (
    SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, KMeansSMOTE
    )

In [31]:
smoter = SMOTE(random_state=42)

train_X_resampled, train_y_resampled = smoter.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8547220500103327
Recall:  0.9497129735935707
ROC score:  0.9718873857206446
F1 score:  0.8997172068740483
Accuracy score:  0.9924882476108228


## Try BorderlineSMOTE

### Kind Borderline-1

In [32]:
borsmoter = BorderlineSMOTE(random_state=42)

train_X_resampled, train_y_resampled = borsmoter.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.854806790959275
Recall:  0.9422502870264065
ROC score:  0.9681813832712828
F1 score:  0.8964010703948446
Accuracy score:  0.9922723458339104


### Kind Borderline-2

In [33]:
borsmoter = BorderlineSMOTE(random_state=42, kind='borderline-2')

train_X_resampled, train_y_resampled = borsmoter.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8153299792838118
Recall:  0.9489092996555684
ROC score:  0.9705014796894232
F1 score:  0.8770626624927044
Accuracy score:  0.9905614260923408


## Try SVMSMOTE

In [34]:
svmsmoter = SVMSMOTE(random_state=42)

train_X_resampled, train_y_resampled = svmsmoter.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8433316300459888
Recall:  0.9474167623421355
ROC score:  0.9704710895994292
F1 score:  0.8923492835901595
Accuracy score:  0.9918894257012734


## Try KMeansSMOTE

In [35]:
kmsmoter = KMeansSMOTE(random_state=42, kmeans_estimator=16, cluster_balance_threshold=0.4)

train_X_resampled, train_y_resampled = kmsmoter.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

RuntimeError: No clusters found with sufficient samples of class True. Try lowering the cluster_balance_threshold or increasing the number of clusters.