In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )

## Preprocessing

In [2]:
quake_frame = pd.read_csv('data/consolidated_data.csv')

quake_frame['simple_label'] = quake_frame['type'] != 'earthquake'

quake_frame.drop(['id', 'Unnamed: 0', 'place', 'time', 'updated', 'type'], inplace=True, axis=1)

## Oversampling, no imputation

Keeping the Random Forest from before, we oversample by throwing away the NA values and test various strategies to see the different results.

In [3]:
quake_frame.dropna(inplace=True)
quake_frame.isna().sum()

latitude           0
longitude          0
depth              0
mag                0
magType            0
nst                0
gap                0
dmin               0
rms                0
net                0
horizontalError    0
depthError         0
magError           0
magNst             0
status             0
locationSource     0
magSource          0
simple_label       0
dtype: int64

In [4]:
len(quake_frame)

1227408

In [5]:
quake_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,1227408.0,37.361674,4.841731,0.0,35.964167,37.573,38.817,62.030667
longitude,1227408.0,-119.557707,10.027502,-179.098,-122.701333,-120.558833,-118.150167,179.6615
depth,1227408.0,6.016756,7.92288,-3.882,1.816,4.413,7.83,211.0
mag,1227408.0,1.258097,0.694405,-2.5,0.8,1.18,1.67,5.84
nst,1227408.0,17.010182,13.671235,0.0,8.0,13.0,22.0,276.0
gap,1227408.0,121.03215,65.767724,0.0,72.0,105.0,153.0,360.0
dmin,1227408.0,0.078264,0.342578,0.0,0.01712,0.03784,0.07999,141.16
rms,1227408.0,0.097118,0.195847,0.0,0.03,0.06,0.13,64.29
horizontalError,1227408.0,0.801039,2.296862,0.0,0.27,0.41,0.72,194.5841
depthError,1227408.0,2.773763,6.903563,0.0,0.49,0.77,1.46,725.3


Alright, this changes the proportions slightly, but not too bad. If anything, one might suggest that at least the mild increase in proportion of non-earthquakes offsets the reduced dataset a little.  
Okay, so the problematic values are no longer there, that's something.  
Let's try this.  
We'll start by mixing up the data frame, then encoding all the categories numerically and splitting it sklearn style.

In [6]:
quake_frame = quake_frame.sample(frac=1, random_state=42).reset_index(drop=True)

le = LabelEncoder()

cat_columns = ['magType', 'net', 'status', 'locationSource', 'magSource']

for cat in cat_columns:
    quake_frame[cat + '_enc'] = le.fit_transform(quake_frame[cat])

In [7]:
x_cols = ['latitude',
 'longitude',
 'depth',
 'mag',
 'nst',
 'gap',
 'dmin',
 'rms',
 'horizontalError',
 'depthError',
 'magError',
 'magNst',
 'magType_enc',
 'net_enc',
 'status_enc',
 'locationSource_enc',
 'magSource_enc']

y_col = ['simple_label']

In [8]:
train_length = int(np.round(len(quake_frame.index) * 0.8))

In [9]:
train_X = quake_frame.loc[:train_length, x_cols]
train_y = quake_frame.loc[:train_length, y_col]

valid_X = quake_frame.loc[train_length:, x_cols]
valid_y = quake_frame.loc[train_length:, y_col]

## Try RandomOverSampler

In [10]:
from imblearn.over_sampling import RandomOverSampler

Using TensorFlow backend.


In [11]:
ros = RandomOverSampler(random_state=42)

In [12]:
train_X_resampled, train_y_resampled = ros.fit_resample(train_X, train_y)

In [13]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [14]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [15]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [16]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.9534187522452401
Recall:  0.914121699196326
ROC score:  0.9562393842221896
F1 score:  0.9333567786179005
Accuracy score:  0.995368295842465


## Try AdaSYN

In [17]:
from imblearn.over_sampling import ADASYN

In [18]:
ada = ADASYN(random_state=42)

In [19]:
train_X_resampled, train_y_resampled = ada.fit_resample(train_X, train_y)

In [20]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [21]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [22]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [23]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8299441786283892
Recall:  0.9559127439724454
ROC score:  0.9743537500545754
F1 score:  0.8884857539216732
Accuracy score:  0.9914861374764748


## Try SMOTE

In [24]:
from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC,
                                    KMeansSMOTE)

In [25]:
smoter = SMOTE(random_state=42)

In [26]:
train_X_resampled, train_y_resampled = smoter.fit_resample(train_X, train_y)

n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8563615725931276
Recall:  0.952812858783008
ROC score:  0.9734668926219535
F1 score:  0.9020161947720232
Accuracy score:  0.9926552659665474


## Try BorderlineSMOTE

### Kind Borderline-1

In [27]:
borsmoter = BorderlineSMOTE(random_state=42)

In [28]:
train_X_resampled, train_y_resampled = borsmoter.fit_resample(train_X, train_y)

In [29]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [30]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [31]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [32]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8567114443403103
Recall:  0.9445464982778415
ROC score:  0.9693675001483307
F1 score:  0.8984874133129471
Accuracy score:  0.9924271433343381


### Kind Borderline-2

In [41]:
borsmoter = BorderlineSMOTE(random_state=42, kind='borderline-2')

In [42]:
train_X_resampled, train_y_resampled = borsmoter.fit_resample(train_X, train_y)

In [43]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [44]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [45]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [46]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8021307506053269
Recall:  0.9508610792192882
ROC score:  0.9711162625836445
F1 score:  0.8701864985552931
Accuracy score:  0.9899340888537652


## Try SVMSMOTE

In [33]:
svmsmoter = SVMSMOTE(random_state=42)

In [34]:
train_X_resampled, train_y_resampled = svmsmoter.fit_resample(train_X, train_y)

In [35]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [36]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [37]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [38]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)

Precision:  0.8472492572482327
Recall:  0.9494833524684271
ROC score:  0.971593077582346
F1 score:  0.8954577445725732
Accuracy score:  0.992133842807212


## Try KMeansSMOTE

In [39]:
kmsmoter = KMeansSMOTE(random_state=42, kmeans_estimator=16, cluster_balance_threshold=0.4)

In [40]:
train_X_resampled, train_y_resampled = kmsmoter.fit_resample(train_X, train_y)

RuntimeError: No clusters found with sufficient samples of class True. Try lowering the cluster_balance_threshold or increasing the number of clusters.

In [None]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [None]:
rfc.fit(train_X_resampled, np.ravel(train_y_resampled))

In [None]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [None]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)