In [1]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from numpy import dstack
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
    )
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


## Preprocessing

In [2]:
quake_frame = pd.read_csv('data/consolidated_data.csv')

quake_frame['simple_label'] = quake_frame['type'] != 'earthquake'

quake_frame.drop(['id', 'Unnamed: 0', 'place', 'time', 'updated', 'type'], inplace=True, axis=1)

## Simple model, no imputation

We'll start this off with a simple model, the same architecture as the discriminator in the GAN examples. We'll use the same data structure as for the majority of the other examples, scaled and one-hot encoded.

In [3]:
quake_frame.dropna(inplace=True)
quake_frame.isna().sum()

latitude           0
longitude          0
depth              0
mag                0
magType            0
nst                0
gap                0
dmin               0
rms                0
net                0
horizontalError    0
depthError         0
magError           0
magNst             0
status             0
locationSource     0
magSource          0
simple_label       0
dtype: int64

In [4]:
len(quake_frame)

1227408

In [5]:
quake_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,1227408.0,37.361674,4.841731,0.0,35.964167,37.573,38.817,62.030667
longitude,1227408.0,-119.557707,10.027502,-179.098,-122.701333,-120.558833,-118.150167,179.6615
depth,1227408.0,6.016756,7.92288,-3.882,1.816,4.413,7.83,211.0
mag,1227408.0,1.258097,0.694405,-2.5,0.8,1.18,1.67,5.84
nst,1227408.0,17.010182,13.671235,0.0,8.0,13.0,22.0,276.0
gap,1227408.0,121.03215,65.767724,0.0,72.0,105.0,153.0,360.0
dmin,1227408.0,0.078264,0.342578,0.0,0.01712,0.03784,0.07999,141.16
rms,1227408.0,0.097118,0.195847,0.0,0.03,0.06,0.13,64.29
horizontalError,1227408.0,0.801039,2.296862,0.0,0.27,0.41,0.72,194.5841
depthError,1227408.0,2.773763,6.903563,0.0,0.49,0.77,1.46,725.3


In [6]:
quake_frame = quake_frame.sample(frac=1, random_state=42).reset_index(drop=True)

cat_columns = ['magType', 'net', 'status', 'locationSource', 'magSource']

for cat in cat_columns:
    quake_frame = pd.concat([quake_frame,
                             pd.get_dummies(quake_frame[cat], prefix=cat)],
                            axis=1)

scale_cols = ['latitude', 'longitude', 'depth', 'mag', 'nst', 'gap', 'dmin', 'rms', 'horizontalError',
 'depthError', 'magError', 'magNst']

scaler = MinMaxScaler()

quake_frame[scale_cols] = scaler.fit_transform(quake_frame[scale_cols])

x_cols = ['latitude', 'longitude', 'depth', 'mag', 'nst', 'gap', 'dmin', 'rms', 'horizontalError', 'depthError',
 'magError', 'magNst', 'magType_Mb', 'magType_Md', 'magType_Ml', 'magType_Unknown', 'magType_ma', 'magType_mb',
 'magType_mc', 'magType_md', 'magType_me', 'magType_mh', 'magType_ml', 'magType_mlg', 'magType_mlr', 'magType_mw',
 'net_av', 'net_ci', 'net_hv', 'net_ismpkansas', 'net_ld', 'net_mb', 'net_nc', 'net_nm', 'net_nn', 'net_pr',
 'net_se', 'net_uu', 'net_uw', 'status_automatic', 'status_manual', 'status_reviewed', 'locationSource_av',
 'locationSource_ci', 'locationSource_hv', 'locationSource_ismp', 'locationSource_ld', 'locationSource_mb',
 'locationSource_nc', 'locationSource_nm', 'locationSource_nn', 'locationSource_pr', 'locationSource_se',
 'locationSource_uu', 'locationSource_uw', 'magSource_av', 'magSource_ci', 'magSource_hv', 'magSource_ismp',
 'magSource_ld', 'magSource_mb', 'magSource_nc', 'magSource_nm', 'magSource_nn', 'magSource_pr', 'magSource_se',
 'magSource_uu', 'magSource_uw']

y_col = ['simple_label']

We'll split 80/20 and keep the validation set to compare the model performance with the modified data used in the training epochs.

In [7]:
train_length = int(np.round(len(quake_frame.index) * 0.8))

In [8]:
train_X = quake_frame.loc[:train_length, x_cols]
train_y = quake_frame.loc[:train_length, y_col]

valid_X = quake_frame.loc[train_length:, x_cols]
valid_y = quake_frame.loc[train_length:, y_col]

### Try a Multilayer Perceptron undersampled

We need to build a few custom metrics to be able to get recall, precision and f1-score as part of our validation.

In [9]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X, Y = rus.fit_resample(train_X.values, train_y.values)

In [10]:
# Build custom metrics

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

We'll do 5-fold cross validation on all data and take the average of the validation to get our results.  
We will keep both the cross validation results and validation results to be able to do a comparison as well.

In [11]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metrics=['accuracy', precision_m, recall_m, f1_m]

cvscores = []
val_scores = []
for train, test in kfold.split(X, Y):
  # create model
    model = Sequential()
    model.add(Dense(768, input_dim=len(x_cols), activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(384, activation='relu')) 
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
    # Fit the model
    model.fit(X[train], Y[train], epochs=50, batch_size=200, verbose=15)
    # evaluate the model
    scores = model.evaluate(X[test], Y[test], verbose=15)
    scores_val = model.evaluate(valid_X, valid_y, verbose=15)
    print("%s: %.2f%%" % (model.metrics_names[0], scores[0]*100))
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    print("%s: %.2f%%" % (model.metrics_names[2], scores[2]*100))
    print("%s: %.2f%%" % (model.metrics_names[3], scores[3]*100))
    print("%s: %.2f%%" % (model.metrics_names[4], scores[4]*100))    
    print("%s: %.2f%%" % (model.metrics_names[0], scores_val[0]*100))
    print("%s: %.2f%%" % (model.metrics_names[1], scores_val[1]*100))
    print("%s: %.2f%%" % (model.metrics_names[2], scores_val[2]*100))
    print("%s: %.2f%%" % (model.metrics_names[3], scores_val[3]*100))
    print("%s: %.2f%%" % (model.metrics_names[4], scores_val[4]*100))    
    cvscores.append(scores)
    val_scores.append(scores_val)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
loss: 14.19%
accuracy: 94.83%
precision_m: 50.23%
recall_m: 47.84%
f1_m: 48.98%
loss: 16.31%
accuracy: 94.12%
precision_m: 35.36%
recall_m: 64.87%
f1_m: 43.51%
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7

In [12]:
cvscore_frame = pd.DataFrame.from_records(cvscores, columns=['loss', 'accuracy', 'precision', 'recall', 'f1_score'])

cvscore_frame.mean()

loss         0.153580
accuracy     0.945075
precision    0.501905
recall       0.474165
f1_score     0.487337
dtype: float64

In [13]:
val_score_frame = pd.DataFrame.from_records(cvscores, columns=['loss', 'accuracy', 'precision', 'recall', 'f1_score'])

val_score_frame.mean()

loss         0.153580
accuracy     0.945075
precision    0.501905
recall       0.474165
f1_score     0.487337
dtype: float64

## Try a Multilayer Perceptron oversampled

In [14]:
from imblearn.over_sampling import SMOTE

smoter = SMOTE(random_state=42)

X, Y = smoter.fit_resample(train_X.values, train_y.values)

We'll do 5-fold cross validation on all data and take the average of the validation to get our results.  
We will keep both the cross validation results and validation results to be able to do a comparison as well.

In [15]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metrics=['accuracy', precision_m, recall_m, f1_m]

cvscores = []
val_scores = []
for train, test in kfold.split(X, Y):
  # create model
    model = Sequential()
    model.add(Dense(768, input_dim=len(x_cols), activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(384, activation='relu')) 
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
    # Fit the model
    model.fit(X[train], Y[train], epochs=45, batch_size=20, verbose=15)
    # evaluate the model
    scores = model.evaluate(X[test], Y[test], verbose=15)
    scores_val = model.evaluate(valid_X, valid_y, verbose=15)
    print("%s: %.2f%%" % (model.metrics_names[0], scores[0]*100))
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    print("%s: %.2f%%" % (model.metrics_names[2], scores[2]*100))
    print("%s: %.2f%%" % (model.metrics_names[3], scores[3]*100))
    print("%s: %.2f%%" % (model.metrics_names[4], scores[4]*100))    
    print("%s: %.2f%%" % (model.metrics_names[0], scores_val[0]*100))
    print("%s: %.2f%%" % (model.metrics_names[1], scores_val[1]*100))
    print("%s: %.2f%%" % (model.metrics_names[2], scores_val[2]*100))
    print("%s: %.2f%%" % (model.metrics_names[3], scores_val[3]*100))
    print("%s: %.2f%%" % (model.metrics_names[4], scores_val[4]*100))    
    cvscores.append(scores)
    val_scores.append(scores_val)

Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45
loss: 14.53%
accuracy: 95.33%
precision_m: 75.27%
recall_m: 76.92%
f1_m: 74.81%
loss: 16.03%
accuracy: 97.86%
precision_m: 51.21%
recall_m: 60.61%
f1_m: 53.65%
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 

In [16]:
cvscore_frame = pd.DataFrame.from_records(cvscores, columns=['loss', 'accuracy', 'precision', 'recall', 'f1_score'])

cvscore_frame.mean()

loss         0.086225
accuracy     0.973356
precision    0.740397
recall       0.805997
f1_score     0.757513
dtype: float64

In [17]:
val_score_frame = pd.DataFrame.from_records(cvscores, columns=['loss', 'accuracy', 'precision', 'recall', 'f1_score'])

val_score_frame.mean()

loss         0.086225
accuracy     0.973356
precision    0.740397
recall       0.805997
f1_score     0.757513
dtype: float64