In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

## Preprocessing

In [2]:
quake_frame = pd.read_csv('data/consolidated_data.csv')

quake_frame['simple_label'] = quake_frame['type'] != 'earthquake'

quake_frame.drop(['id', 'Unnamed: 0', 'place', 'time', 'updated', 'type'], inplace=True, axis=1)


## Iterative imputation

Our Random Forest is still for two classes that takes only rows that have no nans in them. This time, we'll use the iterative imputer. Let's see how many we get.  
Then we'll split the data 80/20 and run training.

In [3]:
quake_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,3272774.0,35.720738,20.256723,-84.422,34.118,37.576167,42.258667,87.265
longitude,3272774.0,-92.856671,80.553255,-179.999,-122.79583,-118.811167,-115.454167,180.0
depth,3272765.0,22.334946,56.320328,-10.0,3.002,7.155,15.0,735.8
mag,3116325.0,1.878941,1.352506,-9.99,0.97,1.5,2.46,9.1
nst,2391208.0,15.601496,26.606866,0.0,5.0,10.0,18.0,934.0
gap,2434225.0,130.487608,69.710621,0.0,79.0,115.0,168.26,360.0
dmin,1926032.0,0.255999,1.333459,0.0,0.02093,0.05135,0.116,141.16
rms,3061121.0,0.315205,0.399901,-1.0,0.06,0.15,0.48,104.33
horizontalError,1740811.0,1.266841,3.168282,0.0,0.3,0.48,0.93,280.6
depthError,2666089.0,5.64032,1167.801181,-1.0,0.49,0.96,2.76,1773552.5


In [4]:
quake_frame['depthError'].clip(upper=10000, inplace=True)

In [5]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder

class LabelEncoderByCol(BaseEstimator, TransformerMixin):
    def __init__(self,col):
        #List of column names in the DataFrame that should be encoded
        self.col = col
        #Dictionary storing a LabelEncoder for each column
        self.le_dic = {}
        for el in self.col:
            self.le_dic[el] = LabelEncoder()

    def fit(self,x,y=None):
        #Fill missing values with the string 'NaN'
        x[self.col] = x[self.col].fillna('NaN')
        for el in self.col:
            #Only use the values that are not 'NaN' to fit the Encoder
            a = x[el][x[el]!='NaN']
            self.le_dic[el].fit(a)
        return self

    def transform(self,x,y=None):
        #Fill missing values with the string 'NaN'
        x[self.col] = x[self.col].fillna('NaN')
        for el in self.col:
            #Only use the values that are not 'NaN' to fit the Encoder
            a = x[el][x[el]!='NaN']
            #Store an ndarray of the current column
            b = x[el].values
            #Replace the elements in the ndarray that are not 'NaN'
            #using the transformer
            b[b!='NaN'] = self.le_dic[el].transform(a)
            #Overwrite the column in the DataFrame
            x[el]=b
        #return the transformed DataFrame
        return x
    
def create_label_encoder_by_column(data, cols=None):
    if not cols:
        cols = data.columns.to_list()
    ### fit with the desired col, col in position 0 for this example
    fit_list = [data[col].unique() for col in cols]
    enc_list = [LabelEncoder().fit(col) for col in fit_list]
    return enc_list

def transform_all_but_nans(data, cols=None, name='_enc'):
    if not cols:
        cols = data.columns.to_list()
    encs = create_label_encoder_by_column(data, cols)
    for col, enc in zip(cols, encs):
        data[col + name] = data[col].apply(lambda x: enc.transform(x) if x is not None else x)
    return data



In [6]:
col = ['magType', 'net', 'status', 'locationSource', 'magSource']
le = LabelEncoderByCol(col=col)

le.fit(quake_frame)

le.transform(quake_frame)

quake_frame.replace('NaN', pd.NA)

Unnamed: 0,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,horizontalError,depthError,magError,magNst,status,locationSource,magSource,simple_label
0,37.003502,-117.996834,0.00,0.00,17,0.0,,,,3,,,,0.0,2,31,54,True
1,35.642788,-120.933601,5.00,1.99,17,2.0,,,,3,,,,0.0,2,31,54,False
2,34.164520,-118.185036,0.00,0.00,17,,,,,3,,,,0.0,2,31,54,False
3,33.836494,-116.781868,0.00,0.00,17,,,,,3,,,,0.0,2,31,54,True
4,33.208477,-115.476997,5.00,0.00,17,,,,,3,,,,0.0,2,31,54,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3272769,61.417200,-147.564900,13.10,1.20,18,,,,0.66,0,,0.40,,,0,5,10,False
3272770,66.227700,-157.202600,0.00,1.80,18,,,,1.01,0,,0.40,,,0,5,10,False
3272771,33.234667,-116.771167,12.45,0.65,18,19.0,54.0,0.01048,0.16,3,0.27,0.70,0.158,14.0,0,31,54,False
3272772,62.829900,-148.766400,55.50,1.80,18,,,,0.51,0,,1.60,,,0,5,10,False


In [7]:
imp = IterativeImputer(max_iter=20, random_state=42)
imp_cols = quake_frame.columns.to_list()
imp_cols.remove('simple_label')

quake_frame[imp_cols] = imp.fit_transform(quake_frame[imp_cols])



In [8]:
quake_frame.head()

Unnamed: 0,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,horizontalError,depthError,magError,magNst,status,locationSource,magSource,simple_label
0,37.003502,-117.996834,0.0,0.0,17.0,0.0,133.378259,0.01768,0.005528,3.0,0.348471,9.448469,0.171992,0.0,2.0,31.0,54.0,True
1,35.642788,-120.933601,5.0,1.99,17.0,2.0,144.933605,0.502248,0.374875,3.0,3.588748,28.481856,0.192026,0.0,2.0,31.0,54.0,False
2,34.16452,-118.185036,0.0,0.0,17.0,-2.06895,134.251144,0.179252,-0.004134,3.0,0.497971,10.261631,0.174977,0.0,2.0,31.0,54.0,False
3,33.836494,-116.781868,0.0,0.0,17.0,-1.955438,134.10983,0.191569,-0.003743,3.0,0.511422,10.160881,0.175186,0.0,2.0,31.0,54.0,True
4,33.208477,-115.476997,5.0,0.0,17.0,-1.427663,134.115152,0.235334,-0.000928,3.0,0.614169,10.114192,0.17517,0.0,2.0,31.0,54.0,True


In [9]:
quake_frame[col] = quake_frame[col].round(decimals=0)
quake_frame[col] = quake_frame[col].astype(int)

In [10]:
quake_frame.magType.unique()

array([ 17,  14,  24,  18,  15,   9,  21,  10,  11,   6,  12,  13,   7,
         8,  19,  16,  20,  -6,  23,  26,  22, -17,  -5,  -3, -26,   0,
        -9, -30,   5,  -7,  25,  90,   3,  27,  -1,   4,   2, -82,  28,
        29,  49,   1,  56])

In [11]:
quake_frame.head()

Unnamed: 0,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,horizontalError,depthError,magError,magNst,status,locationSource,magSource,simple_label
0,37.003502,-117.996834,0.0,0.0,17,0.0,133.378259,0.01768,0.005528,3,0.348471,9.448469,0.171992,0.0,2,31,54,True
1,35.642788,-120.933601,5.0,1.99,17,2.0,144.933605,0.502248,0.374875,3,3.588748,28.481856,0.192026,0.0,2,31,54,False
2,34.16452,-118.185036,0.0,0.0,17,-2.06895,134.251144,0.179252,-0.004134,3,0.497971,10.261631,0.174977,0.0,2,31,54,False
3,33.836494,-116.781868,0.0,0.0,17,-1.955438,134.10983,0.191569,-0.003743,3,0.511422,10.160881,0.175186,0.0,2,31,54,True
4,33.208477,-115.476997,5.0,0.0,17,-1.427663,134.115152,0.235334,-0.000928,3,0.614169,10.114192,0.17517,0.0,2,31,54,True


Alright, this changes the proportions slightly, but not too bad. If anything, one might suggest that at least the mild increase in proportion of non-earthquakes offsets the reduced dataset a little.  
Okay, so the problematic values are no longer there, that's something.  
Let's try this.  
We'll start by mixing up the data frame, then encoding all the categories numerically and splitting it sklearn style.

In [12]:
quake_frame = quake_frame.sample(frac=1, random_state=42).reset_index(drop=True)


In [13]:
quake_frame.columns

Index(['latitude', 'longitude', 'depth', 'mag', 'magType', 'nst', 'gap',
       'dmin', 'rms', 'net', 'horizontalError', 'depthError', 'magError',
       'magNst', 'status', 'locationSource', 'magSource', 'simple_label'],
      dtype='object')

In [14]:
x_cols = ['latitude',
 'longitude',
 'depth',
 'mag',
 'nst',
 'gap',
 'dmin',
 'rms',
 'horizontalError',
 'depthError',
 'magError',
 'magNst',
 'magType',
 'net',
 'status',
 'locationSource',
 'magSource']

y_col = ['simple_label']

In [15]:
train_length = int(np.round(len(quake_frame.index) * 0.8))

In [16]:
train_X = quake_frame.loc[:train_length, x_cols]
train_y = quake_frame.loc[:train_length, y_col]

valid_X = quake_frame.loc[train_length:, x_cols]
valid_y = quake_frame.loc[train_length:, y_col]

In [17]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [18]:
rfc.fit(train_X, np.ravel(train_y))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [19]:
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
    )

In [20]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)

Precision:  0.971945591450085
Recall:  0.9059316278016754
ROC score:  0.9526031121970817
F1 score:  0.9377782985704242
