In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from time import time
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )

## Preprocessing

In [2]:
quake_frame = pd.read_csv('data/consolidated_data.csv')

quake_frame['simple_label'] = quake_frame['type'] != 'earthquake'

quake_frame.drop(['id', 'Unnamed: 0', 'place', 'time', 'updated', 'type'], inplace=True, axis=1)

quake_frame['has_null'] = quake_frame.isna().sum(axis=1)

## KNN imputation

Our Random Forest is still for two classes that takes only rows that have no nans in them. This time, we'll use the iterative imputer. Let's see how many we get.  
Then we'll split the data 80/20 and run training.

In [3]:
start_time = time()
quake_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,3272774.0,35.720738,20.256723,-84.422,34.118,37.576167,42.258667,87.265
longitude,3272774.0,-92.856671,80.553255,-179.999,-122.79583,-118.811167,-115.454167,180.0
depth,3272765.0,22.334946,56.320328,-10.0,3.002,7.155,15.0,735.8
mag,3116325.0,1.878941,1.352506,-9.99,0.97,1.5,2.46,9.1
nst,2391208.0,15.601496,26.606866,0.0,5.0,10.0,18.0,934.0
gap,2434225.0,130.487608,69.710621,0.0,79.0,115.0,168.26,360.0
dmin,1926032.0,0.255999,1.333459,0.0,0.02093,0.05135,0.116,141.16
rms,3061121.0,0.315205,0.399901,-1.0,0.06,0.15,0.48,104.33
horizontalError,1740811.0,1.266841,3.168282,0.0,0.3,0.48,0.93,280.6
depthError,2666089.0,5.64032,1167.801181,-1.0,0.49,0.96,2.76,1773552.5


In [4]:
quake_frame['depthError'].clip(upper=10000, inplace=True)

In [5]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class LabelEncoderByCol(BaseEstimator, TransformerMixin):
    def __init__(self,col):
        #List of column names in the DataFrame that should be encoded
        self.col = col
        #Dictionary storing a LabelEncoder for each column
        self.le_dic = {}
        for el in self.col:
            self.le_dic[el] = LabelEncoder()

    def fit(self,x,y=None):
        #Fill missing values with the string 'NaN'
        x[self.col] = x[self.col].fillna('NaN')
        for el in self.col:
            #Only use the values that are not 'NaN' to fit the Encoder
            a = x[el][x[el]!='NaN']
            self.le_dic[el].fit(a)
        return self

    def transform(self,x,y=None):
        #Fill missing values with the string 'NaN'
        x[self.col] = x[self.col].fillna('NaN')
        for el in self.col:
            #Only use the values that are not 'NaN' to fit the Encoder
            a = x[el][x[el]!='NaN']
            #Store an ndarray of the current column
            b = x[el].values
            #Replace the elements in the ndarray that are not 'NaN'
            #using the transformer
            b[b!='NaN'] = self.le_dic[el].transform(a)
            #Overwrite the column in the DataFrame
            x[el]=b
        #return the transformed DataFrame
        return x
    
def create_label_encoder_by_column(data, cols=None):
    if not cols:
        cols = data.columns.to_list()
    ### fit with the desired col, col in position 0 for this example
    fit_list = [data[col].unique() for col in cols]
    enc_list = [LabelEncoder().fit(col) for col in fit_list]
    return enc_list

def transform_all_but_nans(data, cols=None, name='_enc'):
    if not cols:
        cols = data.columns.to_list()
    encs = create_label_encoder_by_column(data, cols)
    for col, enc in zip(cols, encs):
        data[col + name] = data[col].apply(lambda x: enc.transform(x) if x is not None else x)
    return data



In [6]:
col = ['magType', 'net', 'status', 'locationSource', 'magSource']
le = LabelEncoderByCol(col=col)

le.fit(quake_frame)

le.transform(quake_frame)

quake_frame.replace('NaN', pd.NA)

Unnamed: 0,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,horizontalError,depthError,magError,magNst,status,locationSource,magSource,simple_label,has_null
0,37.003502,-117.996834,0.00,0.00,17,0.0,,,,3,,,,0.0,2,31,54,True,6
1,35.642788,-120.933601,5.00,1.99,17,2.0,,,,3,,,,0.0,2,31,54,False,6
2,34.164520,-118.185036,0.00,0.00,17,,,,,3,,,,0.0,2,31,54,False,7
3,33.836494,-116.781868,0.00,0.00,17,,,,,3,,,,0.0,2,31,54,True,7
4,33.208477,-115.476997,5.00,0.00,17,,,,,3,,,,0.0,2,31,54,True,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3272769,61.417200,-147.564900,13.10,1.20,18,,,,0.66,0,,0.40,,,0,5,10,False,6
3272770,66.227700,-157.202600,0.00,1.80,18,,,,1.01,0,,0.40,,,0,5,10,False,6
3272771,33.234667,-116.771167,12.45,0.65,18,19.0,54.0,0.01048,0.16,3,0.27,0.70,0.158,14.0,0,31,54,False,0
3272772,62.829900,-148.766400,55.50,1.80,18,,,,0.51,0,,1.60,,,0,5,10,False,6


In [7]:
quake_frame_imputation = quake_frame[quake_frame['has_null'] < 3]

In [9]:
imp = KNNImputer()
imp_cols = quake_frame.columns.to_list()

quake_frame_imputation[imp_cols] = imp.fit_transform(quake_frame_imputation[imp_cols])
quake_frame_imputation[col] = quake_frame_imputation[col].round(decimals=0)
quake_frame_imputation[col] = quake_frame_imputation[col].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quake_frame_imputation[imp_cols] = imp.fit_transform(quake_frame_imputation[imp_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [10]:
quake_frame_imputation.magType.unique()

array([14, 17, 18, 24,  8, 23, 13,  9, 21, 19,  2, 15,  3,  0, 10, 16,  1,
        4, 25, 11, 27, 22, 26, 28, 20,  5])

In [11]:
quake_frame_imputation.isna().sum()

latitude           0
longitude          0
depth              0
mag                0
magType            0
nst                0
gap                0
dmin               0
rms                0
net                0
horizontalError    0
depthError         0
magError           0
magNst             0
status             0
locationSource     0
magSource          0
simple_label       0
has_null           0
dtype: int64

Alright, this changes the proportions slightly, but not too bad. If anything, one might suggest that at least the mild increase in proportion of non-earthquakes offsets the reduced dataset a little.  
Okay, so the problematic values are no longer there, that's something.  
Let's try this.  
We'll start by mixing up the data frame, then encoding all the categories numerically and splitting it sklearn style.

In [12]:
quake_frame_imputation = quake_frame_imputation.sample(frac=1, random_state=42).reset_index(drop=True)
quake_frame_imputation.sort_values(by=['has_null'], inplace=True, ignore_index=True)

In [13]:
x_cols = ['latitude',
 'longitude',
 'depth',
 'mag',
 'nst',
 'gap',
 'dmin',
 'rms',
 'horizontalError',
 'depthError',
 'magError',
 'magNst',
 'magType',
 'net',
 'status',
 'locationSource',
 'magSource']

y_col = ['simple_label']

In [14]:
# Separate train and valid sets and shuffle training set
valid_length = int(np.round(len(quake_frame_imputation.index) * 0.2))
quake_frame_imp_valid = quake_frame_imputation.loc[:valid_length, :]
quake_frame_imp_train = quake_frame_imputation.loc[valid_length:, :]
quake_frame_imp_train = quake_frame_imp_train.sample(frac=1, random_state=42).reset_index(drop=True)

In [15]:
valid_X = quake_frame_imp_valid.loc[:, x_cols]
valid_y = quake_frame_imp_valid.loc[:, y_col]

train_X = quake_frame_imp_train.loc[:, x_cols]
train_y = quake_frame_imp_train.loc[:, y_col]

In [16]:
n_estim = 100

rfc = RandomForestClassifier(n_estimators=n_estim,
                             random_state=42)

In [17]:
rfc.fit(train_X, np.ravel(train_y))
end_time = time()

In [18]:
preds = pd.DataFrame(rfc.predict(valid_X), columns=['predictions'])

In [21]:
prec = precision_score(valid_y, preds)
reca = recall_score(valid_y, preds)
roc = roc_auc_score(valid_y, preds)
f1 = f1_score(valid_y, preds)
acc = accuracy_score(valid_y, preds)
conf_mat = confusion_matrix(valid_y, preds)

print("Precision: ", prec)
print("Recall: ", reca)
print("ROC score: ", roc)
print("F1 score: ", f1)
print("Accuracy score: ", acc)
print("Time taken (min): ", np.round((end_time - start_time)/60, 2))

Precision:  0.9658252745451565
Recall:  0.8938187334091771
ROC score:  0.9463217828999199
F1 score:  0.9284279355575689
Accuracy score:  0.9950628756507657
Time taken (min):  3002.33


In [19]:
len(quake_frame_imputation)

1840134