In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

In [3]:
# Read the consolidated CSV (path is relative to the notebook) into the working frame.
quake_frame = pd.read_csv(filepath_or_buffer='data/consolidated_data.csv')

In [4]:
# Print the column names, dtypes and memory footprint of the loaded frame.
quake_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3272774 entries, 0 to 3272773
Data columns (total 23 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Unnamed: 0       int64  
 1   time             object 
 2   latitude         float64
 3   longitude        float64
 4   depth            float64
 5   mag              float64
 6   magType          object 
 7   nst              float64
 8   gap              float64
 9   dmin             float64
 10  rms              float64
 11  net              object 
 12  id               object 
 13  updated          object 
 14  place            object 
 15  type             object 
 16  horizontalError  float64
 17  depthError       float64
 18  magError         float64
 19  magNst           float64
 20  status           object 
 21  locationSource   object 
 22  magSource        object 
dtypes: float64(12), int64(1), object(10)
memory usage: 574.3+ MB


In [17]:
# Preview the first five rows.
# NOTE(review): the stored output of this cell lists simple_label and has_null,
# which are only created in later cells, and its execution count (17) is out of
# sequence. The output appears to be stale; re-run the notebook top to bottom
# to refresh it.
quake_frame.head()

Unnamed: 0,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource,simple_label,has_null
0,37.003502,-117.996834,0.0,0.0,mh,0.0,,,,ci,sonic boom,,,,0.0,reviewed,ci,ci,True,True
1,35.642788,-120.933601,5.0,1.99,mh,2.0,,,,ci,earthquake,,,,0.0,reviewed,ci,ci,False,True
2,34.16452,-118.185036,0.0,0.0,mh,,,,,ci,earthquake,,,,0.0,reviewed,ci,ci,False,True
3,33.836494,-116.781868,0.0,0.0,mh,,,,,ci,sonic boom,,,,0.0,reviewed,ci,ci,True,True
4,33.208477,-115.476997,5.0,0.0,mh,,,,,ci,sonic boom,,,,0.0,reviewed,ci,ci,True,True


In [5]:
# Fraction of all rows whose type is 'earthquake' (class-balance check).
quake_frame['type'].value_counts().loc['earthquake'] / quake_frame.shape[0]

0.9729571916667634

Righto, so 97.3% of our dataset consists of earthquakes. That is definitely imbalanced.

In [6]:
# Keep a reference to the column index of the frame as it stands here, then display it.
var_cols = quake_frame.columns
var_cols

Index(['Unnamed: 0', 'time', 'latitude', 'longitude', 'depth', 'mag',
       'magType', 'nst', 'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'place',
       'type', 'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],
      dtype='object')

In [7]:
# Per-column count of missing entries.
quake_frame.isnull().sum(axis=0)

Unnamed: 0               0
time                     0
latitude                 0
longitude                0
depth                    9
mag                 156449
magType             167407
nst                 881566
gap                 838549
dmin               1346742
rms                 211653
net                      0
id                       0
updated                  0
place                   11
type                     0
horizontalError    1531963
depthError          606685
magError           1781012
magNst              988917
status                   1
locationSource           0
magSource                0
dtype: int64

Okay, let's kick this off. We've had a look around the dataset, and there are quite a few NaNs. Some columns have values that are so numerous or varied that they aren't useful (like time and updated: while earthquakes might not be independent of each other, we're not treating this as a time series), and some information is available in multiple formats (longitude/latitude and place).  
Other variables are not clear, like the Source ones (locationSource and magSource), which appear similar to net but are not the same.

Right, so it looks like the source of the information is a key part of the information about what a thing is. So the source columns should all be kept. About 41% of the dmin values are NaN, and horizontalError (about 47%) and magError (about 54%) are similarly sparse. depthError (about 19%) and magNst (about 30%) fare somewhat better, but are still far from complete. First dumb idea: take out any columns that are too hard to recode or useless anyway, run a model, see what happens.

In [8]:
# Drop the identifier, free-text and timestamp columns that will not be used.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the columns are already gone is a no-op
# rather than a KeyError.
quake_frame = quake_frame.drop(
    columns=['id', 'Unnamed: 0', 'place', 'time', 'updated'],
    errors='ignore',
)

Brilliant. Now, there are a few interesting things to test here.  
What's the connection between the nans and the non-earthquake types? Are the nans equally distributed?  
What can we say about the correlation between the type and the number of nans? These and more questions are still to answer!

In [9]:
# Binary target: True for every row whose type is anything other than 'earthquake'.
quake_frame['simple_label'] = quake_frame['type'].ne('earthquake')

In [10]:
# Row-wise tally of missing values. Despite the column name, this holds a
# count at this point, not a True/False flag.
quake_frame['has_null'] = quake_frame.isnull().sum(axis='columns')

In [11]:
# Pearson correlation between the non-earthquake flag and the has_null column.
quake_frame['simple_label'].corr(other=quake_frame['has_null'], method='pearson')

-0.033550227623335906

Right, now that's awesome. At first glance, there's no reason to believe that there's a common thread underneath at least whether or not something is an earthquake and whether it has NaNs. Cool. Okay, so I'd say we have a look at a few distributions. Make sure max and min are okay etc.

In [12]:
# Summary statistics, flipped so that each described column becomes a row.
quake_frame.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,3272774.0,35.720738,20.256723,-84.422,34.118,37.576167,42.258667,87.265
longitude,3272774.0,-92.856671,80.553255,-179.999,-122.79583,-118.811167,-115.454167,180.0
depth,3272765.0,22.334946,56.320328,-10.0,3.002,7.155,15.0,735.8
mag,3116325.0,1.878941,1.352506,-9.99,0.97,1.5,2.46,9.1
nst,2391208.0,15.601496,26.606866,0.0,5.0,10.0,18.0,934.0
gap,2434225.0,130.487608,69.710621,0.0,79.0,115.0,168.26,360.0
dmin,1926032.0,0.255999,1.333459,0.0,0.02093,0.05135,0.116,141.16
rms,3061121.0,0.315205,0.399901,-1.0,0.06,0.15,0.48,104.33
horizontalError,1740811.0,1.266841,3.168282,0.0,0.3,0.48,0.93,280.6
depthError,2666089.0,5.64032,1167.801181,-1.0,0.49,0.96,2.76,1773552.5


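Okay, the locations are all within sensible bounds. So is the depth. The minima of -9.99 for mag and -1.0 for rms and depthError look like placeholder values rather than real measurements, so they are worth checking before modelling. Now let's check whether there's a link between the 'type' and the gap. Let's see which types have no gap values.  
I think it'll be sensible to clip the depthError, though - a maximum of about 1.77 million (in whatever unit the catalogue uses, presumably kilometres) is a bit much.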

In [13]:
# Number of rows with no gap value. Series.sum() is vectorised, whereas the
# builtin sum() walks the multi-million-element Series one Python object at a
# time. int() keeps the displayed result a plain Python integer.
int(quake_frame['gap'].isna().sum())

838549

In [14]:
# Non-null counts of every column, broken down by event type.
quake_frame.groupby(by='type').count()

Unnamed: 0_level_0,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,horizontalError,depthError,magError,magNst,status,locationSource,magSource,simple_label,has_null
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Rock Slide,1,1,1,1,1,1,1,0,1,1,0,0,0,0,1,1,1,1,1
acoustic noise,2,2,2,2,2,2,2,2,0,2,0,0,2,2,2,2,2,2,2
building collapse,5,5,5,0,0,0,0,0,0,5,0,0,0,0,5,5,5,5,5
chemical explosion,392,392,392,369,369,386,385,384,385,392,388,388,341,369,392,392,392,392,392
collapse,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1
earthquake,3184269,3184269,3184262,3034628,3023675,2312690,2354425,1866907,2973090,3184269,1688270,2604029,1444712,2210544,3184269,3184269,3184269,3184269,3184269
eq,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1
experimental explosion,5,5,5,5,5,0,5,5,5,5,5,5,5,5,5,5,5,5,5
explosion,20133,20133,20133,20052,20050,16510,16496,15854,20044,20133,15890,18916,16371,16525,20133,20133,20133,20133,20133
ice quake,3951,3951,3951,3951,3951,0,5,5,3951,3951,20,3951,7,7,3951,3951,3951,3951,3951


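Cool. Again, at first glance it looks like there's no significant signature in the missing values by type. magError and horizontalError have the most missing values, and earthquakes are missing about 50% of each. (The smaller groups do differ, though: ice quakes are missing nst, gap and dmin almost entirely, and explosions are missing only about 20% of magError and horizontalError, so this deserves a closer look.)  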
This indicates that we have a fair chance to fix a whole bunch of values by running interpolation or imputation (or no chance whatsoever... but this will be a nice test!)

In [15]:
# Pairwise Pearson correlations between the numeric and boolean columns.
# The frame still contains string columns (magType, net, type, status, ...).
# pandas >= 2.0 no longer drops those silently in DataFrame.corr() and raises
# instead, so the numeric/bool subset is selected explicitly. select_dtypes
# also works on older pandas versions that lack the numeric_only argument.
quake_frame.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,latitude,longitude,depth,mag,nst,gap,dmin,rms,horizontalError,depthError,magError,magNst,simple_label,has_null
latitude,1.0,-0.451401,-0.296079,-0.507814,-0.158979,0.004029,-0.494358,-0.244619,-0.302554,8.5e-05,-0.014073,-0.061685,0.035748,-0.031423
longitude,-0.451401,1.0,0.210276,0.614025,0.247375,-0.003974,0.389691,0.48402,0.335893,0.006076,-0.049341,0.063904,-0.055807,0.307779
depth,-0.296079,0.210276,1.0,0.368889,0.18778,0.006451,0.285692,0.306884,0.311279,0.001451,-0.041375,0.081878,-0.065932,0.251632
mag,-0.507814,0.614025,0.368889,1.0,0.411418,-0.058773,0.415177,0.647656,0.386625,0.003567,-0.034934,0.271055,-0.020237,0.469126
nst,-0.158979,0.247375,0.18778,0.411418,1.0,-0.27072,-0.051689,0.255584,-0.132335,-0.043947,0.029142,0.533437,-0.046812,0.040297
gap,0.004029,-0.003974,0.006451,-0.058773,-0.27072,1.0,0.021445,-0.008536,0.295077,0.196087,0.025157,-0.248005,-0.007296,0.143001
dmin,-0.494358,0.389691,0.285692,0.415177,-0.051689,0.021445,1.0,0.315868,0.414551,0.052997,-0.032836,0.14757,-0.013744,0.106397
rms,-0.244619,0.48402,0.306884,0.647656,0.255584,-0.008536,0.315868,1.0,0.464142,0.002074,-0.021915,0.109947,-0.035358,0.576069
horizontalError,-0.302554,0.335893,0.311279,0.386625,-0.132335,0.295077,0.414551,0.464142,1.0,0.302583,-0.031127,0.079172,0.027265,0.097424
depthError,8.5e-05,0.006076,0.001451,0.003567,-0.043947,0.196087,0.052997,0.002074,0.302583,1.0,0.023596,-0.036658,0.001106,0.003371


Yeah, so it looks like concluding that the NaNs would *not* give any information about the label was a bit hasty. However, to be precise, this column holds the number of NaNs per row, not whether or not there are any. So to truly test this, I'll transform it into a boolean column and run the correlation again.

In [16]:
# Collapse has_null into a flag: True when the row has at least one missing value.
quake_frame['has_null'] = quake_frame['has_null'].gt(0)

In [19]:
# Correlation matrix of the numeric and boolean columns, rows ordered by their
# signed correlation with simple_label (largest first). Negative correlations
# therefore sort to the bottom regardless of their strength.
# String columns are excluded explicitly because DataFrame.corr() raises on
# them in pandas >= 2.0 instead of dropping them silently.
(
    quake_frame
    .select_dtypes(include=['number', 'bool'])
    .corr()
    .sort_values(by=['simple_label'], ascending=False)
)

Unnamed: 0,latitude,longitude,depth,mag,nst,gap,dmin,rms,horizontalError,depthError,magError,magNst,simple_label,has_null
simple_label,0.035748,-0.055807,-0.065932,-0.020237,-0.046812,-0.007296,-0.013744,-0.035358,0.027265,0.001106,-0.026695,-0.010917,1.0,-0.039124
latitude,1.0,-0.451401,-0.296079,-0.507814,-0.158979,0.004029,-0.494358,-0.244619,-0.302554,8.5e-05,-0.014073,-0.061685,0.035748,-0.062753
horizontalError,-0.302554,0.335893,0.311279,0.386625,-0.132335,0.295077,0.414551,0.464142,1.0,0.302583,-0.031127,0.079172,0.027265,0.227323
depthError,8.5e-05,0.006076,0.001451,0.003567,-0.043947,0.196087,0.052997,0.002074,0.302583,1.0,0.023596,-0.036658,0.001106,0.002267
gap,0.004029,-0.003974,0.006451,-0.058773,-0.27072,1.0,0.021445,-0.008536,0.295077,0.196087,0.025157,-0.248005,-0.007296,0.136791
magNst,-0.061685,0.063904,0.081878,0.271055,0.533437,-0.248005,0.14757,0.109947,0.079172,-0.036658,-0.039358,1.0,-0.010917,-0.003974
dmin,-0.494358,0.389691,0.285692,0.415177,-0.051689,0.021445,1.0,0.315868,0.414551,0.052997,-0.032836,0.14757,-0.013744,0.176671
mag,-0.507814,0.614025,0.368889,1.0,0.411418,-0.058773,0.415177,0.647656,0.386625,0.003567,-0.034934,0.271055,-0.020237,0.370025
magError,-0.014073,-0.049341,-0.041375,-0.034934,0.029142,0.025157,-0.032836,-0.021915,-0.031127,0.023596,1.0,-0.039358,-0.026695,0.126834
rms,-0.244619,0.48402,0.306884,0.647656,0.255584,-0.008536,0.315868,1.0,0.464142,0.002074,-0.021915,0.109947,-0.035358,0.446177


This is very interesting. It appears that the mere presence of nulls has the 4th-strongest correlation with the label (by absolute value, |r| ≈ 0.039) out of the 13 other numeric columns. So it is comparatively indicative, although all of these correlations are weak in absolute terms. A useful feature, then? Maybe for this dataset. A more thorough analysis would have to look for patterns in the NaNs as well, such as looking for a dependence on time etc.  
For now, it looks like there is some correlation that allows us to tell whether something is an earthquake or not from the dataset.
