# ETL Extract raw notebook

In [2]:
# import statements
import pandas as pd
import numpy as np

load dataset

In [3]:
# Read the raw earthquake/tsunami CSV into the working DataFrame.
RAW_CSV_PATH = "../data/raw/earthquake_data_tsunami.csv"
df = pd.read_csv(RAW_CSV_PATH)

check feature types

In [4]:
# Show the dtype pandas inferred for each column when the CSV was parsed.
df.dtypes

magnitude    float64
cdi            int64
mmi            int64
sig            int64
nst            int64
dmin         float64
gap          float64
depth        float64
latitude     float64
longitude    float64
Year           int64
Month          int64
tsunami        int64
dtype: object

check for missing values

In [5]:
# Flag every column that holds at least one missing (NaN) entry.
missing_mask = df.isna()
missing_mask.any()

magnitude    False
cdi          False
mmi          False
sig          False
nst          False
dmin         False
gap          False
depth        False
latitude     False
longitude    False
Year         False
Month        False
tsunami      False
dtype: bool

check feature value ranges

In [8]:
# Pair each column name with its smallest and largest observed value.
[[name, df[name].min(), df[name].max()] for name in df.columns]

[['magnitude', 6.5, 9.1],
 ['cdi', 0, 9],
 ['mmi', 1, 9],
 ['sig', 650, 2910],
 ['nst', 0, 934],
 ['dmin', 0.0, 17.654],
 ['gap', 0.0, 239.0],
 ['depth', 2.7, 670.81],
 ['latitude', -61.8484, 71.6312],
 ['longitude', -179.968, 179.662],
 ['Year', 2001, 2022],
 ['Month', 1, 12],
 ['tsunami', 0, 1]]

`cdi = 0` is ambiguous: it could be a genuine reading or a placeholder for a missing value (the `isna()` check above would not catch zero-coded missing data). This needs further investigation.

Check column names

In [26]:
# List the column labels; the output shows 'Year' and 'Month' are capitalised
# while every other label is lowercase.
df.columns

Index(['magnitude', 'cdi', 'mmi', 'sig', 'nst', 'dmin', 'gap', 'depth',
       'latitude', 'longitude', 'Year', 'Month', 'tsunami'],
      dtype='object')

Check for zero values in the `latitude` and `longitude` columns. None found.

In [None]:
# True if any row has a latitude of exactly 0.
df["latitude"].eq(0).any()

False

In [None]:
# True if any row has a longitude of exactly 0.
df["longitude"].eq(0).any()

False

## describe

In [10]:
# Summary statistics for every column; include="all" asks describe() to cover
# non-numeric columns as well, should any be present.
df.describe(include="all")

Unnamed: 0,magnitude,cdi,mmi,sig,nst,dmin,gap,depth,latitude,longitude,Year,Month,tsunami
count,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0
mean,6.941125,4.33376,5.964194,870.108696,230.250639,1.325757,25.03899,75.883199,3.5381,52.609199,2012.280051,6.563939,0.388747
std,0.445514,3.169939,1.462724,322.465367,250.188177,2.218805,24.225067,137.277078,27.303429,117.898886,6.099439,3.507866,0.487778
min,6.5,0.0,1.0,650.0,0.0,0.0,0.0,2.7,-61.8484,-179.968,2001.0,1.0,0.0
25%,6.6,0.0,5.0,691.0,0.0,0.0,14.625,14.0,-14.5956,-71.66805,2007.0,3.25,0.0
50%,6.8,5.0,6.0,754.0,140.0,0.0,20.0,26.295,-2.5725,109.426,2013.0,7.0,0.0
75%,7.1,7.0,7.0,909.75,445.0,1.863,30.0,49.75,24.6545,148.941,2017.0,10.0,1.0
max,9.1,9.0,9.0,2910.0,934.0,17.654,239.0,670.81,71.6312,179.662,2022.0,12.0,1.0


`cdi`, `nst`: the 25th percentile is 0, so more than 25% of the values are 0. `dmin`: the median is 0, so more than 50% of the values are 0. We could consider binning and/or imputation.

In [11]:
# Per-column sample skewness: values near 0 suggest a roughly symmetric
# distribution, positive values a longer right tail, negative a longer left tail.
df.skew()

magnitude    1.444440
cdi         -0.197310
mmi         -0.250403
sig          3.083629
nst          0.533307
dmin         2.604580
gap          4.668607
depth        3.024869
latitude     0.200853
longitude   -0.702982
Year        -0.192450
Month       -0.067928
tsunami      0.457333
dtype: float64

In [12]:
# Per-column kurtosis. pandas reports excess (Fisher) kurtosis, where a normal
# distribution scores 0, which is why negative values appear in the output.
df.kurtosis()

magnitude     2.226391
cdi          -1.357753
mmi          -0.224592
sig          12.000754
nst          -1.092793
dmin          9.283367
gap          32.027722
depth         8.384480
latitude     -0.476740
longitude    -1.088383
Year         -1.042840
Month        -1.299853
tsunami      -1.795445
dtype: float64