# Obtain and Clean the Data

## This notebook is for Data Cleaning and Feature Engineering

**=======================================================================**

## Import Libraries

In [1]:
import numpy as np
#from numpy import count_nonzero, median, mean
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
#import squarify

import datetime
# NOTE: the next line rebinds the name `datetime` from the module to the
# datetime.datetime class, so module-style access such as `datetime.timedelta`
# no longer works afterwards; use the directly imported names instead.
from datetime import datetime, timedelta, date, time


#import os
#import zipfile
import scipy
from scipy import stats
#from scipy.stats.mstats import normaltest # D'Agostino K^2 Test
#from scipy.stats import boxcox
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, Binarizer
from sklearn.impute import SimpleImputer, MissingIndicator, KNNImputer
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn import set_config

%matplotlib inline
#sets the default autosave frequency in seconds
%autosave 60 
# Apply the seaborn theme in a single call. sns.set()/sns.set_theme() re-applies
# its own default style ("darkgrid") whenever `style` is not passed, so a separate
# sns.set_style('dark') issued before it is silently overridden.
sns.set_theme(style='dark', font_scale=1.2)

# Matplotlib font sizes. These come after the seaborn theme on purpose: the theme
# call rewrites the same rc parameters, so the order matters.
plt.rc('axes', titlesize=9)
plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings and library data warnings;
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

# Use Feature-Engine library
import feature_engine

from feature_engine.imputation import AddMissingIndicator, CategoricalImputer, DropMissingData, MeanMedianImputer
from feature_engine.imputation import ArbitraryNumberImputer, RandomSampleImputer

from feature_engine.outliers import Winsorizer, ArbitraryOutlierCapper, OutlierTrimmer

# NOTE: OneHotEncoder and OrdinalEncoder rebind the names imported earlier from
# sklearn.preprocessing; after these two lines both names refer to the
# feature_engine classes, not the scikit-learn ones.
from feature_engine.encoding import CountFrequencyEncoder, DecisionTreeEncoder, MeanEncoder, OneHotEncoder
from feature_engine.encoding import OrdinalEncoder, WoEEncoder, RareLabelEncoder, StringSimilarityEncoder

# NOTE: EqualWidthDiscretiser appears on both lines below; the repeat is redundant but harmless.
from feature_engine.discretisation import EqualWidthDiscretiser, EqualFrequencyDiscretiser, ArbitraryDiscretiser
from feature_engine.discretisation import DecisionTreeDiscretiser, EqualWidthDiscretiser

from feature_engine.datetime import DatetimeFeatures

from feature_engine.creation import CyclicalFeatures, MathFeatures, RelativeFeatures


# --- Display settings -------------------------------------------------------
# Show every column, allow a wide console, and render floats with two decimals.
# (The maximum number of displayed rows is left at the pandas default.)
pd.options.display.max_columns = None
pd.options.display.width = 1000
pd.options.display.float_format = '{:.2f}'.format

# --- Reproducibility --------------------------------------------------------
# Seed both global random generators (NumPy's and the standard library's).
np.random.seed(0)
random.seed(0)

# Print NumPy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

Autosaving every 60 seconds


**=======================================================**

## Data Quick Glance

In [6]:
# Load the earthquake catalogue from the working directory and parse the two
# timestamp columns as datetimes while reading.
df = pd.read_csv(
    "earthquake.csv",
    parse_dates=["Time", "Updated"],
)

In [7]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Time,Place,Latitude,Longitude,Depth,Mag,MagType,nst,gap,dmin,rms,net,ID,Updated,Unnamed: 14,Type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2023-02-17 09:37:34.868000+00:00,"130 km SW of Tual, Indonesia",-6.6,132.08,38.62,6.1,mww,119.0,51.0,2.99,0.76,us,us6000jpl7,2023-02-17 17:58:24.040000+00:00,,earthquake,6.41,5.59,0.07,23.0,reviewed,us,us
1,2023-02-16 05:37:05.138000+00:00,"7 km SW of Port-Olry, Vanuatu",-15.09,167.03,36.03,5.6,mww,81.0,26.0,0.39,0.94,us,us6000jpb1,2023-02-17 05:41:32.448000+00:00,,earthquake,5.99,6.08,0.07,18.0,reviewed,us,us
2,2023-02-15 18:10:10.060000+00:00,"Masbate region, Philippines",12.32,123.87,20.09,6.1,mww,148.0,47.0,5.49,0.54,us,us6000jp76,2023-02-16 20:12:32.595000+00:00,,earthquake,8.61,4.4,0.04,71.0,reviewed,us,us
3,2023-02-15 06:38:09.034000+00:00,"54 km WNW of Otaki, New Zealand",-40.55,174.57,74.32,5.7,mww,81.0,40.0,0.77,1.15,us,us6000jp1g,2023-02-16 06:42:09.738000+00:00,,earthquake,3.68,4.92,0.07,23.0,reviewed,us,us
4,2023-02-14 13:16:51.072000+00:00,"2 km NW of Lele?ti, Romania",45.11,23.18,10.0,5.6,mww,132.0,28.0,1.2,0.4,us,us6000jnqz,2023-02-17 09:15:18.586000+00:00,,earthquake,4.85,1.79,0.03,95.0,reviewed,us,us


In [8]:
# Column dtypes and non-null counts for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37331 entries, 0 to 37330
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   Time             37331 non-null  datetime64[ns, UTC]
 1   Place            37047 non-null  object             
 2   Latitude         37331 non-null  float64            
 3   Longitude        37331 non-null  float64            
 4   Depth            37197 non-null  float64            
 5   Mag              37331 non-null  float64            
 6   MagType          37331 non-null  object             
 7   nst              7473 non-null   float64            
 8   gap              10087 non-null  float64            
 9   dmin             4395 non-null   float64            
 10  rms              20218 non-null  float64            
 11  net              37331 non-null  object             
 12  ID               37331 non-null  object             
 13  Updated         

In [9]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

float64                13
object                  8
datetime64[ns, UTC]     2
dtype: int64

In [10]:
# Descriptive statistics for every column, whatever its dtype.
df.describe(include="all")

Unnamed: 0,Time,Place,Latitude,Longitude,Depth,Mag,MagType,nst,gap,dmin,rms,net,ID,Updated,Unnamed: 14,Type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
count,37331,37047,37331.0,37331.0,37197.0,37331.0,37331,7473.0,10087.0,4395.0,20218.0,37331,37331,37331,0.0,37331,3970.0,20827.0,16551.0,5372.0,37331,37331,37331
unique,37331,25800,,,,,18,,,,,16,37331,36991,,4,,,,,2,54,54
top,2023-02-17 09:37:34.868000+00:00,South Sandwich Islands region,,,,,mw,,,,,us,us6000jpl7,2018-06-04 20:43:44+00:00,,earthquake,,,,,reviewed,us,us
freq,1,664,,,,,18700,,,,,23364,1,143,,37080,,,,,37317,23069,13264
first,1900-10-09 12:25:00+00:00,,,,,,,,,,,,,2013-11-15 03:59:43.471000+00:00,,,,,,,,,
last,2023-02-17 09:37:34.868000+00:00,,,,,,,,,,,,,2023-02-17 20:33:16.918000+00:00,,,,,,,,,
mean,,,5.46,38.88,58.58,5.95,,265.48,45.01,4.32,1.0,,,,,,7.32,10.68,0.26,46.97,,,
std,,,30.79,123.09,109.56,0.46,,161.98,34.31,5.48,0.36,,,,,,5.4,10.66,0.17,60.44,,,
min,,,-77.08,-180.0,-4.0,5.5,,0.0,8.0,0.0,0.01,,,,,,0.09,0.0,0.0,0.0,,,
25%,,,-16.52,-75.81,15.0,5.6,,134.0,24.1,1.16,0.89,,,,,,5.7,3.6,0.2,17.0,,,


In [11]:
# Descriptive statistics restricted to the integer and float columns.
df.describe(include=["int", "float"])

Unnamed: 0,Latitude,Longitude,Depth,Mag,nst,gap,dmin,rms,Unnamed: 14,horizontalError,depthError,magError,magNst
count,37331.0,37331.0,37197.0,37331.0,7473.0,10087.0,4395.0,20218.0,0.0,3970.0,20827.0,16551.0,5372.0
mean,5.46,38.88,58.58,5.95,265.48,45.01,4.32,1.0,,7.32,10.68,0.26,46.97
std,30.79,123.09,109.56,0.46,161.98,34.31,5.48,0.36,,5.4,10.66,0.17,60.44
min,-77.08,-180.0,-4.0,5.5,0.0,8.0,0.0,0.01,,0.09,0.0,0.0,0.0
25%,-16.52,-75.81,15.0,5.6,134.0,24.1,1.16,0.89,,5.7,3.6,0.2,17.0
50%,1.15,98.58,28.5,5.8,241.0,36.0,2.51,1.0,,7.1,6.1,0.2,31.0
75%,33.79,143.35,41.0,6.14,372.0,54.8,5.13,1.11,,8.5,16.2,0.33,55.0
max,87.2,180.0,700.0,9.5,934.0,360.0,39.73,42.41,,99.0,569.2,1.84,941.0


In [12]:
# Descriptive statistics restricted to the object-dtype columns.
df.describe(include="object")

Unnamed: 0,Place,MagType,net,ID,Type,status,locationSource,magSource
count,37047,37331,37331,37331,37331,37331,37331,37331
unique,25800,18,16,37331,4,2,54,54
top,South Sandwich Islands region,mw,us,us6000jpl7,earthquake,reviewed,us,us
freq,664,18700,23364,1,37080,37317,23069,13264


In [13]:
# List the column labels as loaded from the CSV.
df.columns

Index(['Time', 'Place', 'Latitude', 'Longitude', 'Depth', 'Mag', 'MagType', 'nst', 'gap', 'dmin', 'rms', 'net', 'ID', 'Updated', 'Unnamed: 14', 'Type', 'horizontalError', 'depthError', 'magError', 'magNst', 'status', 'locationSource', 'magSource'], dtype='object')

In [14]:
# Number of missing values in each column.
df.isnull().sum()

Time                   0
Place                284
Latitude               0
Longitude              0
Depth                134
Mag                    0
MagType                0
nst                29858
gap                27244
dmin               32936
rms                17113
net                    0
ID                     0
Updated                0
Unnamed: 14        37331
Type                   0
horizontalError    33361
depthError         16504
magError           20780
magNst             31959
status                 0
locationSource         0
magSource              0
dtype: int64

In [15]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

**=====================================**

## Rename columns

In [16]:
# Show the current column labels again as a reference for the renaming step.
df.columns

Index(['Time', 'Place', 'Latitude', 'Longitude', 'Depth', 'Mag', 'MagType', 'nst', 'gap', 'dmin', 'rms', 'net', 'ID', 'Updated', 'Unnamed: 14', 'Type', 'horizontalError', 'depthError', 'magError', 'magNst', 'status', 'locationSource', 'magSource'], dtype='object')

In [17]:
# Rename the 'Depth' column to 'Deaths' (it becomes 'deaths' once the labels are lower-cased).
# NOTE(review): this looks like a mislabel. The column sits next to 'depthError' and holds
# values such as 38.62 and 74.32 (min -4, max 700 in describe()), which read like a
# hypocentre depth rather than a casualty count. Confirm the intent before changing it;
# later cells refer to the column as 'deaths'.
df = df.rename(columns =  {'Depth': 'Deaths',
                           
                          })

In [18]:
# Lower-case every column label; the cells that follow address columns by their
# lower-case names.
# Index.str.lower() is the vectorised pandas idiom and returns a concrete Index,
# instead of handing pandas a one-shot map() iterator to materialise.
df.columns = df.columns.str.lower()

In [19]:
# Confirm that the labels are now lower-case and the rename took effect.
df.columns

Index(['time', 'place', 'latitude', 'longitude', 'deaths', 'mag', 'magtype', 'nst', 'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'unnamed: 14', 'type', 'horizontalerror', 'deptherror', 'magerror', 'magnst', 'status', 'locationsource', 'magsource'], dtype='object')

In [20]:
# Preview the frame with its renamed, lower-cased columns.
df.head()

Unnamed: 0,time,place,latitude,longitude,deaths,mag,magtype,nst,gap,dmin,rms,net,id,updated,unnamed: 14,type,horizontalerror,deptherror,magerror,magnst,status,locationsource,magsource
0,2023-02-17 09:37:34.868000+00:00,"130 km SW of Tual, Indonesia",-6.6,132.08,38.62,6.1,mww,119.0,51.0,2.99,0.76,us,us6000jpl7,2023-02-17 17:58:24.040000+00:00,,earthquake,6.41,5.59,0.07,23.0,reviewed,us,us
1,2023-02-16 05:37:05.138000+00:00,"7 km SW of Port-Olry, Vanuatu",-15.09,167.03,36.03,5.6,mww,81.0,26.0,0.39,0.94,us,us6000jpb1,2023-02-17 05:41:32.448000+00:00,,earthquake,5.99,6.08,0.07,18.0,reviewed,us,us
2,2023-02-15 18:10:10.060000+00:00,"Masbate region, Philippines",12.32,123.87,20.09,6.1,mww,148.0,47.0,5.49,0.54,us,us6000jp76,2023-02-16 20:12:32.595000+00:00,,earthquake,8.61,4.4,0.04,71.0,reviewed,us,us
3,2023-02-15 06:38:09.034000+00:00,"54 km WNW of Otaki, New Zealand",-40.55,174.57,74.32,5.7,mww,81.0,40.0,0.77,1.15,us,us6000jp1g,2023-02-16 06:42:09.738000+00:00,,earthquake,3.68,4.92,0.07,23.0,reviewed,us,us
4,2023-02-14 13:16:51.072000+00:00,"2 km NW of Lele?ti, Romania",45.11,23.18,10.0,5.6,mww,132.0,28.0,1.2,0.4,us,us6000jnqz,2023-02-17 09:15:18.586000+00:00,,earthquake,4.85,1.79,0.03,95.0,reviewed,us,us


In [21]:
# Frequency of each reporting network in the 'locationsource' column.
df["locationsource"].value_counts()

us           23069
iscgem       12890
iscgemsup      587
ushis          189
ak              97
ci              94
nc              57
guc             45
a               44
doe             34
hv              22
unm             21
wel             18
pgc             16
isk             15
aeic            15
ath             14
ags             12
spe             10
teh              7
rom              7
uw               6
pr               6
nn               5
us_wel           4
casc             4
thr              3
jma              3
the              3
rspr             3
uu               3
g                2
official         2
tul              2
mdd              2
gcmt             2
car              1
tap              1
lim              1
h                1
u                1
ag               1
e                1
b                1
brk              1
csem             1
bou              1
se               1
ott              1
pt               1
beo              1
ren              1
sja         

In [23]:
#df.to_csv("noaa.csv", index=False)

**==============================================================================================================**

## Drop columns

In [24]:
# List the available columns before deciding which ones to drop.
df.columns

Index(['time', 'place', 'latitude', 'longitude', 'deaths', 'mag', 'magtype', 'nst', 'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'unnamed: 14', 'type', 'horizontalerror', 'deptherror', 'magerror', 'magnst', 'status', 'locationsource', 'magsource'], dtype='object')

In [25]:
# Preview the full-width frame one more time before dropping columns.
df.head()

Unnamed: 0,time,place,latitude,longitude,deaths,mag,magtype,nst,gap,dmin,rms,net,id,updated,unnamed: 14,type,horizontalerror,deptherror,magerror,magnst,status,locationsource,magsource
0,2023-02-17 09:37:34.868000+00:00,"130 km SW of Tual, Indonesia",-6.6,132.08,38.62,6.1,mww,119.0,51.0,2.99,0.76,us,us6000jpl7,2023-02-17 17:58:24.040000+00:00,,earthquake,6.41,5.59,0.07,23.0,reviewed,us,us
1,2023-02-16 05:37:05.138000+00:00,"7 km SW of Port-Olry, Vanuatu",-15.09,167.03,36.03,5.6,mww,81.0,26.0,0.39,0.94,us,us6000jpb1,2023-02-17 05:41:32.448000+00:00,,earthquake,5.99,6.08,0.07,18.0,reviewed,us,us
2,2023-02-15 18:10:10.060000+00:00,"Masbate region, Philippines",12.32,123.87,20.09,6.1,mww,148.0,47.0,5.49,0.54,us,us6000jp76,2023-02-16 20:12:32.595000+00:00,,earthquake,8.61,4.4,0.04,71.0,reviewed,us,us
3,2023-02-15 06:38:09.034000+00:00,"54 km WNW of Otaki, New Zealand",-40.55,174.57,74.32,5.7,mww,81.0,40.0,0.77,1.15,us,us6000jp1g,2023-02-16 06:42:09.738000+00:00,,earthquake,3.68,4.92,0.07,23.0,reviewed,us,us
4,2023-02-14 13:16:51.072000+00:00,"2 km NW of Lele?ti, Romania",45.11,23.18,10.0,5.6,mww,132.0,28.0,1.2,0.4,us,us6000jnqz,2023-02-17 09:15:18.586000+00:00,,earthquake,4.85,1.79,0.03,95.0,reviewed,us,us


In [26]:
# Drop the columns not carried forward, leaving time, latitude, longitude,
# deaths, mag and locationsource.
# The result is reassigned rather than mutated with inplace=True, and the
# explicit `columns=` keyword replaces axis=1.
columns_to_drop = [
    'place', 'magtype', 'nst', 'gap', 'dmin', 'rms', 'net', 'id', 'updated',
    'unnamed: 14', 'type', 'horizontalerror', 'deptherror', 'magerror',
    'magnst', 'status', 'magsource',
]
df = df.drop(columns=columns_to_drop)

In [27]:
# Check the remaining columns after the drop.
df.head()

Unnamed: 0,time,latitude,longitude,deaths,mag,locationsource
0,2023-02-17 09:37:34.868000+00:00,-6.6,132.08,38.62,6.1,us
1,2023-02-16 05:37:05.138000+00:00,-15.09,167.03,36.03,5.6,us
2,2023-02-15 18:10:10.060000+00:00,12.32,123.87,20.09,6.1,us
3,2023-02-15 06:38:09.034000+00:00,-40.55,174.57,74.32,5.7,us
4,2023-02-14 13:16:51.072000+00:00,45.11,23.18,10.0,5.6,us


In [32]:
#df.to_csv("noaa.csv", index=False)

**=============================================================================**