In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Show every column when a DataFrame is rendered (None removes the column limit).
# Uses the public `pd.set_option` entry point instead of going through the
# incidental `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)


In [2]:
# Read the EDA-stage earthquake dataset from disk and preview its first five rows.
earthquake_df = pd.read_csv(
    filepath_or_buffer='../dataset/final_dataset/raw_dataset_eda.csv',
)
earthquake_df.head(n=5)

Unnamed: 0,date_time,latitude,longitude,depth,magnitude,magnitude_type,nst,gap,depth_min,rms,net,id,updated_date,place,type,horizontal_error,depth_error,magnitude_error,magnitude_nst,status,location_source,magnitude_source,distance,url,gravity,force,year,month,day,hour,minutes,day_name
0,2011-01-01 09:56:58,-26.803,-63.136,576.8,7.0,mww,607.0,18.4,,0.85,us,usp000hsdc,2022-10-24T02:25:20.934Z,"26 km NNE of El Hoyo, Argentina",earthquake,,,,,reviewed,us,us,380543,https://geodesy.noaa.gov/api/gravd/gp?lat=-26....,9.818623,2.020842e+20,2011,1,1,9,56,Saturday
1,2011-01-01 09:59:36,-26.851,-63.217,592.9,5.78,mw,,,,,iscgem,iscgem16394566,2022-04-08T21:40:35.688Z,"19 km N of El Hoyo, Argentina",earthquake,,25.0,0.4,,reviewed,iscgem,iscgem,380548,https://geodesy.noaa.gov/api/gravd/gp?lat=-26....,9.79405,2.020789e+20,2011,1,1,9,59,Saturday
2,2011-01-02 09:23:13,-59.414,-24.469,35.0,5.1,mb,52.0,95.5,,1.2,us,usp000hsez,2014-11-07T01:43:20.059Z,South Sandwich Islands region,earthquake,,,,13.0,reviewed,us,us,383580,https://geodesy.noaa.gov/api/gravd/gp?lat=-59....,9.793398,1.988969e+20,2011,1,2,9,23,Sunday
3,2011-01-02 15:19:31,-4.458,101.428,21.6,5.7,mwb,271.0,28.9,,1.08,us,usp000hsfe,2022-04-08T21:28:10.812Z,"118 km SW of Bengkulu, Indonesia",earthquake,,3.2,,,reviewed,us,us,384394,https://geodesy.noaa.gov/api/gravd/gp?lat=-4.4...,9.802458,1.980554e+20,2011,1,2,15,19,Sunday
4,2011-01-02 20:20:17,-38.355,-73.326,24.0,7.2,mww,397.0,30.1,,1.1,us,usp000hsfq,2022-08-09T03:48:09.579Z,"42 km NNW of Carahue, Chile",earthquake,,,,,reviewed,us,us,385093,https://geodesy.noaa.gov/api/gravd/gp?lat=-38....,9.790649,1.973371e+20,2011,1,2,20,20,Sunday


### Remove columns that are not important for observation and research

In [3]:

# Columns that are discarded before feature engineering.
column_to_drop = [
    'nst', 'gap', 'rms', 'net', 'id', 'updated_date', 'place', 'type',
    'horizontal_error', 'depth_error', 'magnitude_error', 'magnitude_nst',
    'status', 'location_source', 'magnitude_source', 'url', 'depth_min',
    'day_name', 'date_time',
]
# NOTE: the frame is reassigned under the same name, so re-running this cell
# raises a KeyError because the listed columns are already gone.
earthquake_df = earthquake_df.drop(column_to_drop, axis='columns')

In [4]:
# Preview the first two rows to confirm which columns remain after the drop.
earthquake_df.head(n=2)

Unnamed: 0,latitude,longitude,depth,magnitude,magnitude_type,distance,gravity,force,year,month,day,hour,minutes
0,-26.803,-63.136,576.8,7.0,mww,380543,9.818623,2.020842e+20,2011,1,1,9,56
1,-26.851,-63.217,592.9,5.78,mw,380548,9.79405,2.020789e+20,2011,1,1,9,59


### Bringing features onto the same scale

## Feature Engineering for Training Dataset 

### Handle Missing Values

#### Missing Values in Categorical Features

In [5]:
# Count the missing values in every column of the frame; this covers the
# categorical column (magnitude_type) as well as the numerical ones.
earthquake_df.isna().sum()

latitude          0
longitude         0
depth             0
magnitude         0
magnitude_type    0
distance          0
gravity           0
force             0
year              0
month             0
day               0
hour              0
minutes           0
dtype: int64

##### There are no missing values in the categorical features

#### Missing Values in Numerical Features

In [6]:
# Count the missing values per column, restricted to the non-object (numerical) columns.
numeric_only = earthquake_df.select_dtypes(exclude=object)
numeric_only.isna().sum()

latitude     0
longitude    0
depth        0
magnitude    0
distance     0
gravity      0
force        0
year         0
month        0
day          0
hour         0
minutes      0
dtype: int64

### Apply a log transformation so that skewed features more closely follow a Gaussian distribution

In [7]:
# Log-transform the skewed numerical features.
#
# np.log is only defined for strictly positive inputs: on zero or negative
# values it emits a RuntimeWarning and returns -inf / NaN. Those values are
# masked to NaN explicitly before the transform, so the cell runs without
# warnings and the invalid entries are plain missing values.
# The np.isfinite row filter applied in a later cell removes such rows either
# way, so the final dataset is the same as before.
#
# NOTE: the columns are overwritten in place; re-running this cell would apply
# the logarithm a second time.
numerical_feature = ['depth', 'distance', 'gravity', 'force']
for feature in numerical_feature:
    # Keep strictly positive values, turn everything else into NaN.
    positive_values = earthquake_df[feature].where(earthquake_df[feature] > 0)
    earthquake_df[feature] = np.log(positive_values)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
earthquake_df.head()

Unnamed: 0,latitude,longitude,depth,magnitude,magnitude_type,distance,gravity,force,year,month,day,hour,minutes
0,-26.803,-63.136,6.357496,7.0,mww,12.849354,2.284281,46.755216,2011,1,1,9,56
1,-26.851,-63.217,6.385026,5.78,mw,12.849368,2.281775,46.75519,2011,1,1,9,59
2,-59.414,-24.469,3.555348,5.1,mb,12.857303,2.281708,46.739318,2011,1,2,9,23
3,-4.458,101.428,3.072693,5.7,mwb,12.859423,2.282633,46.735078,2011,1,2,15,19
4,-38.355,-73.326,3.178054,7.2,mww,12.86124,2.281428,46.731445,2011,1,2,20,20


#### Handling Rare Categorical Feature

Replace categorical values that account for 1% or less of the observations with a single rare category


In [9]:
# For each magnitude type, flag whether it accounts for more than 1% of all rows.
magnitude_type = (
    earthquake_df['magnitude_type']
    .value_counts()
    .div(len(earthquake_df))
    .mul(100)
    .gt(1)
)

# Types that do not exceed the 1% threshold are collapsed into one 'rare_mt' bucket.
false_types = magnitude_type.index[~magnitude_type].tolist()
is_rare = earthquake_df['magnitude_type'].isin(false_types)
earthquake_df['magnitude_type'] = earthquake_df['magnitude_type'].where(~is_rare, 'rare_mt')

# Show the category frequencies after the replacement.
earthquake_df['magnitude_type'].value_counts()

magnitude_type
mww        9239
mb         6237
mwc        1345
mwb         857
mwr         200
rare_mt     181
Name: count, dtype: int64

### Converting Categorical Features into Numerical Variables

In [10]:

# One-hot encode magnitude_type and put the indicator columns in place of the
# original categorical column.
magnitude_dummies = pd.get_dummies(earthquake_df['magnitude_type'])
earthquake_df = pd.concat(
    [earthquake_df.drop(columns='magnitude_type'), magnitude_dummies],
    axis=1,
)

# Any boolean columns (such as the indicators above) are stored as 0/1 integers.
boolean_columns = earthquake_df.select_dtypes(include='bool').columns
earthquake_df[boolean_columns] = earthquake_df[boolean_columns].astype(int)

# Keep only the rows in which every value is finite; rows containing NaN or
# +/-inf (for example from the earlier log transform) are discarded here.
finite_rows = np.isfinite(earthquake_df).all(axis=1)
earthquake_df = earthquake_df[finite_rows]

In [11]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# partition reproducible across runs.
train_df_scaled, test_df_scaled = train_test_split(
    earthquake_df,
    test_size=0.2,
    random_state=101,
)

Unnamed: 0,latitude,longitude,depth,magnitude,distance,gravity,force,year,month,day,hour,minutes,mb,mwb,mwc,mwr,mww,rare_mt
5797,-6.7306,155.0899,3.555348,5.1,12.808255,2.280247,46.837415,2014,12,23,21,19,1,0,0,0,0,0
3186,2.4210,90.4340,3.430756,5.1,12.866583,2.282207,46.720759,2012,12,6,2,24,1,0,0,0,0,0
13233,-20.3307,-173.8138,2.302585,5.1,12.821645,2.281148,46.810636,2021,2,3,11,11,1,0,0,0,0,0
16503,-59.4030,-17.9994,2.302585,5.9,12.913470,2.280851,46.626985,2023,3,3,4,53,0,0,0,0,1,0
8853,-54.3240,159.1913,2.302585,5.1,12.865658,2.280331,46.722609,2017,6,1,20,4,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5698,5.7531,61.3264,2.302585,5.4,12.821045,2.280327,46.811836,2014,11,28,13,23,0,0,1,0,0,0
8012,44.0505,148.1035,3.367296,5.9,12.863841,2.282966,46.726243,2016,10,23,20,25,0,0,0,0,1,0
17751,8.5266,126.4161,3.688879,7.6,12.902644,2.283954,46.648637,2023,12,2,14,37,0,0,0,0,1,0
17937,-53.1275,-117.7908,2.302585,5.4,12.883997,2.283723,46.685931,2023,12,27,15,36,0,0,0,0,1,0


In [14]:
# Persist the train/test partitions produced by the train_test_split cell above.
# The split is not repeated here: calling train_test_split again on the same
# frame with the same random_state only recomputed identical partitions.
# NOTE(review): despite the `_scaled` suffix, no scaling step is visible in
# this notebook — confirm whether one is expected before these files are used.
output_dir = Path('../dataset/final_dataset/gkm')

# to_csv does not create missing folders, so make sure the target exists first.
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the shuffled row labels out of the saved files.
train_df_scaled.to_csv(output_dir / 'train.csv', index=False)
test_df_scaled.to_csv(output_dir / 'test.csv', index=False)