In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

## Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns',None)


In [2]:
earthquake_df = pd.read_csv('../dataset/final_dataset/raw_dataset_eda.csv')
earthquake_df.head()

Unnamed: 0,date_time,latitude,longitude,depth,magnitude,magnitude_type,nst,gap,depth_min,rms,net,id,updated_date,place,type,horizontal_error,depth_error,magnitude_error,magnitude_nst,status,location_source,magnitude_source,distance,url,gravity,force,year,month,day,hour,minutes,day_name
0,2011-01-01 09:56:58,-26.803,-63.136,576.8,7.0,mww,607.0,18.4,,0.85,us,usp000hsdc,2022-10-24T02:25:20.934Z,"26 km NNE of El Hoyo, Argentina",earthquake,,,,,reviewed,us,us,380543,https://geodesy.noaa.gov/api/gravd/gp?lat=-26....,9.818623,2.020842e+20,2011,1,1,9,56,Saturday
1,2011-01-01 09:59:36,-26.851,-63.217,592.9,5.78,mw,,,,,iscgem,iscgem16394566,2022-04-08T21:40:35.688Z,"19 km N of El Hoyo, Argentina",earthquake,,25.0,0.4,,reviewed,iscgem,iscgem,380548,https://geodesy.noaa.gov/api/gravd/gp?lat=-26....,9.79405,2.020789e+20,2011,1,1,9,59,Saturday
2,2011-01-02 09:23:13,-59.414,-24.469,35.0,5.1,mb,52.0,95.5,,1.2,us,usp000hsez,2014-11-07T01:43:20.059Z,South Sandwich Islands region,earthquake,,,,13.0,reviewed,us,us,383580,https://geodesy.noaa.gov/api/gravd/gp?lat=-59....,9.793398,1.988969e+20,2011,1,2,9,23,Sunday
3,2011-01-02 15:19:31,-4.458,101.428,21.6,5.7,mwb,271.0,28.9,,1.08,us,usp000hsfe,2022-04-08T21:28:10.812Z,"118 km SW of Bengkulu, Indonesia",earthquake,,3.2,,,reviewed,us,us,384394,https://geodesy.noaa.gov/api/gravd/gp?lat=-4.4...,9.802458,1.980554e+20,2011,1,2,15,19,Sunday
4,2011-01-02 20:20:17,-38.355,-73.326,24.0,7.2,mww,397.0,30.1,,1.1,us,usp000hsfq,2022-08-09T03:48:09.579Z,"42 km NNW of Carahue, Chile",earthquake,,,,,reviewed,us,us,385093,https://geodesy.noaa.gov/api/gravd/gp?lat=-38....,9.790649,1.973371e+20,2011,1,2,20,20,Sunday


### Remove column which are not important for observation and research

In [3]:
column_to_drop = [
    'nst', 
    'gap', 
    'rms', 
    'net', 
    'id', 
    'updated_date', 
    'place', 
    'type', 
    'horizontal_error', 
    'depth_error',
    'magnitude_error',
    'magnitude_nst',
    'status',
    'location_source',
    'magnitude_source',
    'url'
]
earthquake_df = earthquake_df.drop(columns=column_to_drop, axis=1 )

In [4]:
earthquake_df.head(2)

Unnamed: 0,date_time,latitude,longitude,depth,magnitude,magnitude_type,depth_min,distance,gravity,force,year,month,day,hour,minutes,day_name
0,2011-01-01 09:56:58,-26.803,-63.136,576.8,7.0,mww,,380543,9.818623,2.020842e+20,2011,1,1,9,56,Saturday
1,2011-01-01 09:59:36,-26.851,-63.217,592.9,5.78,mw,,380548,9.79405,2.020789e+20,2011,1,1,9,59,Saturday


#### Divide Dataset into Train and Test
Always remember there way always be a chance of data leakage and overfitting so we need to split the data first and then apply feature Engineering to minimize the chances of data leakage and overfitting.

In [5]:
# train_df, test_df = train_test_split(earthquake_df, test_size=0.2, random_state=0)

In [6]:
# train_df.shape, test_df.shape

((14447, 16), (3612, 16))

#### Save the training and testing sets into separate CSV files

In [7]:
# train_df.to_csv('../dataset/final_dataset/train.csv', index=False)
# test_df.to_csv('../dataset/final_dataset/test.csv', index=False)

In [8]:
training_df = pd.read_csv('../dataset/final_dataset/train.csv')
training_df.head()

Unnamed: 0,date_time,latitude,longitude,depth,magnitude,magnitude_type,depth_min,distance,gravity,force,year,month,day,hour,minutes,day_name
0,2012-04-03 07:25:08,-19.638,-69.093,117.0,5.2,mb,,374151,9.778848,2.09048e+20,2012,4,3,7,25,Tuesday
1,2020-08-15 22:29:20,-22.7378,-112.2942,10.0,5.1,mb,37.584,382488,9.780385,2.000342e+20,2020,8,15,22,29,Saturday
2,2014-10-06 14:04:05,15.4121,147.2265,4.0,5.7,mww,1.389,362501,9.788502,2.227007e+20,2014,10,6,14,4,Monday
3,2013-09-05 15:27:03,-7.2654,144.0324,9.63,5.3,mwb,3.754,391734,9.811485,1.907029e+20,2013,9,5,15,27,Thursday
4,2013-07-19 14:05:31,-8.719,157.63,10.0,5.2,mb,,362789,9.78195,2.223472e+20,2013,7,19,14,5,Friday


In [9]:
testing_df = pd.read_csv('../dataset/final_dataset/test.csv')

In [10]:
print('training_df: ', training_df.shape)
print('testing_df: ',testing_df.shape)

training_df:  (14447, 16)
testing_df:  (3612, 16)


In [11]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14447 entries, 0 to 14446
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date_time       14447 non-null  object 
 1   latitude        14447 non-null  float64
 2   longitude       14447 non-null  float64
 3   depth           14447 non-null  float64
 4   magnitude       14447 non-null  float64
 5   magnitude_type  14447 non-null  object 
 6   depth_min       11035 non-null  float64
 7   distance        14447 non-null  int64  
 8   gravity         14447 non-null  float64
 9   force           14447 non-null  float64
 10  year            14447 non-null  int64  
 11  month           14447 non-null  int64  
 12  day             14447 non-null  int64  
 13  hour            14447 non-null  int64  
 14  minutes         14447 non-null  int64  
 15  day_name        14447 non-null  object 
dtypes: float64(7), int64(6), object(3)
memory usage: 1.8+ MB


## Feature Engineering for Training Dataset 

### Handel Missing Value

#### Missing value for Categorical Feature

In [12]:
def fun_cat_feature_nan(dataset, data_type):
    
    cat_feature_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum()> 1 and dataset[feature].dtypes == data_type]
    
    print("Categorical Feature with NAN :",cat_feature_nan)

    for feature in cat_feature_nan:
        return (print("{}: {}% missing values".format(feature,np.round(dataset[feature].isnull().mean(),4))))
    

In [13]:
# Training 
fun_cat_feature_nan(training_df, 'O')

# Testing
fun_cat_feature_nan(testing_df, 'O')


Categorical Feature with NAN : []
Categorical Feature with NAN : []


##### There is not any missing value in Categorical Feature

In [14]:
# If there were missing categorical feature use below function to replace with new label

## Replace missing value with a new label
# def replace_cat_feature(dataset,features_nan):
#     data=dataset.copy()
#     data[features_nan]=data[features_nan].fillna('Missing')
#     return data

# dataset=replace_cat_feature(training_df,cat_feature_nan)

# dataset[features_nan].isnull().sum()

#### Missing value for Numerical Feature

In [15]:
def fun_num_feature_nan(dataset, data_type):
    
    num_feature_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum()> 1 and dataset[feature].dtypes != data_type]
    
    print("Numerical Feature with NAN :",num_feature_nan)

    for feature in num_feature_nan:
        print("{}: {}% missing values".format(feature,np.round(dataset[feature].isnull().mean(),4)))
        return num_feature_nan

# Training
numerical_feat_with_nan = fun_num_feature_nan(training_df, 'O')

Numerical Feature with NAN : ['depth_min']
depth_min: 0.2362% missing values


In [16]:
# Testing
numerical_feat_with_nan_test = fun_num_feature_nan(testing_df, 'O')


Numerical Feature with NAN : ['depth_min']
depth_min: 0.2281% missing values


In [17]:
## Replacing the numerical Missing Values

def replace_num_feature(dataset, feature_nan):
    for feature in feature_nan:
        ## We will replace by using median since there are outliers
        median_value=dataset[feature].median()
        dataset[feature].fillna(median_value,inplace=True)
        return dataset

training_df = replace_num_feature(training_df, numerical_feat_with_nan)    

In [18]:
training_df[numerical_feat_with_nan].isnull().sum()

depth_min    0
dtype: int64

In [19]:
# Testing
testing_df = replace_num_feature(testing_df, numerical_feat_with_nan_test) 

In [20]:
testing_df[numerical_feat_with_nan_test].isnull().sum()

depth_min    0
dtype: int64

### Perform log transformation to make skewed data to form Gaussian distribution

In [21]:
numerical_feature = ['depth','depth_min','distance','gravity','force','day']
for feature in numerical_feature:
    training_df[feature]=np.log(training_df[feature])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [22]:
training_df.head()

Unnamed: 0,date_time,latitude,longitude,depth,magnitude,magnitude_type,depth_min,distance,gravity,force,year,month,day,hour,minutes,day_name
0,2012-04-03 07:25:08,-19.638,-69.093,4.762174,5.2,mb,0.955511,12.832415,2.280222,46.789096,2012,4,1.098612,7,25,Tuesday
1,2020-08-15 22:29:20,-22.7378,-112.2942,2.302585,5.1,mb,3.626578,12.854453,2.280379,46.74502,2020,8,2.70805,22,29,Saturday
2,2014-10-06 14:04:05,15.4121,147.2265,1.386294,5.7,mww,0.328584,12.800783,2.281208,46.85236,2014,10,1.791759,14,4,Monday
3,2013-09-05 15:27:03,-7.2654,144.0324,2.264883,5.3,mwb,1.322822,12.878338,2.283554,46.697249,2013,9,1.609438,15,27,Thursday
4,2013-07-19 14:05:31,-8.719,157.63,2.302585,5.2,mb,0.955511,12.801577,2.280539,46.850772,2013,7,2.944439,14,5,Friday


In [23]:
# Training
for feature in numerical_feature:
    testing_df[feature]=np.log(testing_df[feature])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [24]:
testing_df.head()

Unnamed: 0,date_time,latitude,longitude,depth,magnitude,magnitude_type,depth_min,distance,gravity,force,year,month,day,hour,minutes,day_name
0,2022-09-16 04:36:30,11.8258,141.2532,2.302585,5.1,mb,1.370927,12.892351,2.281258,46.669223,2022,9,2.772589,4,36,Friday
1,2019-01-17 12:56:37,29.6356,-114.1301,2.302585,5.3,mww,0.425268,12.829602,2.280469,46.794722,2019,1,2.833213,12,56,Thursday
2,2023-04-03 21:44:13,14.3888,56.3258,2.302585,5.5,mww,2.351661,12.896644,2.280648,46.660637,2023,4,1.098612,21,44,Monday
3,2021-11-14 12:07:03,27.7158,56.0743,2.197225,6.0,mwb,1.015593,12.881786,2.280301,46.690353,2021,11,2.639057,12,7,Sunday
4,2013-09-03 22:29:31,51.3277,-130.2195,2.287471,5.5,mwr,0.447886,12.893056,2.282069,46.667813,2013,9,1.098612,22,29,Tuesday


### Handling Rare Categorical Feature
Remove categorical variables that are present less than 1% of the observation

In [25]:
categorical_feat = [feature for feature in training_df.columns if training_df[feature].dtype == 'O' and feature != 'date_time' ]
for feature in categorical_feat:
    data = training_df.groupby(feature)['magnitude'].count()/len(training_df)
    data_df = data[data > 0.01].index
    training_df[feature] = np.where(training_df[feature].isin(data_df), training_df[feature],'rare_mag')
    

In [26]:
# Testing
categorical_feat_test = [feature for feature in testing_df.columns if testing_df[feature].dtype == 'O' and feature != 'date_time' ]
for feature in categorical_feat_test:
    data = testing_df.groupby(feature)['magnitude'].count()/len(testing_df)
    data_df = data[data > 0.01].index
    testing_df[feature] = np.where(testing_df[feature].isin(data_df), testing_df[feature],'rare_mag')

### Handling Categorical Feature into numerical Variable

In [27]:
 from sklearn.preprocessing import LabelEncoder

In [28]:
l_encoder = LabelEncoder()
training_df['magnitude_type'] = l_encoder.fit_transform(training_df['magnitude_type'])
training_df['day_name'] = l_encoder.fit_transform(training_df['day_name'])

In [29]:
training_df.head(2)

Unnamed: 0,date_time,latitude,longitude,depth,magnitude,magnitude_type,depth_min,distance,gravity,force,year,month,day,hour,minutes,day_name
0,2012-04-03 07:25:08,-19.638,-69.093,4.762174,5.2,0,0.955511,12.832415,2.280222,46.789096,2012,4,1.098612,7,25,5
1,2020-08-15 22:29:20,-22.7378,-112.2942,2.302585,5.1,0,3.626578,12.854453,2.280379,46.74502,2020,8,2.70805,22,29,2


In [30]:
#Testing
testing_df['magnitude_type'] = l_encoder.fit_transform(testing_df['magnitude_type'])
testing_df['day_name'] = l_encoder.fit_transform(testing_df['day_name'])

In [31]:
testing_df.head(2)

Unnamed: 0,date_time,latitude,longitude,depth,magnitude,magnitude_type,depth_min,distance,gravity,force,year,month,day,hour,minutes,day_name
0,2022-09-16 04:36:30,11.8258,141.2532,2.302585,5.1,0,1.370927,12.892351,2.281258,46.669223,2022,9,2.772589,4,36,0
1,2019-01-17 12:56:37,29.6356,-114.1301,2.302585,5.3,4,0.425268,12.829602,2.280469,46.794722,2019,1,2.833213,12,56,4


### Feature Scaling
We use StandardScalar to scale the test data because feature has different scale like km, m/s2, latitude and longtitude in degree

    - Fit and Transform on Train data
    - Transform and Predict on Test data because we don't need to scale test data further

In [32]:
feature_scale = [feature for feature in training_df.columns if feature not in ['date_time','magnitude']]
feature_scale

['latitude',
 'longitude',
 'depth',
 'magnitude_type',
 'depth_min',
 'distance',
 'gravity',
 'force',
 'year',
 'month',
 'day',
 'hour',
 'minutes',
 'day_name']

In [33]:
# Function to find and fix columns with infinite values or very large values
def find_and_fix_problematic_features(df):
    problematic_features = []
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            if np.any(np.isinf(df[column])) or np.any(np.abs(df[column]) > np.finfo(np.float64).max):
                problematic_features.append(column)
                # Replace infinite values with NaN
                df[column].replace([np.inf, -np.inf], np.nan, inplace=True)
                # Optionally fill NaNs with the mean of the column
                df[column].fillna(df[column].mean(), inplace=True)
    return problematic_features

# Fix problematic features
problematic_features = find_and_fix_problematic_features(training_df)
print("Fixed problematic features:", problematic_features)

# Convert non-numeric columns to numeric (if applicable)
for column in [feature for feature in training_df.columns if feature not in ['date_time']]:
    if not pd.api.types.is_numeric_dtype(training_df[column]):
        training_df[column] = pd.to_numeric(training_df[column], errors='coerce')

Fixed problematic features: ['depth', 'depth_min']


In [34]:
# Fix problematic features
problematic_features = find_and_fix_problematic_features(testing_df)
print("Fixed problematic features:", problematic_features)

Fixed problematic features: ['depth']


In [35]:
# Convert non-numeric columns to numeric (if applicable)
for column in [feature for feature in testing_df.columns if feature not in ['date_time']]:
    if not pd.api.types.is_numeric_dtype(testing_df[column]):
        testing_df[column] = pd.to_numeric(testing_df[column], errors='coerce')

In [36]:
training_df.columns

Index(['date_time', 'latitude', 'longitude', 'depth', 'magnitude',
       'magnitude_type', 'depth_min', 'distance', 'gravity', 'force', 'year',
       'month', 'day', 'hour', 'minutes', 'day_name'],
      dtype='object')

In [37]:
from sklearn.preprocessing import StandardScaler

In [38]:
scaler = StandardScaler()

# Fit and transform for the training data
training_df_scaled = scaler.fit(training_df[feature_scale])
training_df_scaled = scaler.transform(training_df[feature_scale])

# Transform only for the testing data
testing_df_scaled = scaler.transform(testing_df[feature_scale])

In [41]:
## add date_time and magnitude on scaled data to form a dataframe
# Transform is done during concatination

training_data = pd.concat([training_df[['date_time', 'magnitude']].reset_index(drop=True),
                    pd.DataFrame(training_df_scaled, columns=feature_scale)],
                    axis=1)

testing_data = pd.concat([testing_df[['date_time', 'magnitude']].reset_index(drop=True),
                    pd.DataFrame(testing_df_scaled, columns=feature_scale)],
                    axis=1)

In [42]:
training_data.head()

Unnamed: 0,date_time,magnitude,latitude,longitude,depth,magnitude_type,depth_min,distance,gravity,force,year,month,day,hour,minutes,day_name
0,2012-04-03 07:25:08,5.2,-0.594496,-0.864363,1.419168,-1.257601,0.030465,-0.702866,-1.083048,0.702866,-1.264868,-0.659727,-1.679736,-0.644475,-0.237976,0.99541
1,2020-08-15 22:29:20,5.1,-0.694799,-1.214476,-0.782342,-1.257601,2.867303,-0.138567,-0.957501,0.138567,0.735685,0.484529,0.271565,1.513084,-0.007552,-0.491273
2,2014-10-06 14:04:05,5.7,0.539654,0.888743,-1.602489,0.890936,-0.63537,-1.512839,-0.295189,1.512839,-0.76473,1.056657,-0.839356,0.362386,-1.447702,-0.986834
3,2013-09-05 15:27:03,5.3,-0.194144,0.862857,-0.816088,-0.720467,0.420572,0.47305,1.57738,-0.47305,-1.014799,0.770593,-1.060405,0.506223,-0.122764,0.499849
4,2013-07-19 14:05:31,5.2,-0.241179,0.973055,-0.782342,-1.257601,0.030465,-1.492504,-0.829802,1.492504,-1.014799,0.198465,0.558166,0.362386,-1.390096,-1.482395


In [43]:
testing_data.head()

Unnamed: 0,date_time,magnitude,latitude,longitude,depth,magnitude_type,depth_min,distance,gravity,force,year,month,day,hour,minutes,day_name
0,2022-09-16 04:36:30,5.1,0.423608,0.840334,-0.782342,-1.257601,0.471662,0.831866,-0.255875,-0.831866,1.235823,0.770593,0.349812,-1.075987,0.39569,-1.482395
1,2019-01-17 12:56:37,5.3,0.999897,-1.229354,-0.782342,0.890936,-0.532686,-0.774895,-0.885216,0.774895,0.485616,-1.517919,0.423314,0.074711,1.54781,0.499849
2,2023-04-03 21:44:13,5.5,0.506542,0.152062,-0.782342,0.890936,1.513262,0.941785,-0.742771,-0.941785,1.485892,-0.659727,-1.679736,1.369247,0.856538,-0.986834
3,2021-11-14 12:07:03,6.0,0.937776,0.150023,-0.876648,-0.720467,0.094276,0.561338,-1.019693,-0.561338,0.985754,1.342721,0.187917,0.074711,-1.274884,0.004288
4,2013-09-03 22:29:31,5.5,1.701809,-1.359747,-0.79587,0.353801,-0.508664,0.849907,0.39194,-0.849907,-1.014799,0.770593,-1.679736,1.513084,-0.007552,0.99541


In [44]:
# # Training Data after Feature Engineering
# training_data.to_csv('../dataset/final_dataset/training_data.csv', index=False)

# # Testing Data after Feature Engineering
# testing_data.to_csv('../dataset/final_dataset/testing_data.csv', index=False)

In [45]:
print(training_data.shape)
print(testing_data.shape)

(14447, 16)
(3612, 16)
