##  Earthquake

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv('C:/Users/Gugo_ML/Downloads/earthquake-database/database.csv')
data.head(5)

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status
0,01/02/1965,13:44:18,19.246,145.616,Earthquake,131.6,,,6.0,MW,...,,,,,,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic
1,01/04/1965,11:29:49,1.863,127.352,Earthquake,80.0,,,5.8,MW,...,,,,,,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic
2,01/05/1965,18:05:58,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,...,,,,,,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic
3,01/08/1965,18:49:43,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic
4,01/09/1965,13:32:50,11.938,126.427,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic


In [3]:
data = data.fillna(value=0)
data.head(5)

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status
0,01/02/1965,13:44:18,19.246,145.616,Earthquake,131.6,0.0,0.0,6.0,MW,...,0.0,0.0,0.0,0.0,0.0,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic
1,01/04/1965,11:29:49,1.863,127.352,Earthquake,80.0,0.0,0.0,5.8,MW,...,0.0,0.0,0.0,0.0,0.0,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic
2,01/05/1965,18:05:58,-20.579,-173.972,Earthquake,20.0,0.0,0.0,6.2,MW,...,0.0,0.0,0.0,0.0,0.0,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic
3,01/08/1965,18:49:43,-59.076,-23.557,Earthquake,15.0,0.0,0.0,5.8,MW,...,0.0,0.0,0.0,0.0,0.0,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic
4,01/09/1965,13:32:50,11.938,126.427,Earthquake,15.0,0.0,0.0,5.8,MW,...,0.0,0.0,0.0,0.0,0.0,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic


In [4]:
magnitude = data.Magnitude
y = magnitude.values
del data["Magnitude"]
del data['ID'] 

In [5]:
data['Date'] =  pd.to_datetime(data['Date'])
data['Time'] =  pd.to_datetime(data['Time'])
data["Date"] = data["Date"].map(str) + data["Time"].map(str)
data['Date'] =  pd.to_datetime(data['Date'])
del data["Time"]
data['Date'] = pd.to_numeric(data['Date'])

data.head(5)

Unnamed: 0,Date,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude Type,Magnitude Error,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,Source,Location Source,Magnitude Source,Status
0,-157569342000000000,19.246,145.616,Earthquake,131.6,0.0,0.0,MW,0.0,0.0,0.0,0.0,0.0,0.0,ISCGEM,ISCGEM,ISCGEM,Automatic
1,-157404611000000000,1.863,127.352,Earthquake,80.0,0.0,0.0,MW,0.0,0.0,0.0,0.0,0.0,0.0,ISCGEM,ISCGEM,ISCGEM,Automatic
2,-157294442000000000,-20.579,-173.972,Earthquake,20.0,0.0,0.0,MW,0.0,0.0,0.0,0.0,0.0,0.0,ISCGEM,ISCGEM,ISCGEM,Automatic
3,-157032617000000000,-59.076,-23.557,Earthquake,15.0,0.0,0.0,MW,0.0,0.0,0.0,0.0,0.0,0.0,ISCGEM,ISCGEM,ISCGEM,Automatic
4,-156965230000000000,11.938,126.427,Earthquake,15.0,0.0,0.0,MW,0.0,0.0,0.0,0.0,0.0,0.0,ISCGEM,ISCGEM,ISCGEM,Automatic


In [6]:
data.dtypes

Date                            int64
Latitude                      float64
Longitude                     float64
Type                           object
Depth                         float64
Depth Error                   float64
Depth Seismic Stations        float64
Magnitude Type                 object
Magnitude Error               float64
Magnitude Seismic Stations    float64
Azimuthal Gap                 float64
Horizontal Distance           float64
Horizontal Error              float64
Root Mean Square              float64
Source                         object
Location Source                object
Magnitude Source               object
Status                         object
dtype: object

In [7]:
label_encoder = LabelEncoder()
data.is_copy = False
data['Type'] = label_encoder.fit_transform(data['Type'])
data['Magnitude Type'] = label_encoder.fit_transform(data['Magnitude Type'].astype(str))
data['Source'] = label_encoder.fit_transform(data['Source'])
data['Location Source'] = label_encoder.fit_transform(data['Location Source'])
data['Magnitude Source'] = label_encoder.fit_transform(data['Magnitude Source'])
data['Status'] = label_encoder.fit_transform(data['Status'])

  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)


In [9]:
x = data.values 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=4)
models = {}

In [10]:
#Decision Tree
tree = GridSearchCV(DecisionTreeRegressor(), param_grid={'min_samples_split' : np.arange(10, 100, 20),
                    'max_depth': [9, 10, 11, 12],
                    'min_samples_leaf':[2, 3, 4, 5]}, cv=10, 
                    verbose=1, n_jobs=4, error_score='mse', return_train_score='mse')
tree.fit(x_train, y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done 289 tasks      | elapsed:    8.8s
[Parallel(n_jobs=4)]: Done 793 out of 800 | elapsed:   20.9s remaining:    0.1s
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:   21.0s finished


GridSearchCV(cv=10, error_score='mse',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'min_samples_split': array([10, 30, 50, 70, 90]), 'max_depth': [9, 10, 11, 12], 'min_samples_leaf': [2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='mse',
       scoring=None, verbose=1)

In [11]:
tree.best_params_

{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 70}

In [12]:
tree_predicts = tree.predict(x_test)
models['Decision Tree'] = mean_squared_error(y_test, tree_predicts)
print('Loss:', models['Decision Tree'])

Loss: 0.14236869453171838


In [13]:
#Random Forest
random_forest = GridSearchCV(RandomForestRegressor(), param_grid = {'n_estimators': [250, 300],
                    'min_samples_split': [2, 3, 4],
                    'min_samples_leaf':[2, 3, 4],
                    'max_depth' : [8, 10, 12],},
                    cv=10, error_score='mse',return_train_score='mse', verbose=1, n_jobs=4)
random_forest.fit(x_train, y_train)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 14.1min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 36.2min
[Parallel(n_jobs=4)]: Done 540 out of 540 | elapsed: 45.8min finished


GridSearchCV(cv=10, error_score='mse',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [250, 300], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [2, 3, 4], 'max_depth': [8, 10, 12]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='mse',
       scoring=None, verbose=1)

In [14]:
random_forest.best_params_

{'max_depth': 12,
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 300}

In [17]:
random_forest_predicts = random_forest.predict(x_test)
models["Random Forest"] = mean_squared_error(y_test, random_forest_predicts)
print("Loss: ", models["Random Forest"])

Loss:  0.12936396068126133


In [21]:
print("The best score is: {0} - {1}".format(min(models, key=models.get), models[min(models, key=models.get)]))

The best score is: Random Forest - 0.12936396068126133
