## Build Model with Reduced Feature Set

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.decomposition import PCA

In [5]:
# Import scaled earthquake data with reduced feature set
# (the path is relative to the notebook's working directory)
df = pd.read_csv('../../Resources/earthquake_data_reduced.csv')
# Preview the first rows, then print column dtypes and non-null counts
display(df.head())
df.info()

Unnamed: 0,magnitude,depth,latitude,soil_density,nodal_plane_1_strike,percent_double_couple,min_station_distance,mmi_class
0,1.259068,-0.138401,0.217395,-1.885829,-0.187953,-0.131701,-0.621166,2
1,0.278647,2.957343,1.453026,0.643055,-0.143028,-0.016844,0.182939,0
2,0.8669,1.29798,1.705352,-0.257281,-0.187953,-0.045558,2.000126,0
3,0.808074,-0.776489,0.63995,1.440778,0.620696,-0.138879,-0.566141,0
4,0.8669,-0.567035,0.91787,-0.138119,1.56412,-0.009665,1.537912,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   magnitude              957 non-null    float64
 1   depth                  957 non-null    float64
 2   latitude               957 non-null    float64
 3   soil_density           957 non-null    float64
 4   nodal_plane_1_strike   957 non-null    float64
 5   percent_double_couple  957 non-null    float64
 6   min_station_distance   957 non-null    float64
 7   mmi_class              957 non-null    int64  
dtypes: float64(7), int64(1)
memory usage: 59.9 KB


In [6]:
# Create X (features) and y (target)
# `columns=` already identifies what to drop, so no `axis` argument is needed.
X = df.drop(columns='mmi_class')
# Keep y as a 1-D Series: scikit-learn classifiers expect a 1-D target, and a
# reshaped (n, 1) column vector would trigger a DataConversionWarning on fit.
y = df['mmi_class']
# Preview both pieces to confirm the target column was separated correctly
display(X.head())
display(y.head())

Unnamed: 0,magnitude,depth,latitude,soil_density,nodal_plane_1_strike,percent_double_couple,min_station_distance
0,1.259068,-0.138401,0.217395,-1.885829,-0.187953,-0.131701,-0.621166
1,0.278647,2.957343,1.453026,0.643055,-0.143028,-0.016844,0.182939
2,0.8669,1.29798,1.705352,-0.257281,-0.187953,-0.045558,2.000126
3,0.808074,-0.776489,0.63995,1.440778,0.620696,-0.138879,-0.566141
4,0.8669,-0.567035,0.91787,-0.138119,1.56412,-0.009665,1.537912


0    2
1    0
2    0
3    0
4    1
Name: mmi_class, dtype: int64

In [7]:
# Split into train and test sets
# random_state=1 fixes the shuffle so the split is reproducible.
# No test_size is passed; in the recorded run 240 of the 957 rows ended up in
# the test set. No `stratify` argument is passed either, so class proportions
# in the two sets are whatever the shuffle produces.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Apply Randomized Search for the Best Hyperparameters

In [8]:
# Search space for the randomized hyperparameter search.
# Each key is a RandomForestClassifier constructor argument; each value is the
# collection of candidate settings to sample from.
param_grid = {
    # number of trees: 100, 110, ..., 990
    'n_estimators': range(100, 1000, 10),
    # how many features are considered at each split
    'max_features': ['sqrt', 'log2'],
    # maximum tree depth: odd values 1, 3, ..., 39
    'max_depth': range(1, 40, 2),
    # minimum samples required to split an internal node
    'min_samples_split': [2, 3, 4, 5],
    # minimum samples required at a leaf
    'min_samples_leaf': [1, 2, 4],
    # whether each tree is trained on a bootstrap sample
    'bootstrap': [True, False],
}

# Base estimator handed to the search; the fixed seed keeps forest
# construction repeatable between runs.
rfc = RandomForestClassifier(random_state=0)

In [9]:
# Build the randomized search around the base forest.
# Neither n_iter nor cv is specified; the recorded run evaluated 10 sampled
# candidates with 5-fold cross-validation (50 fits in total).
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default scorer, while the evaluation cell reports balanced
# accuracy -- confirm that this mismatch is intended.
random_rfc = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_grid,
    random_state=0,  # makes the sampled candidates repeatable
    verbose=3,       # log every cross-validation fit with its score and time
)

# Run the search on the training split only, so the test split stays unseen
random_rfc.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.597 total time=   0.9s
[CV 2/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.590 total time=   0.7s
[CV 3/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.594 total time=   0.7s
[CV 4/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.636 total time=   0.7s
[CV 5/5] END bootstrap=False, max_depth=23, max_features=log2, min_samples_leaf=1, min_samples_split=4, n_estimators=580;, score=0.685 total time=   0.7s
[CV 1/5] END bootstrap=False, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=170;, score=0.507 total time=   0.0s
[CV 2/5] END boo

In [10]:
# Print the hyperparameter combination selected by the randomized search
# (the best_params_ attribute of the fitted search object)
print(random_rfc.best_params_)

{'n_estimators': 630, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 39, 'bootstrap': True}


In [11]:
# Make predictions with the hypertuned model
# (calling predict on the fitted search object; both splits are predicted so
# that the train and test scores can be compared for overfitting)
random_rfc_train_predict = random_rfc.predict(X_train)
random_rfc_test_predict = random_rfc.predict(X_test)

# Calculate and print the balanced accuracies
# NOTE: the printed labels read "Accuracy", but the values come from
# balanced_accuracy_score, not plain accuracy.
print(f"Accuracy - Train: {balanced_accuracy_score(y_train, random_rfc_train_predict):.3f}")
print(f"Accuracy - Test: {balanced_accuracy_score(y_test, random_rfc_test_predict):.3f}")

# Calculate the classification report
# (per-class precision / recall / f1 on the held-out test set)
print(classification_report(y_test, random_rfc_test_predict))

Accuracy - Train: 0.903
Accuracy - Test: 0.631
              precision    recall  f1-score   support

           0       0.69      0.66      0.67        89
           1       0.54      0.63      0.58        83
           2       0.72      0.60      0.66        68

    accuracy                           0.63       240
   macro avg       0.65      0.63      0.64       240
weighted avg       0.64      0.63      0.64       240



### Refine Hyperparameter Set Using Grid Search

In [16]:
# Narrowed hyperparameter grid for the exhaustive follow-up search.
# Only n_estimators and max_depth vary; every other setting is pinned to the
# value chosen by the randomized search.
# NOTE: this rebinds `param_grid`, replacing the randomized-search space
# defined earlier in the notebook.
param_grid = {
    # 625, 626, ..., 634 (range end is exclusive)
    'n_estimators': range(625, 635),
    'max_features': ['sqrt'],
    # 35, 36, ..., 39 (range end is exclusive)
    'max_depth': range(35, 40),
    'min_samples_split': [4],
    'min_samples_leaf': [4],
    'bootstrap': [True],
}

# Fresh base estimator for the grid search, seeded for repeatability
rfc2 = RandomForestClassifier(random_state=0)

In [17]:
# Use an exhaustive grid search this time: every combination in the narrowed
# grid is evaluated (the recorded run tried 50 candidates with 5 folds each,
# 250 fits in total).
grid_rfc2 = GridSearchCV(
    estimator=rfc2,
    param_grid=param_grid,
    verbose=3,  # log every cross-validation fit with its score and time
)

# Fit the grid search estimator on the training data
grid_rfc2.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END bootstrap=True, max_depth=35, max_features=sqrt, min_samples_leaf=4, min_samples_split=4, n_estimators=625;, score=0.646 total time=   0.9s
[CV 2/5] END bootstrap=True, max_depth=35, max_features=sqrt, min_samples_leaf=4, min_samples_split=4, n_estimators=625;, score=0.618 total time=   0.8s
[CV 3/5] END bootstrap=True, max_depth=35, max_features=sqrt, min_samples_leaf=4, min_samples_split=4, n_estimators=625;, score=0.622 total time=   0.8s
[CV 4/5] END bootstrap=True, max_depth=35, max_features=sqrt, min_samples_leaf=4, min_samples_split=4, n_estimators=625;, score=0.664 total time=   0.8s
[CV 5/5] END bootstrap=True, max_depth=35, max_features=sqrt, min_samples_leaf=4, min_samples_split=4, n_estimators=625;, score=0.643 total time=   0.8s
[CV 1/5] END bootstrap=True, max_depth=35, max_features=sqrt, min_samples_leaf=4, min_samples_split=4, n_estimators=626;, score=0.646 total time=   0.9s
[CV 2/5] END bootstr

In [18]:
# Print the hyperparameter combination selected by the grid search
# (the best_params_ attribute of the fitted search object)
print(grid_rfc2.best_params_)

{'bootstrap': True, 'max_depth': 35, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 633}


In [19]:
# Make predictions with the hypertuned model
# (calling predict on the fitted grid-search object; both splits are predicted
# so that the train and test scores can be compared for overfitting)
grid_rfc2_train_predict = grid_rfc2.predict(X_train)
grid_rfc2_test_predict = grid_rfc2.predict(X_test)

# Calculate and print the balanced accuracies
# NOTE: the printed labels read "Accuracy", but the values come from
# balanced_accuracy_score, not plain accuracy.
print(f"Accuracy - Train: {balanced_accuracy_score(y_train, grid_rfc2_train_predict):.3f}")
print(f"Accuracy - Test: {balanced_accuracy_score(y_test, grid_rfc2_test_predict):.3f}")

# Calculate the classification report
# (per-class precision / recall / f1 on the held-out test set)
print(classification_report(y_test, grid_rfc2_test_predict))

Accuracy - Train: 0.903
Accuracy - Test: 0.626
              precision    recall  f1-score   support

           0       0.68      0.66      0.67        89
           1       0.54      0.63      0.58        83
           2       0.71      0.59      0.65        68

    accuracy                           0.63       240
   macro avg       0.64      0.63      0.63       240
weighted avg       0.64      0.63      0.63       240



**Conclusion:** This model achieves a lower balanced-accuracy score on the test set than the model built on all of the features. As expected for a model with fewer features, overfitting was reduced, although a sizeable gap remains between the train (0.903) and test (0.626) scores.