### Training the Advanced ML models on Preprocessed Dataset

In [7]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [8]:
# Load the dataset
import pandas as pd
earth_df = pd.read_csv('/content/preprocessed_earthquake_data.csv')
earth_df.sample(5)

Unnamed: 0,Latitude,Longitude,Type,Depth,Magnitude,Magnitude Type,Root Mean Square,Source,Status,Year,...,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
1133,1.641076,-1.745146,Earthquake,-0.210094,-0.667832,MW,-0.103839,ISCGEM,Automatic,-1.63824,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2041,-0.518755,1.016053,Earthquake,0.910989,-0.667832,MW,-0.103839,ISCGEM,Automatic,-1.499598,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20969,-0.742217,1.036386,Earthquake,0.932187,2.641418,MWW,-0.165332,US,Reviewed,1.273237,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
15941,-0.167274,0.845563,Earthquake,-0.307934,-0.195082,MWC,0.63407,US,Reviewed,0.649349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
5087,-0.061105,-0.457018,Earthquake,-0.495461,-0.667832,MS,-0.103839,US,Reviewed,-0.945031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


#### 3. Advanced Model-3: LightGBM Regressor

In [10]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import numpy as np

# Define target and categorical columns (update if needed)
target = 'Magnitude'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

# Prepare features and target variable
# Assuming earth_df is already loaded from previous steps
X = earth_df.drop(columns=[target]+categorical_cols)
y = earth_df[target]

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the LightGBM Regressor
lgb_reg = lgb.LGBMRegressor(random_state=42)

# Hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50],
    'max_depth': [-1, 10],
    'min_child_samples': [20, 30]
}

# Setup GridSearchCV with 5-fold cross-validation using negative MAE scoring
grid_search_lgb = GridSearchCV(estimator=lgb_reg,
                               param_grid=param_grid,
                               cv=5,
                               scoring='neg_mean_absolute_error',
                               n_jobs=-1,
                               verbose=2)

# Fit GridSearchCV on training data
grid_search_lgb.fit(X_train, y_train)

# Best model from tuning
best_lgb = grid_search_lgb.best_estimator_

# Predict on test data
y_pred_lgb = best_lgb.predict(X_test)

# Calculate evaluation metrics
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)

# Cross-validation scores for best model on full data
cv_mae_scores_lgb = cross_val_score(best_lgb, X, y, cv=5, scoring='neg_mean_absolute_error')
cv_mse_scores_lgb = cross_val_score(best_lgb, X, y, cv=5, scoring='neg_mean_squared_error')
cv_r2_scores_lgb = cross_val_score(best_lgb, X, y, cv=5, scoring='r2')

# Print results
print("Best LightGBM model parameters:", grid_search_lgb.best_params_)
print(f"LightGBM Test MAE: {mae_lgb:.4f}")
print(f"LightGBM Test MSE: {mse_lgb:.4f}")
print(f"LightGBM Test R2 score: {r2_lgb:.4f}")
print(f"LightGBM 5-Fold CV Mean MAE: {-np.mean(cv_mae_scores_lgb):.4f} ± {np.std(cv_mae_scores_lgb):.4f}")
print(f"LightGBM 5-Fold CV Mean MSE: {-np.mean(cv_mse_scores_lgb):.4f} ± {np.std(cv_mse_scores_lgb):.4f}")
print(f"LightGBM 5-Fold CV Mean R2: {np.mean(cv_r2_scores_lgb):.4f} ± {np.std(cv_r2_scores_lgb):.4f}")

# Save the best trained model
model_filename_lgb = 'lightgbm_regressor_model.pkl'
joblib.dump(best_lgb, model_filename_lgb)
print(f"LightGBM Model saved to {model_filename_lgb}")

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1075
[LightGBM] [Info] Number of data points in the train set: 18727, number of used features: 25
[LightGBM] [Info] Start training from score -0.000570
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 18727, number of used features: 23
[LightGBM] [Info] Start training from score -0.024085
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

In [11]:
# Define target and categorical columns (update if needed)
target = 'Magnitude'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

# Prepare features and target variable
X = earth_df.drop(columns=[target]+categorical_cols)
y = earth_df[target]

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head()

Unnamed: 0,Latitude,Longitude,Depth,Root Mean Square,Year,Day,Month_sin,Month_cos,Hour_sin,Hour_cos,...,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
16953,-0.078008,0.755211,-0.495461,1.003024,0.787991,-1.116502,-1.215716,0.705254,-0.713894,1.234637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
15800,-0.471466,1.014252,1.194724,-1.087719,0.649349,0.844142,1.218537,-0.717954,-1.006043,-0.995938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9014,0.213764,-0.62189,-0.495461,0.511085,-0.321143,0.498146,0.704119,-1.238884,-0.713894,-1.221272,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
15516,-1.274852,0.310949,-0.495461,-0.534287,0.580028,0.036818,-1.215716,-0.717954,-1.419204,0.006682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
17837,0.01046,0.456978,-0.405774,-0.718764,0.926632,1.074806,1.218537,-0.717954,-1.419204,0.006682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


#### 1. Advanced Model-1 : GBM Regressor

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Initialize the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(random_state=42)

# Hyperparameter grid for tuning
param_grid = {


}

# Setup GridSearchCV with 5-fold cross-validation using negative MAE scoring
grid_search = GridSearchCV(estimator=gbr,
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_absolute_error',
                           n_jobs=-1,
                           verbose=2)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Best model from tuning
best_gbr = grid_search.best_estimator_

# Predict on test data
y_pred = best_gbr.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Cross-validation scores for best model on full data
cv_mae_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='neg_mean_absolute_error')
cv_mse_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='neg_mean_squared_error')
cv_r2_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='r2')

# Print results
print("Best model parameters:", grid_search.best_params_)
print(f"Test MAE: {mae:.4f}")
print(f"Test MSE: {mse:.4f}")
print(f"Test R2 score: {r2:.4f}")
print(f"5-Fold CV Mean MAE: {-np.mean(cv_mae_scores):.4f} ± {np.std(cv_mae_scores):.4f}")
print(f"5-Fold CV Mean MSE: {-np.mean(cv_mse_scores):.4f} ± {np.std(cv_mse_scores):.4f}")
print(f"5-Fold CV Mean R2: {np.mean(cv_r2_scores):.4f} ± {np.std(cv_r2_scores):.4f}")

# Save the best trained model
model_filename = 'gbm_regressor_model.pkl'
joblib.dump(best_gbr, model_filename)
print(f"Model saved to {model_filename}")


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best model parameters: {}
Test MAE: 0.6810
Test MSE: 0.8941
Test R2 score: 0.1394
5-Fold CV Mean MAE: 0.7183 ± 0.0267
5-Fold CV Mean MSE: 0.9400 ± 0.0671
5-Fold CV Mean R2: 0.0582 ± 0.0241
Model saved to gbm_regressor_model.pkl


#### Advanced Model-2: catGBM Regressor

In [13]:
from sklearn.preprocessing import LabelEncoder
target = 'Magnitude'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

# Prepare features and target
X = earth_df.drop(columns=[target])
y = earth_df[target]

# Label encode categorical columns
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
from catboost import CatBoostRegressor, Pool

# Create CatBoost Pool for train and test sets (specify categorical features)
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_cols)

# Initialize CatBoost Regressor
catboost_model = CatBoostRegressor(
    random_seed=42,
    verbose=0
)

# Define hyperparameter grid for GridSearch
param_grid = {
    'iterations': [100, 200],
    'depth': [4, 6, 8],
    'learning_rate': [0.03, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

# Set up GridSearchCV (CatBoost supports sklearn API)
grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)

# Fit GridSearchCV - using DataFrame without Pool
# CatBoost handles categorical features by column name in dataframe during sklearn API use
grid_search.fit(X_train, y_train, cat_features=categorical_cols)

# Best model from grid search
best_model = grid_search.best_estimator_

# Predict on test set
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Cross-validation scores on full dataset
cv_mae_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_absolute_error')

print("Best parameters:", grid_search.best_params_)
print(f"Test MAE: {mae:.4f}")
print(f"Test MSE: {mse:.4f}")
print(f"Test R2 score: {r2:.4f}")
print(f"5-Fold CV Mean MAE: {-np.mean(cv_mae_scores):.4f} ± {np.std(cv_mae_scores):.4f}")

# Save the trained model
model_filename = 'catboost_regressor_model.cbm'
best_model.save_model(model_filename)
print(f"Model saved to {model_filename}")

ModuleNotFoundError: No module named 'catboost'

In [16]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


#### Task:
- Build, train and save LightGBM Regressor or Classifier with cross-validataion and hyperparameter tuning to get best model.