In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the earthquake CSV into `df`, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer="preprocessed_earthquake_data.csv")
df.head(n=5)

Unnamed: 0,Latitude,Longitude,Type,Depth,Magnitude,Magnitude Type,Root Mean Square,Source,Status,Year,...,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
0,0.583377,0.844368,Earthquake,0.495984,0.277668,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.006109,0.698849,Earthquake,0.075272,-0.195082,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.739162,-1.701962,Earthquake,-0.413928,0.750418,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.017599,-0.503524,Earthquake,-0.454694,-0.195082,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.340688,0.691479,Earthquake,-0.454694,-0.195082,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Structural overview of `df`: (rows, columns), the column labels, and the
# per-column non-null counts / dtypes reported by DataFrame.info().
print("Dataset shape:", df.shape)
print("\nColumn names:\n", list(df.columns))
df.info()

Dataset shape: (23409, 40)

Column names:
 ['Latitude', 'Longitude', 'Type', 'Depth', 'Magnitude', 'Magnitude Type', 'Root Mean Square', 'Source', 'Status', 'Year', 'Day', 'Month_sin', 'Month_cos', 'Hour_sin', 'Hour_cos', 'Type_Explosion', 'Type_Nuclear Explosion', 'Type_Rock Burst', 'Magnitude Type_MD', 'Magnitude Type_MH', 'Magnitude Type_ML', 'Magnitude Type_MS', 'Magnitude Type_MW', 'Magnitude Type_MWB', 'Magnitude Type_MWC', 'Magnitude Type_MWR', 'Magnitude Type_MWW', 'Source_ATLAS', 'Source_CI', 'Source_GCMT', 'Source_ISCGEM', 'Source_ISCGEMSUP', 'Source_NC', 'Source_NN', 'Source_OFFICIAL', 'Source_PR', 'Source_SE', 'Source_US', 'Source_UW', 'Status_Reviewed']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23409 entries, 0 to 23408
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Latitude                23409 non-null  float64
 1   Longitude               23409 non-null  float64
 2  

In [4]:
# Choose the regression target column.
# Only numeric columns are eligible, because the chosen column is later used
# as the regression target `y`.
# Priority:
#   1. a numeric column named exactly "magnitude" (case-insensitive,
#      surrounding whitespace ignored);
#   2. otherwise, any numeric column whose name contains "mag";
#   3. otherwise, the last numeric column.
numeric_candidates = df.select_dtypes(include=[np.number]).columns.tolist()
target_candidates = [c for c in numeric_candidates if str(c).strip().lower() == 'magnitude']
if not target_candidates:
    # Substring fallback. 'mag' already matches every name that contains
    # 'magnitude', so a separate 'magnitude' check would be redundant.
    target_candidates = [c for c in numeric_candidates if 'mag' in str(c).lower()]
target_col = target_candidates[0] if target_candidates else numeric_candidates[-1]
print("Using target column:", target_col)

Using target column: Magnitude


In [5]:
# Every numeric column except the target becomes a model feature.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
feature_cols = [col for col in numeric_cols if col != target_col]

# Keep only the modelling columns and drop any row with a missing value,
# then split into the feature matrix X and the target vector y.
df_clean = df.loc[:, feature_cols + [target_col]].dropna()
X = df_clean.loc[:, feature_cols].values
y = df_clean.loc[:, target_col].values

print("Number of features:", len(feature_cols))
print("Rows after cleaning:", len(X))

Number of features: 35
Rows after cleaning: 23409


In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

In [7]:
# Baseline: ordinary least squares with default settings, fitted on the
# scaled training split and evaluated on the scaled test split.
lr = LinearRegression().fit(X_train_s, y_train)
y_pred_lr = lr.predict(X_test_s)

In [8]:
# Candidate regularisation strengths: 10 log-spaced values from 1e-3 to 1e2.
alphas = np.logspace(start=-3, stop=2, num=10)

# 5-fold cross-validation over the alpha grid, scored by (negated) RMSE.
ridge_cv = RidgeCV(
    alphas=alphas,
    cv=5,
    scoring='neg_root_mean_squared_error',
).fit(X_train_s, y_train)
y_pred_ridge = ridge_cv.predict(X_test_s)

print("Best alpha found for Ridge:", ridge_cv.alpha_)


Best alpha found for Ridge: 2.1544346900318843


In [12]:
def rmse(true, pred):
    """Return the root mean squared error between `true` and `pred`.

    Computed as the square root of sklearn's `mean_squared_error(true, pred)`.
    """
    mse = mean_squared_error(true, pred)
    return np.sqrt(mse)

# Test-set predictions keyed by the label shown in the comparison table.
preds_by_model = {
    "LinearRegression (default)": y_pred_lr,
    "RidgeCV (tuned alpha)": y_pred_ridge,
}

# One row per model: RMSE and R^2 against the held-out targets.
results = {
    "Model": list(preds_by_model),
    "RMSE": [rmse(y_test, pred) for pred in preds_by_model.values()],
    "R2": [r2_score(y_test, pred) for pred in preds_by_model.values()],
}

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,RMSE,R2
0,LinearRegression (default),0.971518,0.091453
1,RidgeCV (tuned alpha),0.971524,0.091442


In [10]:

# Side-by-side preview of the first 20 test targets and both models' predictions.
sample_df = pd.DataFrame({
    label: values[:20]
    for label, values in (
        ("y_true", y_test),
        ("y_pred_lr", y_pred_lr),
        ("y_pred_ridge", y_pred_ridge),
    )
})
sample_df


Unnamed: 0,y_true,y_pred_lr,y_pred_ridge
0,-0.904207,0.115827,0.115493
1,0.986793,0.215582,0.215564
2,-0.667832,-0.517252,-0.517201
3,-0.195082,0.424208,0.424008
4,-0.904207,-0.103667,-0.103833
5,-0.195082,-0.540626,-0.540554
6,4.059668,0.337533,0.337381
7,0.986793,0.272005,0.271906
8,0.041293,0.007907,0.007838
9,-0.431457,-0.285076,-0.285051


In [13]:
# Write the model comparison table to disk without the DataFrame index column.
results_df.to_csv(
    path_or_buf="magnitude_model_comparison_results.csv",
    index=False,
)
print("Results saved to 'magnitude_model_comparison_results.csv'")


Results saved to 'magnitude_model_comparison_results.csv'
