In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib
import warnings
warnings.filterwarnings('ignore')


In [14]:
# Load the dataset
df = pd.read_csv("earthquakes.csv")
df.head()


Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource,Alert
0,1976-03-25 00:41:20.500000+00:00,35.59,-90.48,15.0,4.62,mw,,,,,...,"8 km NW of Marked Tree, Arkansas",earthquake,,,,0.0,reviewed,nm,nm,green
1,1979-10-16 06:58:43.450000+00:00,32.998667,-115.5575,14.19,5.8,ml,7.0,79.0,0.05768,0.17,...,"3km NW of Brawley, CA",earthquake,1.03,1.78,,0.0,reviewed,ci,ci,yellow
2,1980-07-27 18:52:21.600000+00:00,38.19,-83.95,10.0,5.0,md,,,,,...,"2 km SW of Sharpsburg, Kentucky",earthquake,,,,,reviewed,se,se,green
3,1981-04-19 09:02:50.520000+00:00,35.816,-117.816333,4.766,4.7,ml,13.0,135.0,,0.66,...,"16km SSE of Little Lake, CA",earthquake,2.51,31.61,0.424,9.0,reviewed,ci,ci,green
4,1981-04-26 12:09:28.290000+00:00,33.0955,-115.6245,18.904,5.75,ml,81.0,34.0,,0.34,...,"The 1981 Westmorland, California Earthquake",earthquake,0.56,0.67,0.161,6.0,reviewed,ci,ci,green


In [15]:
# Convert time column to datetime
df['time'] = pd.to_datetime(df['time'], errors='coerce')

# Drop rows with missing target (Alert)
df = df[df['Alert'].notna()]

# Drop columns with >50% missing or irrelevant for prediction
df.drop(['id', 'updated', 'net'], axis=1, inplace=True, errors='ignore')

# Fill numeric NaNs with median
for col in df.select_dtypes(include=[np.number]).columns:
    df[col].fillna(df[col].median(), inplace=True)

# Fill object NaNs with mode
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)


In [16]:
# Extract time features
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month
df['hour'] = df['time'].dt.hour

# Drop 'time' and 'place' for simplicity
df.drop(['time', 'place'], axis=1, inplace=True)

# Encode categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Feature-target split
X = df.drop("Alert", axis=1)
y = df["Alert"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [22]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Fill missing values in NumPy arrays using column means
col_means = np.nanmean(X_train, axis=0)
X_train[np.isnan(X_train)] = np.take(col_means, np.where(np.isnan(X_train))[1])
X_test[np.isnan(X_test)] = np.take(col_means, np.where(np.isnan(X_test))[1])  # use train means for test

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier()
}

# Train and evaluate
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='weighted')
    results[name] = {"accuracy": acc, "f1_score": f1}
    print(f"\n{name} Report:")
    print(classification_report(y_test, preds))



Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1475
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         8
           3       0.09      0.02      0.03        52

    accuracy                           0.95      1544
   macro avg       0.26      0.25      0.25      1544
weighted avg       0.92      0.95      0.94      1544


Random Forest Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1475
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         8
           3       0.29      0.04      0.07        52

    accuracy                           0.96      1544
   macro avg       0.31      0.26      0.26      1544
weighted avg       0.93      0.96      0.94      1544


Gradient Boosting Report:
              precision    recall  f1-score   suppo

In [23]:
# Select best model by F1-score
best_model_name = max(results, key=lambda x: results[x]['f1_score'])
best_model = models[best_model_name]

# Save model and scaler
joblib.dump(best_model, "earthquake_alert_model.joblib")
joblib.dump(scaler, "scaler.joblib")
print(f"Saved best model: {best_model_name}")


Saved best model: Gradient Boosting
