In [26]:
# Per-feature switches: set a flag to False to leave that column out of the model inputs.
features = {
    "Rainfall": True,
    "Temp": True,
    "Wind": True,
    "Pressure": True,
    "Humidity": True,
}

# Notebook-wide settings: dataset location, target column and the enabled feature columns.
CONFIG = dict(
    full_dataset="../data/full_dataset.csv",
    target='Mean_freq',
    features=[name for name, enabled in features.items() if enabled],
)

In [27]:
import pandas as pd
import numpy as np

# Read the CSV named in CONFIG and discard three columns in the same expression,
# so `df` is bound exactly once to the trimmed frame.
df = (
    pd.read_csv(CONFIG['full_dataset'], sep=",")
    .drop(columns=['Mean_am', 'Std_am', 'Unnamed: 0'])
)

In [28]:
# Parse the timestamp column so it can be compared against dates later on.
df['Dates UTC'] = pd.to_datetime(df['Dates UTC'])

# Number of rows each weather feature is shifted by before modelling.
feature_lags = {'Temp': 2, 'Humidity': 13, 'Rainfall': 106}

# NOTE: this cell overwrites the columns with their shifted version, so running it
# twice shifts the data twice. Re-run the loading cell first if it must be re-executed.
for lagged_feature, lag_rows in feature_lags.items():
    if lagged_feature in CONFIG['features']:
        # shift() moves every value `lag_rows` rows later; bfill() fills the NaNs this
        # leaves at the top of the column with the first available value.
        df[lagged_feature] = df[lagged_feature].shift(lag_rows).bfill()

In [29]:
# Boundary date: rows strictly before it go to `normal_df`,
# rows on or after it go to `earthquake_df`.
split_date_earthquake = '2024-08-13'

dates_utc = df['Dates UTC']
normal_df = df.loc[dates_utc < split_date_earthquake].copy()
earthquake_df = df.loc[dates_utc >= split_date_earthquake].copy()


In [30]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Append rolling-mean columns for selected features and drop incomplete rows.

    Parameters
    ----------
    mean_windows : dict or None, default=None
        Maps a column name of ``X`` to a rolling-window length in rows. A window
        of ``0`` (or ``None``) disables the rolling mean for that column, and
        entries naming a column that is not present in ``X`` are skipped.
        ``None`` adds no columns at all.

    Notes
    -----
    ``transform`` returns fewer rows than it receives whenever a rolling window
    is active, because the leading rows of each window are NaN and get dropped.
    Any row containing NaN in any column is dropped as well.
    """

    def __init__(self, mean_windows=None):
        self.mean_windows = mean_windows

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``<feature>_rolling_mean`` columns added.

        Rows that contain NaN after the rolling means are computed are removed,
        so the result may be shorter than ``X``.
        """
        out = X.copy()
        for feature, window in (self.mean_windows or {}).items():
            # Skip disabled windows and features that were switched off upstream;
            # indexing a missing column would raise a KeyError.
            if not window or feature not in out.columns:
                continue
            out[f'{feature}_rolling_mean'] = out[feature].rolling(window=window).mean()

        return out.dropna()
    
class CustomRidge(Ridge):
    """Ridge regressor that accepts an ``X`` with fewer rows than ``y``.

    ``CustomTransformer.transform`` drops rows, so the feature matrix reaching
    this estimator can be shorter than the target that was passed to the
    pipeline. ``fit`` keeps only the trailing ``len(X)`` targets (and per-sample
    weights, if given) so they line up with the rows that remain.
    """

    @staticmethod
    def _keep_last(values, n_rows):
        """Return the trailing ``n_rows`` entries of ``values`` (all of them if shorter)."""
        if hasattr(values, "tail"):  # pandas Series / DataFrame
            return values.tail(n_rows)
        # Plain sequences / arrays: clamp the start so n_rows > len(values) keeps everything.
        return values[max(len(values) - n_rows, 0):]

    def fit(self, X, y, sample_weight=None):
        """Fit the ridge model on ``X`` and the last ``len(X)`` entries of ``y``.

        Parameters
        ----------
        X : array-like
            Feature matrix, possibly shorter than ``y``.
        y : pandas Series or sequence
            Target values; only the trailing ``len(X)`` entries are used.
        sample_weight : float, array-like or None, default=None
            Scalar weights are passed through unchanged; per-sample weights are
            trimmed the same way as ``y``.

        Returns
        -------
        self
        """
        n_rows = len(X)
        y = self._keep_last(y, n_rows)
        # Per-sample weights must stay aligned with the trimmed target.
        if sample_weight is not None and hasattr(sample_weight, "__len__"):
            sample_weight = self._keep_last(sample_weight, n_rows)
        return super().fit(X, y, sample_weight)
    
def custom_mse(y_true, y_pred):
    """Return the negated mean squared error between predictions and targets.

    ``y_pred`` may be shorter than ``y_true``; only the trailing
    ``len(y_pred)`` entries of ``y_true`` are compared. ``y_true`` must
    provide a ``tail`` method. The sign is flipped so that a larger value
    means a better fit.
    """
    aligned_truth = y_true.tail(len(y_pred))
    return -mean_squared_error(aligned_truth, y_pred)

In [31]:
from sklearn.pipeline import Pipeline

# Three-stage model: rolling-mean feature builder -> standardisation -> ridge regression.
pipeline = Pipeline(
    steps=[
        ('custom_transformer', CustomTransformer()),
        ('scaler', StandardScaler()),
        ('custom_ridge', CustomRidge()),
    ],
    memory=None,
)

# Candidate rolling-window lengths per feature. range(0, 1, 50) yields only 0,
# which CustomTransformer treats as "no rolling mean" for that feature, so this
# grid varies the temperature and rainfall windows only.
pressure_window_sizes = range(0, 1, 50)
temp_window_sizes = range(0, 301, 50)
humidity_window_sizes = range(0, 1, 50)
rainfall_window_sizes = range(0, 601, 50)

mean_windows = [
    dict(Temp=temp_ws, Pressure=press_ws, Rainfall=rain_ws, Humidity=hum_ws)
    for press_ws in pressure_window_sizes
    for temp_ws in temp_window_sizes
    for hum_ws in humidity_window_sizes
    for rain_ws in rainfall_window_sizes
]

# Every window combination is crossed with every ridge regularisation strength.
param_grid = [
    dict(
        custom_transformer__mean_windows=mean_windows,
        custom_ridge__alpha=[0.1, 1, 10, 100, 1000],
    )
]

In [32]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# custom_mse already returns the negated MSE, so the scorer is "greater is better".
custom_scorer = make_scorer(custom_mse)

# Pre-earthquake rows drive the model selection; the post-earthquake frames are
# prepared here but not used in this cell.
X_train = normal_df[CONFIG['features']].copy()
y_train = normal_df[CONFIG['target']].copy()
X_test = earthquake_df[CONFIG['features']].copy()
y_test = earthquake_df[CONFIG['target']].copy()

# Exhaustive search over the parameter grid with 3 time-ordered folds
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=3),
    scoring=custom_scorer,
    verbose=1,
    n_jobs=8,
    error_score='raise',
)
grid_search.fit(X_train, y_train)

# The best score is a negated MSE: flip the sign and take the root to report an RMSE.
print(np.sqrt(-grid_search.best_score_))

grid_search

Fitting 3 folds for each of 455 candidates, totalling 1365 fits
0.052149637217548145


In [33]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
# Baseline: a regressor that always predicts the mean of the targets it was fitted on,
# scored with the same folds and scorer as the grid search.
dummy_model = DummyRegressor(strategy='mean')
cv_results_dummy = cross_val_score(
    dummy_model,
    X_train,
    y_train,
    cv=TimeSeriesSplit(n_splits=3),
    scoring=custom_scorer,
)
# Scores are negated MSEs; average them, flip the sign and take the root.
dummy_cv_rmse = np.sqrt(-cv_results_dummy.mean())
print("Dummy model CV score:", dummy_cv_rmse)

Dummy model CV score: 0.07890173457930696


In [34]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor

# Combine datasets
combined_df = pd.concat([normal_df, earthquake_df])
target_col = CONFIG['target']  # single source of truth for the target column

# Predict the entire dataset with the refitted best estimator of the grid search
X_combined = combined_df[CONFIG['features']].copy()
y_combined = combined_df[target_col].copy()
y_pred_combined = grid_search.predict(X_combined)

# CustomTransformer drops rows inside the pipeline, so there can be fewer
# predictions than input rows; keep the trailing rows so that everything lines up.
combined_df, X_combined, y_combined = [
    frame.tail(len(y_pred_combined)) for frame in [combined_df, X_combined, y_combined]
]

# Define time splits: train (before the earthquake), recovery (the following 75 days), test (afterwards)
split_date_earthquake = pd.to_datetime(split_date_earthquake)
recovery_end_date = split_date_earthquake + pd.DateOffset(days=2.5 * 30)

combined_dates = combined_df['Dates UTC']
train_mask = combined_dates < split_date_earthquake
recovery_mask = (combined_dates >= split_date_earthquake) & (combined_dates < recovery_end_date)
test_mask = combined_dates >= recovery_end_date

# Split data (the predictions are a NumPy array, so index them with plain boolean arrays)
train_df, recovery_df, test_df = combined_df[train_mask], combined_df[recovery_mask], combined_df[test_mask]
y_pred_train = y_pred_combined[train_mask.to_numpy()]
y_pred_recovery = y_pred_combined[recovery_mask.to_numpy()]
y_pred_test = y_pred_combined[test_mask.to_numpy()]

# Compute RMSE; the recovery period is deliberately left out of the combined figure
rmse_train = np.sqrt(mean_squared_error(train_df[target_col], y_pred_train))
rmse_test = np.sqrt(mean_squared_error(test_df[target_col], y_pred_test))
combined_rmse = np.sqrt(mean_squared_error(
    pd.concat([train_df[target_col], test_df[target_col]]),
    np.concatenate([y_pred_train, y_pred_test])
))

# Plot Actual vs Predicted: one blue "actual" line and one coloured "predicted" line per period
fig = go.Figure()

plot_segments = [
    (train_df, y_pred_train, 'Train Actual', 'Train Predicted', 'red'),
    (recovery_df, y_pred_recovery, 'Recovery Actual', 'Recovery Predicted', 'grey'),
    (test_df, y_pred_test, 'Test Actual', 'Test Predicted', 'green'),
]
for segment_df, segment_pred, actual_name, predicted_name, predicted_color in plot_segments:
    fig.add_trace(go.Scatter(x=segment_df['Dates UTC'], y=segment_df[target_col],
                             mode='lines', name=actual_name, line=dict(color='blue'), showlegend=False))
    fig.add_trace(go.Scatter(x=segment_df['Dates UTC'], y=segment_pred,
                             mode='lines', name=predicted_name, line=dict(color=predicted_color), showlegend=False))

# Single semi-transparent marker at the earthquake date, placed at the maximum training
# value. With showlegend=False it is drawn on the plot but not listed in the legend.
fig.add_trace(go.Scatter(
    x=[split_date_earthquake],
    y=[train_df[target_col].max()],
    mode='markers',
    marker=dict(size=10, color="LightSalmon", opacity=0.5),
    name="Conservative<br>Recovery Period",
    showlegend=False
))

# Add the recovery period rectangle
fig.add_vrect(
    x0=split_date_earthquake, x1=recovery_end_date,
    fillcolor="LightSalmon", opacity=0.5, layer="below", line_width=0
)

# Start the x-axis at the OFFSET-th remaining sample (clamped so short frames do not raise)
OFFSET = 100
x_start_offset = combined_df['Dates UTC'].iloc[min(OFFSET, len(combined_df) - 1)]
x_end = combined_df['Dates UTC'].iloc[-1]

# Axis titles use the nested title.text / title.font form; the flat `titlefont`
# attribute is deprecated and rejected by newer plotly releases.
fig.update_layout(
    title={
        "text": f"Actual vs Predicted (with Past Rainfall Rolling Window)<br>RMSE Train: {rmse_train:.4f}, RMSE Test: {rmse_test:.4f}, "
                f"Train+Test RMSE: {combined_rmse:.4f}",
        "font": {"size": 26}
    },
    xaxis={
        "title": {"text": "Date", "font": {"size": 22}},
        "tickfont": {"size": 16},
        "range": [x_start_offset, x_end]
    },
    yaxis={
        "title": {"text": "Frequency (Hz)", "font": {"size": 22}},
        "tickfont": {"size": 16}
    },
    legend={
        "font": {"size": 24}
    },
    height=600
)

fig.show()

# Dummy model for baseline comparison: always predicts the mean of the training targets
dummy_model = DummyRegressor(strategy='mean')
dummy_model.fit(X_combined[train_mask], y_combined[train_mask])

# Predict and evaluate dummy model
y_pred_dummy_train = dummy_model.predict(X_combined[train_mask])
y_pred_dummy_recovery = dummy_model.predict(X_combined[recovery_mask])
y_pred_dummy_test = dummy_model.predict(X_combined[test_mask])

rmse_dummy_train = np.sqrt(mean_squared_error(y_combined[train_mask], y_pred_dummy_train))
rmse_dummy_test = np.sqrt(mean_squared_error(y_combined[test_mask], y_pred_dummy_test))

print(f"Dummy Model RMSE - Train: {rmse_dummy_train:.4f}, Test: {rmse_dummy_test:.4f}")
print(f"Model RMSE - Train: {rmse_train:.4f}, Test: {rmse_test:.4f}")


Dummy Model RMSE - Train: 0.0670, Test: 0.0648
Model RMSE - Train: 0.0469, Test: 0.0414


In [35]:
from sklearn.pipeline import Pipeline

# Same three-stage model as before: rolling-mean feature builder -> standardisation -> ridge regression.
pipeline = Pipeline(
    steps=[
        ('custom_transformer', CustomTransformer()),
        ('scaler', StandardScaler()),
        ('custom_ridge', CustomRidge()),
    ],
    memory=None,
)

# Candidate rolling-window lengths per feature. range(0, 1, 50) yields only 0,
# which CustomTransformer treats as "no rolling mean", so this grid never builds
# a rainfall rolling mean and varies the pressure, temperature and humidity windows.
pressure_window_sizes = range(0, 301, 50)
temp_window_sizes = range(0, 301, 50)
humidity_window_sizes = range(0, 601, 50)
rainfall_window_sizes = range(0, 1, 50)

mean_windows = [
    dict(Temp=temp_ws, Pressure=press_ws, Rainfall=rain_ws, Humidity=hum_ws)
    for press_ws in pressure_window_sizes
    for temp_ws in temp_window_sizes
    for hum_ws in humidity_window_sizes
    for rain_ws in rainfall_window_sizes
]

# Every window combination is crossed with every ridge regularisation strength.
param_grid = [
    dict(
        custom_transformer__mean_windows=mean_windows,
        custom_ridge__alpha=[0.1, 1, 10, 100, 1000],
    )
]

In [36]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# custom_mse already returns the negated MSE, so the scorer is "greater is better".
custom_scorer = make_scorer(custom_mse)

# Pre-earthquake rows drive the model selection; the post-earthquake frames are
# prepared here but not used in this cell.
X_train = normal_df[CONFIG['features']].copy()
y_train = normal_df[CONFIG['target']].copy()
X_test = earthquake_df[CONFIG['features']].copy()
y_test = earthquake_df[CONFIG['target']].copy()

# Exhaustive search over the parameter grid with 3 time-ordered folds
grid_search_no_rainfall = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=3),
    scoring=custom_scorer,
    verbose=1,
    n_jobs=8,
    error_score='raise',
)
grid_search_no_rainfall.fit(X_train, y_train)

# The best score is a negated MSE: flip the sign and take the root to report an RMSE.
print(np.sqrt(-grid_search_no_rainfall.best_score_))

grid_search_no_rainfall

Fitting 3 folds for each of 3185 candidates, totalling 9555 fits
0.05998994976086385


In [37]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor

# Combine datasets
combined_df = pd.concat([normal_df, earthquake_df])
target_col = CONFIG['target']  # single source of truth for the target column

# Predict the entire dataset with the refitted best estimator of the grid search
X_combined = combined_df[CONFIG['features']].copy()
y_combined = combined_df[target_col].copy()
y_pred_combined = grid_search_no_rainfall.predict(X_combined)

# CustomTransformer drops rows inside the pipeline, so there can be fewer
# predictions than input rows; keep the trailing rows so that everything lines up.
combined_df, X_combined, y_combined = [
    frame.tail(len(y_pred_combined)) for frame in [combined_df, X_combined, y_combined]
]

# Define time splits: train (before the earthquake), recovery (the following 75 days), test (afterwards)
split_date_earthquake = pd.to_datetime(split_date_earthquake)
recovery_end_date = split_date_earthquake + pd.DateOffset(days=2.5 * 30)

combined_dates = combined_df['Dates UTC']
train_mask = combined_dates < split_date_earthquake
recovery_mask = (combined_dates >= split_date_earthquake) & (combined_dates < recovery_end_date)
test_mask = combined_dates >= recovery_end_date

# Split data (the predictions are a NumPy array, so index them with plain boolean arrays)
train_df, recovery_df, test_df = combined_df[train_mask], combined_df[recovery_mask], combined_df[test_mask]
y_pred_train = y_pred_combined[train_mask.to_numpy()]
y_pred_recovery = y_pred_combined[recovery_mask.to_numpy()]
y_pred_test = y_pred_combined[test_mask.to_numpy()]

# Compute RMSE; the recovery period is deliberately left out of the combined figure
rmse_train = np.sqrt(mean_squared_error(train_df[target_col], y_pred_train))
rmse_test = np.sqrt(mean_squared_error(test_df[target_col], y_pred_test))
combined_rmse = np.sqrt(mean_squared_error(
    pd.concat([train_df[target_col], test_df[target_col]]),
    np.concatenate([y_pred_train, y_pred_test])
))

# Plot Actual vs Predicted: one blue "actual" line and one coloured "predicted" line per period
fig = go.Figure()

plot_segments = [
    (train_df, y_pred_train, 'Train Actual', 'Train Predicted', 'red'),
    (recovery_df, y_pred_recovery, 'Recovery Actual', 'Recovery Predicted', 'grey'),
    (test_df, y_pred_test, 'Test Actual', 'Test Predicted', 'green'),
]
for segment_df, segment_pred, actual_name, predicted_name, predicted_color in plot_segments:
    fig.add_trace(go.Scatter(x=segment_df['Dates UTC'], y=segment_df[target_col],
                             mode='lines', name=actual_name, line=dict(color='blue'), showlegend=False))
    fig.add_trace(go.Scatter(x=segment_df['Dates UTC'], y=segment_pred,
                             mode='lines', name=predicted_name, line=dict(color=predicted_color), showlegend=False))

# Single semi-transparent marker at the earthquake date, placed at the maximum training
# value. With showlegend=False it is drawn on the plot but not listed in the legend.
fig.add_trace(go.Scatter(
    x=[split_date_earthquake],
    y=[train_df[target_col].max()],
    mode='markers',
    marker=dict(size=10, color="LightSalmon", opacity=0.5),
    name="Conservative<br>Recovery Period",
    showlegend=False
))

# Add the recovery period rectangle
fig.add_vrect(
    x0=split_date_earthquake, x1=recovery_end_date,
    fillcolor="LightSalmon", opacity=0.5, layer="below", line_width=0
)

# Axis titles use the nested title.text / title.font form; the flat `titlefont`
# attribute is deprecated and rejected by newer plotly releases.
fig.update_layout(
    title={
        "text": f"Actual vs Predicted (without Past Rainfall Rolling Window)<br>RMSE Train: {rmse_train:.4f}, RMSE Test: {rmse_test:.4f}, "
                f"Train+Test RMSE: {combined_rmse:.4f}",
        "font": {"size": 26}
    },
    xaxis={
        "title": {"text": "Date", "font": {"size": 22}},
        "tickfont": {"size": 16}
    },
    yaxis={
        "title": {"text": "Frequency (Hz)", "font": {"size": 22}},
        "tickfont": {"size": 16}
    },
    legend={
        "font": {"size": 24}
    },
    height=600
)

fig.show()

# Dummy model for baseline comparison: always predicts the mean of the training targets
dummy_model = DummyRegressor(strategy='mean')
dummy_model.fit(X_combined[train_mask], y_combined[train_mask])

# Predict and evaluate dummy model
y_pred_dummy_train = dummy_model.predict(X_combined[train_mask])
y_pred_dummy_recovery = dummy_model.predict(X_combined[recovery_mask])
y_pred_dummy_test = dummy_model.predict(X_combined[test_mask])

rmse_dummy_train = np.sqrt(mean_squared_error(y_combined[train_mask], y_pred_dummy_train))
rmse_dummy_test = np.sqrt(mean_squared_error(y_combined[test_mask], y_pred_dummy_test))

print(f"Dummy Model RMSE - Train: {rmse_dummy_train:.4f}, Test: {rmse_dummy_test:.4f}")
print(f"Model RMSE - Train: {rmse_train:.4f}, Test: {rmse_test:.4f}")


Dummy Model RMSE - Train: 0.0673, Test: 0.0648
Model RMSE - Train: 0.0545, Test: 0.0459
