In [11]:
import numpy as np
import pandas as pd
import scipy.io
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

%matplotlib inline

In [12]:
# Read the MATLAB file from the working directory into a dict keyed by variable name.
mat = scipy.io.loadmat(file_name='water_dataset.mat')

In [13]:
# Pull the train/test feature and target entries out of the loaded MAT dict.
X_tr, X_te, Y_tr, Y_te = (mat[key] for key in ('X_tr', 'X_te', 'Y_tr', 'Y_te'))

In [14]:
# Position selected along axis 1 of the stacked features and axis 0 of the targets.
# NOTE(review): presumably this is a monitoring-station index — confirm against the dataset docs.
station_idx = 0

# Drop singleton dimensions from each split, then stack its entries along a
# new leading axis so the result can be sliced as [entry, station_idx, :].
train_stack, test_stack = (
    np.stack(split.squeeze(), axis=0) for split in (X_tr, X_te)
)

# Features: keep only position `station_idx` on axis 1 of each stacked split.
X_train, X_test = (
    stacked[:, station_idx, :] for stacked in (train_stack, test_stack)
)

# Targets: take entry `station_idx` along the first axis of each target array.
y_train, y_test = Y_tr[station_idx], Y_te[station_idx]

In [6]:
# Labels applied, in order, to the columns of X_train.
# NOTE(review): in the displayed head, 'Specific conductance (Mean)' is ~0.68
# while the Max/Min conductance columns are ~0.001 — verify this label order
# against the dataset's own feature list.
feature_names = [
    'Specific conductance (Max)',
    'pH (Max)',
    'pH (Min)',
    'Specific conductance (Min)',
    'Specific conductance (Mean)',
    'Dissolved oxygen (Max)',
    'Dissolved oxygen (Mean)',
    'Dissolved oxygen (Min)',
    'Temperature (Mean)',
    'Temperature (Min)',
    'Temperature (Max)',
]

# Labelled table of the training features, with the target appended as the last column.
df = pd.DataFrame(data=X_train, columns=feature_names)
df['pH_target'] = y_train


In [7]:
# Preview the first five rows of the training table.
df.head(n=5)

Unnamed: 0,Specific conductance (Max),pH (Max),pH (Min),Specific conductance (Min),Specific conductance (Mean),Dissolved oxygen (Max),Dissolved oxygen (Mean),Dissolved oxygen (Min),Temperature (Mean),Temperature (Min),Temperature (Max),pH_target
0,0.001131,0.884615,0.00112,0.001113,0.677632,0.841463,0.765152,0.787402,0.29375,0.298077,0.276163,0.648148
1,0.001131,0.884615,0.00112,0.001113,0.677632,0.841463,0.757576,0.779528,0.29375,0.288462,0.287791,0.648148
2,0.001131,0.884615,0.00112,0.001133,0.677632,0.841463,0.765152,0.779528,0.3,0.294872,0.293605,0.648148
3,0.001131,0.884615,0.001139,0.001133,0.677632,0.841463,0.757576,0.779528,0.30625,0.301282,0.296512,0.648148
4,0.001131,0.884615,0.001139,0.001133,0.664474,0.841463,0.75,0.771654,0.325,0.317308,0.325581,0.648148


In [8]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so the test set does not influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [9]:
# Three regressors to compare; both ensembles are seeded with 42 so repeated
# runs build the same trees.
lr_model = LinearRegression()
rf_model, gb_model = (
    RandomForestRegressor(random_state=42, n_estimators=200),
    GradientBoostingRegressor(random_state=42),
)

In [10]:
# Train every model on the same scaled training features and targets.
# The last call stays a bare expression so the cell still echoes the fitted estimator.
lr_model.fit(X=X_train_scaled, y=y_train)
rf_model.fit(X=X_train_scaled, y=y_train)
gb_model.fit(X=X_train_scaled, y=y_train)

In [11]:
# Predict on the scaled test features with each fitted model, in the order LR, RF, GB.
pred_lr, pred_rf, pred_gb = (
    fitted.predict(X_test_scaled) for fitted in (lr_model, rf_model, gb_model)
)

In [12]:
# Test-set error metrics for the linear regression predictions.
mae_lr = mean_absolute_error(y_test, pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, pred_lr))
# MAPE in percent; the 1e-8 added to the denominator keeps it from being
# exactly zero when a target value is 0.
mape_lr = 100 * np.mean(np.abs((y_test - pred_lr) / (y_test + 1e-8)))

In [13]:
# Test-set error metrics for the random forest predictions.
mae_rf = mean_absolute_error(y_test, pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
# MAPE in percent; the 1e-8 added to the denominator keeps it from being
# exactly zero when a target value is 0.
mape_rf = 100 * np.mean(np.abs((y_test - pred_rf) / (y_test + 1e-8)))

In [14]:
# Test-set error metrics for the gradient boosting predictions.
mae_gb = mean_absolute_error(y_test, pred_gb)
rmse_gb = np.sqrt(mean_squared_error(y_test, pred_gb))
# MAPE in percent; the 1e-8 added to the denominator keeps it from being
# exactly zero when a target value is 0.
mape_gb = 100 * np.mean(np.abs((y_test - pred_gb) / (y_test + 1e-8)))

In [15]:
# Side-by-side comparison table: one row per model, one column per test-set metric.
results = pd.DataFrame(
    [
        ('Linear Regression', rmse_lr, mae_lr, mape_lr),
        ('Random Forest', rmse_rf, mae_rf, mape_rf),
        ('Gradient Boosting', rmse_gb, mae_gb, mape_gb),
    ],
    columns=['Model', 'RMSE', 'MAE', 'MAPE (%)'],
)

In [16]:
# Display the model comparison table as the cell's rich output.
results

Unnamed: 0,Model,RMSE,MAE,MAPE (%)
0,Linear Regression,0.005548,0.003501,0.531028
1,Random Forest,0.005506,0.004013,0.611361
2,Gradient Boosting,0.005366,0.003922,0.599217
