# 3. linear regression

In [1]:
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

## 3.1 load and inspect data

In [3]:
# Read the pre-split feature/target arrays (train, validation, test) from disk.
X_train, y_train = np.load("../data/X_train.npy"), np.load("../data/y_train.npy")
X_val, y_val = np.load("../data/X_val.npy"), np.load("../data/y_val.npy")
X_test, y_test = np.load("../data/X_test.npy"), np.load("../data/y_test.npy")

# Report the training shapes as loaded, i.e. before any flattening below.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

# LinearRegression expects 2-D input, so every axis after the first is
# collapsed into a single feature axis. Only X_train is inspected; the
# validation and test features are flattened the same way whenever it triggers.
if X_train.ndim > 2:
    X_train = X_train.reshape(len(X_train), -1)
    X_val = X_val.reshape(len(X_val), -1)
    X_test = X_test.reshape(len(X_test), -1)

X_train shape: (294359, 12, 13)
y_train shape: (294359, 13)


## 3.2 train the model

In [4]:
# Fit a least-squares linear model on the (flattened) training features.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Persist the fitted estimator. joblib.dump is left as the cell's last
# expression, so the list of written file names is echoed as the cell output.
joblib.dump(model, "../models/linear_regression_weather.pkl")

['../models/linear_regression_weather.pkl']

## 3.3 evaluate on test set

In [7]:
# Reload the estimator from the same path the training cell writes to.
# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
model = joblib.load("../models/linear_regression_weather.pkl")

# Predictions for the training and test features; y_pred_train is not
# consumed by the evaluation cells visible in this notebook.
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

In [8]:
# Test-set error of the linear model: the reported "loss" is the mean squared
# error, shown next to the mean absolute error. Both metrics are called with
# their default settings.
test_loss = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

print("Test loss: " + str(test_loss))
print("Test MAE: " + str(test_mae))

Test loss: 0.001833838159962901
Test MAE: 0.012039638188533965


In [None]:
# Compare actual and predicted test values, one stacked panel per target
# column, using y_test and y_pred_test from the evaluation cells.
y_pred = y_pred_test

# Display names for the target columns, applied to the panels by position.
# NOTE(review): assumes this order matches the column order of y_test — confirm
# against the preprocessing step that wrote the .npy files.
# NOTE(review): values are plotted exactly as stored in y_test / y_pred; if the
# targets were scaled upstream, the units in these titles do not apply — confirm.
NUMERIC_COLS = [
    "p (mbar)",
    "T (degC)",
    "Tdew (degC)",
    "rh (%)",
    "VPmax (mbar)",
    "VPact (mbar)",
    "VPdef (mbar)",
    "sh (g/kg)",
    "H2OC (mmol/mol)",
    "rho (g/m**3)",
    "wv (m/s)",
    "max. wv (m/s)",
    "wd (deg)",
]

# Number of leading test samples drawn in every panel.
N_PLOT_SAMPLES = 200

n_features = len(NUMERIC_COLS)

# Titles are assigned by position, so a column-count mismatch would silently
# mislabel (or drop) panels; fail early with a readable message instead.
assert y_test.shape[1] == y_pred.shape[1] == n_features, (
    f"expected {n_features} target columns, "
    f"got y_test={y_test.shape}, y_pred={y_pred.shape}"
)

n_rows, n_cols = n_features, 1
# squeeze=False always returns a 2-D array of Axes, so flatten() keeps working
# even for a single panel; sharex=True aligns the sample index across panels.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(15, 15), sharex=True, squeeze=False
)
axes = axes.flatten()

for i, ax in enumerate(axes):
    ax.plot(y_test[:N_PLOT_SAMPLES, i], label="Actual", color='orange')
    ax.plot(y_pred[:N_PLOT_SAMPLES, i], label="Predicted", color='blue')
    ax.set_title(NUMERIC_COLS[i], fontsize=10)
    ax.legend(fontsize=8)
    ax.grid(True)

# With a shared x-axis only the bottom panel shows tick labels, so label it once.
axes[-1].set_xlabel("Test sample index")

fig.suptitle("Actual vs predicted values (Linear Regression)", fontsize=14)
# Leave the top 3% of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()