In [1]:
import numpy as np
import os
import uuid

import joblib # for persisting models

# models
from scipy.optimize import curve_fit # least-squares curve fitting (multiple linear regression)
from sklearn.svm import SVR # support vector regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import GridSearchCV # select hyper parameters

import pandas as pd
import time
from datetime import datetime

# Load the aggregated measurements; the first CSV column becomes the index.
df = pd.read_csv('../aggregated_measurements_data.csv', index_col=0)

# Discard every row containing a NaN in any column and report how many were lost.
df_cleaned = df.dropna()
n_dropped = df.shape[0] - df_cleaned.shape[0]
print("Dropped ", n_dropped, " rows with NaN values")
print("Remaining rows: ", df_cleaned.shape[0])

# Model inputs (features) and the regression target.
feature_columns = [
    'distance',
    'c_walls',
    'w_walls',
    'co2',
    'humidity',
    'pm25',
    'pressure',
    'temperature',
    'snr',
]
X = df_cleaned[feature_columns]
y = df_cleaned['exp_pl']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Dropped  780  rows with NaN values
Remaining rows:  492451


In [2]:
# Candidate hyper-parameters for the decision tree: 4 * 3 * 3 * 3 = 108 combinations.
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

regressor = DecisionTreeRegressor(random_state=42)

# Exhaustive search with 5-fold cross-validation on all available CPUs.
# GridSearchCV maximises its score, so MSE is supplied in negated form
# ('neg_mean_squared_error'): a higher score means a lower error.
grid_search = GridSearchCV(
    regressor,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
# best_score_ holds the negated MSE, so the sign is flipped back to report a plain MSE.
print("Best CV Score (MSE):", -grid_search.best_score_)

# Estimator refitted on the whole training set with the winning parameters.
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best CV Score (MSE): 11.707168150319111
Test MSE: 11.809730883322613


In [3]:
# Score the tuned model on the data it was fitted on and on the held-out split.
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Training Set Performance:")
print(f"  MSE: {train_mse:.4f}")
print(f"  R²: {train_r2:.4f}")

print("Test Set Performance:")
print(f"  MSE: {test_mse:.4f}")
print(f"  R²: {test_r2:.4f}")

# A fitted model almost always has a lower error on its own training data, so a
# bare `train_mse < test_mse` comparison would warn on practically every run.
# Flag overfitting only when the test error exceeds the training error by a
# clear relative margin. The ratio is a tunable heuristic, not a standard value.
OVERFIT_MSE_RATIO = 1.5

# Written as a product rather than a division so that train_mse == 0 is safe.
if test_mse > OVERFIT_MSE_RATIO * train_mse:
    print(
        "Warning: Model may be overfitting "
        f"(test MSE {test_mse:.4f} exceeds {OVERFIT_MSE_RATIO}x training MSE {train_mse:.4f})."
    )
else:
    print("Model performance looks balanced.")

Training Set Performance:
  MSE: 4.0101
  R²: 0.9928
Test Set Performance:
  MSE: 11.8097
  R²: 0.9790
