In [1]:
import numpy as np
import os
import uuid

import joblib # for persisting models

# models
from scipy.optimize import curve_fit # multip. linear regression
from sklearn.svm import SVR # support vector forrest
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import time
from datetime import datetime

from pathlib import Path

In [2]:
model = SVR()

# get available parameters for model
print(model.get_params())

# check if random_state is included in the parameters
print('random_state' in model.get_params())

{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [None]:
model = SVR()

# get available parameters for models/

In [7]:
file_path = Path('results/stats.json')

df = pd.read_json(file_path, orient='records')

ValueError: Trailing data

In [3]:
df = pd.read_csv('../aggregated_measurements_data.csv', index_col=0)

df_cleaned = df.dropna()
print("Dropped ", len(df) - len(df_cleaned), " rows with NaN values")

Dropped  780  rows with NaN values


In [7]:
import importlib
import svr_test_bed

importlib.reload(svr_test_bed) # reload exported functions

test_sizes = []
sample_sizes = []
features = []
model_arguments = []

test_sizes.append(0.45)
sample_sizes.append(50000)
features.append(['distance', 'c_walls', 'w_walls', 'co2', 'humidity', 'pm25', 'pressure', 'temperature', 'snr'])
model_arguments.append({'kernel': 'rbf'})



test_sizes.append(0.80)
sample_sizes.append(150000)
features.append(['distance', 'c_walls', 'co2', 'humidity', 'pm25', 'pressure', 'temperature', 'snr'])
model_arguments.append({'kernel': 'rbf'})

test_specification = pd.DataFrame({
    "test_size": test_sizes,
    "sample_size": sample_sizes,
    "features": features,
    "model_arguments": model_arguments,
})

# assign ids to tests
test_specification['id'] = [str(uuid.uuid4()).replace('-', '')[:8] for _ in range(len(test_specification))]

test_specification['output_file'] = './results/results.csv'

(test_results_svr, best_svr_model, best_svr_model_id) = svr_test_bed.svr_test_bed(
    data=df_cleaned,
    test_specification=test_specification,
)

# save best model
#create directory if it does not exist
os.makedirs("./best_models", exist_ok=True)
joblib.dump(best_svr_model, f"./best_models/svr_model_{best_svr_model_id}.joblib")

Test 1 of 1 with sample_size 50000, test_size 0.45 and features ['distance', 'c_walls', 'w_walls', 'co2', 'humidity', 'pm25', 'pressure', 'temperature', 'snr']
Creating Model
Fitting Model
Predicting
Test took 00:02:05
Best model id:  613ab369  with r2:  0.45562360832518234


['./best_models/svr_model_613ab369.joblib']

In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Example DataFrame (Replace this with your actual data)
# Define features (X) and target (y)
X = df_cleaned[['distance', 'c_walls', 'w_walls', 'co2', 'humidity', 'pm25', 'pressure', 'temperature', 'snr']]
y = df_cleaned['exp_pl']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Initialize the Decision Tree Regressor
regressor = DecisionTreeRegressor(random_state=42)

# Fit the model
regressor.fit(X_train, y_train)

# Make predictions
y_pred = regressor.predict(X_test)
y_pred2 = regressor.predict(X_train)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

print("Training Data Set")
print("Mean Squared Error:", mean_squared_error(y_train, y_pred2)) # overfitting here
print("R-squared:", r2_score(y_train, y_pred2))

# Optional: Display feature importances
feature_importances = pd.Series(regressor.feature_importances_, index=X.columns)
print("Feature Importances:\n", feature_importances)

Mean Squared Error: 15.798026205573127
R-squared: 0.9717673509900925
Training Data Set
Mean Squared Error: 8.291873963515754e-05
R-squared: 0.9999998517598937
Feature Importances:
 distance       0.099850
c_walls        0.021315
w_walls        0.325613
co2            0.005956
humidity       0.009128
pm25           0.005125
pressure       0.012777
temperature    0.007746
snr            0.512489
dtype: float64


In [5]:
print(df_cleaned.corr()['exp_pl'].sort_values(ascending=False))

exp_pl         1.000000
distance       0.586789
w_walls        0.454843
toa            0.356896
SF             0.290373
c_walls        0.281712
f_count        0.043549
humidity       0.026794
pressure       0.003594
frequency      0.002861
p_count       -0.000204
pm25          -0.017484
co2           -0.074096
temperature   -0.084946
snr           -0.670766
n_power       -0.992823
esp           -0.993180
rssi          -1.000000
Name: exp_pl, dtype: float64
