In [1]:
import numpy as np
import pandas as pd
from scripts import get_data
from scripts import train_test_scaled_split
from scripts import Model

# Fetch the dataset with smoothing enabled, then split it into train/test
# parts. `cols` and `dates` are reused by the modelling cells further down.
smoothed_data = get_data(smooth=True)
X_train, y_train, X_test, y_test, cols, dates = train_test_scaled_split(
    smoothed_data,
    include_x=True,
)

In [32]:
import time
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestRegressor as RFR


# Exhaustive grid of RandomForestRegressor hyper-parameters; every combination
# below is fitted and scored `n` times and the results are averaged.
param_grid = {
    'bootstrap': [True, False],
    'max_features': ['auto', 'sqrt', 'log2'],
    'n_estimators': [50, 100, 200],
    'criterion': ['mse', 'mae'],
    'max_depth': [5, 10, 15, 20, None],
    }

# Number of fit/score repetitions averaged per parameter combination.
n = 3

# One dict per parameter combination; turned into a DataFrame once at the end.
records = []
for params in tqdm(ParameterGrid(param_grid)):

    ex_time = 0.0
    # Running sum of every metric returned by `model.score`, keyed by metric
    # name, so the loop does not depend on how many metrics there are.
    score_sums = {}
    for _ in range(n):
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments. The interval covers both fit and score.
        start_time = time.perf_counter()
        model = Model(estimator_fn = RFR, dates=dates, M = 120, **params).fit(X_train, y_train)
        current_scores = model.score(X_test, y_test, all_three = True).to_dict()
        for name, value in current_scores.items():
            score_sums[name] = score_sums.get(name, 0.0) + value
        ex_time += time.perf_counter() - start_time

    # Row layout: the hyper-parameters, the mean run time, then `<metric>_avg`.
    row = dict(params)
    row['time_avg'] = ex_time / n
    for name, total in score_sums.items():
        row[name + '_avg'] = total / n
    records.append(row)


df = pd.DataFrame(records)
# Show the five combinations with the best averaged test score.
df.sort_values('test_avg', axis = 0, ascending = False).head()

100%|██████████| 180/180 [1:03:09<00:00, 21.05s/it]


Unnamed: 0,bootstrap,criterion,max_depth,max_features,n_estimators,time_avg,sin_params_avg,train_avg,test_avg
82,True,mae,,auto,100,11.155625,0.999,0.760667,0.269333
64,True,mae,15.0,auto,100,11.210051,0.999,0.752333,0.246667
9,True,mse,10.0,auto,50,1.715407,0.999,0.745,0.223333
55,True,mae,10.0,auto,100,15.165274,0.998667,0.747333,0.219333
81,True,mae,,auto,50,6.312004,0.999,0.750667,0.203333


In [11]:
from sklearn.ensemble import RandomForestRegressor as RFR

# Options consumed by the project's Model wrapper.
model_options = dict(
    M = 120,
    dates = dates,
    vis_predict = True,
    write_imgs = True,
)
# Hyper-parameters for the random forest itself.
forest_params = dict(
    bootstrap = True,
    criterion = 'mae',
    max_features = 'auto',
    n_estimators = 100,
)

model = Model(estimator_fn = RFR, **model_options, **forest_params)
_ = model.fit(X_train, y_train)
_ = model.predict(X_test, y_test)

# Print the feature importances of each fitted estimator, most important
# first, labelled with the key it is stored under in `model.estimators()`.
for target, estimator in model.estimators().items():
    importances = pd.Series(estimator.feature_importances_, index = cols, name = target)
    print(importances.sort_values(ascending = False))
    print('\n\n')

sin_params    0.999
train         0.760
test          0.205
Name: R2 of RandomForestRegressor(criterion='mae', n_estimators=200, random_state=0), dtype: float64


x           0.826298
rsum        0.094095
t_inside    0.022235
co2         0.012992
t_avg       0.011686
h_avg       0.008529
p_avg       0.008388
dp_avg      0.008015
lamps       0.007434
imp_day     0.000330
Name: sin_a, dtype: float64



x           0.557243
dp_avg      0.112814
t_inside    0.093834
rsum        0.053345
lamps       0.048445
t_avg       0.045286
p_avg       0.034403
co2         0.026222
h_avg       0.026086
imp_day     0.002322
Name: sin_b, dtype: float64



x           0.662148
dp_avg      0.093208
lamps       0.068368
t_avg       0.045328
t_inside    0.033513
p_avg       0.032429
rsum        0.026450
co2         0.019495
h_avg       0.018734
imp_day     0.000327
Name: sin_c, dtype: float64



x           0.382843
t_inside    0.239305
dp_avg      0.157825
t_avg       0.071254
lamps       0.051187
rsum        0.033794
co2         0.023934
h_avg       0.021482
p_avg       0.017969
imp_day     0.000405
Name: sin_d, dtype: float64



