In [None]:
!pip install optuna

In [None]:
import optuna
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the source CSV into a DataFrame and preview its first rows.
csv_path = '/content/sample_data/energy_data_2.csv'
data = pd.read_csv(csv_path)
print(data.head())

             timestamp  meter_reading  air_temperature  cloud_coverage  \
0  2016-01-30 08:00:00        43.6839              8.3             0.0   
1  2016-01-31 05:00:00        37.5408             12.8             0.0   
2  2016-01-31 17:00:00        52.5571             20.6             0.0   
3  2016-04-08 14:00:00        59.3827             21.7             2.0   
4  2016-05-01 19:00:00       448.0000             31.1             0.0   

   dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_speed  \
0              6.1                0.0              1019.0         2.1   
1             10.0                0.0              1021.9         0.0   
2             11.7                0.0              1020.9         1.5   
3             14.4                0.0              1015.1         3.1   
4             17.2                0.0              1016.1         4.1   

   wind_direction_sin  wind_direction_cos  air_temperature1  hour  
0        5.877853e-01        8.090170e-01       

In [None]:
# Data normalization: rescale the three selected feature columns with a min-max scaler.
# The scaled array is wrapped in a fresh DataFrame, so its columns are labelled
# 0, 1, 2 (in the order listed below); later cells rely on those integer labels.
feature_columns = ['air_temperature', 'air_temperature1', 'sea_level_pressure']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[feature_columns])
data_norm = pd.DataFrame(scaled_values)
print(data_norm.head())

          0         1         2
0  0.106109  0.556180  0.739726
1  0.250804  0.623596  0.819178
2  0.501608  0.780899  0.791781
3  0.536977  0.842697  0.632877
4  0.839228  0.747191  0.660274


In [None]:
# DATA SPLIT
# Hold out 20% of the rows for validation; the remaining 80% are used for training.
# random_state is fixed so that the split, and every metric computed from it,
# is identical on each fresh run of the notebook.
train, test, y_train, y_test = train_test_split(
    data_norm, data['meter_reading'], test_size=0.2, random_state=42
)

In [None]:
# REGRESSION MODELS
# Linear and isotonic regression
def rmsle_err (y, y_pred):
  """Return the root mean squared logarithmic error between y and y_pred.

  Computes sqrt(mean((log(1 + y) - log(1 + y_pred)) ** 2)).

  Args:
    y: array-like of true values (list, NumPy array or pandas Series).
    y_pred: array-like of predicted values, same length as ``y``.

  Returns:
    The RMSLE as a float. Values below -1 in either input make the
    logarithm undefined; the resulting NaN is propagated to the return
    value rather than being silently dropped from the mean.
  """
  # Convert to plain arrays so that elements are compared by position
  # (never by pandas index label) and plain Python lists are accepted too.
  y_true = np.asarray(y, dtype=float)
  y_hat = np.asarray(y_pred, dtype=float)
  # log1p(x) is the numerically accurate form of log(1 + x) for small x.
  log_diff = np.log1p(y_true) - np.log1p(y_hat)
  return np.sqrt(np.mean(log_diff ** 2))

In [None]:
# Full target column; the final evaluation cell scores predictions against it.
y = data['meter_reading']

# Fit a linear regression on all normalized features of the training split
# and report its RMSLE on that same training split.
model1 = LinearRegression()
model1.fit(train, y_train)
train_pred_linear = model1.predict(train)
print('RMSLE: {0:.5}'.format(rmsle_err(y_train, train_pred_linear)))

RMSLE: 0.22112


In [None]:
# Fit an isotonic regression on feature column 0 only (the scaled
# 'air_temperature'), clipping predictions for inputs outside the fitted
# range, and report its RMSLE on the training split.
first_feature_train = train[0]
model2 = IsotonicRegression(out_of_bounds='clip')
model2.fit(first_feature_train, y_train)
train_pred_isotonic = model2.predict(first_feature_train)
print("RMSLE: {0:.5}".format(rmsle_err(y_train, train_pred_isotonic)))

RMSLE: 0.21775


In [None]:
# COMBINING THE MODELS
# Use Optuna to search for the optimal blending coefficient
def objective (trial):
  """Optuna objective: MSLE of the blended prediction on the hold-out split.

  The blend is ``alpha * linear + (1 - alpha) * isotonic``, where ``linear``
  is ``model1`` applied to all features and ``isotonic`` is ``model2``
  applied to feature column 0.

  Args:
    trial: Optuna trial used to sample the blending weight ``alpha``.

  Returns:
    Mean squared logarithmic error between ``y_test`` and the blend
    (lower is better).
  """
  # A blending weight is naturally uniform on [0, 1]. Log-scale sampling
  # from 1e-10 would concentrate almost every trial near alpha == 0 and
  # leave the linear model's contribution essentially unexplored.
  alpha = trial.suggest_float('alpha', 0.0, 1.0)
  linear_pred = model1.predict(test)
  isotonic_pred = model2.predict(test[0])
  blended = alpha * linear_pred + (1 - alpha) * isotonic_pred
  return mean_squared_log_error(y_test, blended)

In [None]:
# Run the hyper-parameter search for the blending weight.
# The sampler is seeded so that the sequence of trials, and therefore the
# selected alpha, is reproducible across runs; the direction is spelled out
# because the objective is an error that must be minimized.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[32m[I 2022-05-29 18:38:46,071][0m A new study created in memory with name: no-name-6c1388c6-d4c1-45dd-b0c1-264115257da2[0m
[32m[I 2022-05-29 18:38:46,519][0m Trial 0 finished with value: 0.044155444815048635 and parameters: {'alpha': 0.07386967390919558}. Best is trial 0 with value: 0.044155444815048635.[0m
[32m[I 2022-05-29 18:38:46,525][0m Trial 1 finished with value: 0.044167244638939014 and parameters: {'alpha': 0.03143850473913334}. Best is trial 0 with value: 0.044155444815048635.[0m
[32m[I 2022-05-29 18:38:46,535][0m Trial 2 finished with value: 0.04418070281250125 and parameters: {'alpha': 2.372588428960705e-07}. Best is trial 0 with value: 0.044155444815048635.[0m
[32m[I 2022-05-29 18:38:46,545][0m Trial 3 finished with value: 0.04417935918064203 and parameters: {'alpha': 0.0027625870928715715}. Best is trial 0 with value: 0.044155444815048635.[0m
[32m[I 2022-05-29 18:38:46,553][0m Trial 4 finished with value: 0.044180702829554816 and parameters: {'alpha': 2.

In [None]:
# Score both base models and their blend on the whole (normalized) dataset.
y_pred1 = model1.predict(data_norm)     # linear regression, all features
y_pred2 = model2.predict(data_norm[0])  # isotonic regression, feature column 0

# Weighted blend with the best weight found by the study:
#   alpha * linear + (1 - alpha) * isotonic
# The weight (1 - alpha) must be parenthesized on its own; writing
# (1 - alpha * y_pred2) instead produces meaningless predictions.
alpha = study.best_params['alpha']
y_pred = alpha * y_pred1 + (1 - alpha) * y_pred2

print('RMSLE линейной регрессии {0:.5}'.format(rmsle_err(y, y_pred1)))
print('RMSLE изотонической регрессии {0:.5}'.format(rmsle_err(y, y_pred2)))
print('RMSLE ансамбля моделей {0:.5}'.format(rmsle_err(y, y_pred)))

RMSLE линейной регрессии 0.21966
RMSLE изотонической регрессии 0.21626
RMSLE ансамбля моделей 5.0419


  after removing the cwd from sys.path.
