In [4]:
import optuna
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.metrics import mean_squared_log_error

In [7]:
# Load the dataset from a CSV file (the path is relative to the working directory)
# and print its first rows as a quick sanity check.
data = pd.read_csv('energy_data_2.csv')
print(data.head())

             timestamp  meter_reading  air_temperature  cloud_coverage  \
0  2016-01-30 08:00:00        43.6839              8.3             0.0   
1  2016-01-31 05:00:00        37.5408             12.8             0.0   
2  2016-01-31 17:00:00        52.5571             20.6             0.0   
3  2016-04-08 14:00:00        59.3827             21.7             2.0   
4  2016-05-01 19:00:00       448.0000             31.1             0.0   

   dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_speed  \
0              6.1                0.0              1019.0         2.1   
1             10.0                0.0              1021.9         0.0   
2             11.7                0.0              1020.9         1.5   
3             14.4                0.0              1015.1         3.1   
4             17.2                0.0              1016.1         4.1   

   wind_direction_sin  wind_direction_cos  air_temperature1  hour  
0        5.877853e-01        8.090170e-01       

In [8]:
# Normalize the feature columns with min-max scaling.
# The scaler is fitted on the whole dataset; the target column is not included.

feature_columns = [
    'air_temperature',
    'sea_level_pressure',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'wind_speed',
    'wind_direction_sin',
    'wind_direction_cos',
    'air_temperature1',
]
scaler = MinMaxScaler()
data_norm = scaler.fit_transform(data[feature_columns])

In [13]:
# Linear regression model
# without regularization

def rmsle_err(y, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log(1 + y) - log(1 + y_pred)) ** 2))``.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y``.

    Returns
    -------
    numpy.float64
        The RMSLE. The result is NaN if any value is below -1, because the
        logarithm is undefined there.

    Notes
    -----
    Both inputs are converted to float arrays first, so plain Python lists
    are accepted and values are compared by position.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    # log1p stays accurate for values close to zero, where log(1 + v)
    # would round 1 + v to exactly 1 and return 0.
    log_diff = np.log1p(y_true) - np.log1p(y_hat)
    return (log_diff ** 2).mean() ** 0.5

In [14]:
# Feature matrix (normalized weather columns) and target (meter readings).
x = data_norm
y = data['meter_reading']

# Fit ordinary least squares, then score it on the same data it was fitted on.
model = LinearRegression()
model.fit(x, y)
print("RMSLE: {0:.5}".format(rmsle_err(y, model.predict(x))))

RMSLE: 0.21213


In [17]:
# Random search
# Use random search over two hyperparameters: alpha and l1_ratio

model_el = ElasticNet()

In [16]:
# Sampling distributions for the hyperparameters:
# both alpha and l1_ratio are drawn uniformly from [0, 1].

distribution = {
    'alpha': uniform(loc=0, scale=1),
    'l1_ratio': uniform(loc=0, scale=1),
}

In [18]:
# Randomized search over the ElasticNet hyperparameters; random_state fixes the sampled candidates.
# NOTE(review): no `scoring` is passed, so candidates are presumably ranked by the
# estimator's default score rather than by RMSLE — confirm this is intended.
clf = RandomizedSearchCV(
    estimator=model_el,
    param_distributions=distribution,
    random_state=0,
)

In [19]:
# Run the search and show the best hyperparameters it found

search = clf.fit(x , y)
print(search.best_params_)

{'alpha': 0.02021839744032572, 'l1_ratio': 0.832619845547938}


In [20]:
# Build the model with the best hyperparameters from the random search
# and evaluate its accuracy on the data it was fitted on.

model_el = ElasticNet(**search.best_params_)  # keys are 'alpha' and 'l1_ratio'
model_el.fit(x, y)
print("RMSLE: {0:.5}".format(rmsle_err(y, model_el.predict(x))))

RMSLE: 0.21203


In [22]:
# OPTUNA
# Combine fitting the model and scoring it for a given set of hyperparameters into one step

def objective(trial):
    """Optuna objective: fit ElasticNet with suggested hyperparameters and score it.

    Samples ``alpha`` from [1e-8, 1] and ``l1_ratio`` from [1e-3, 1], both on a
    log scale, fits the model on the module-level ``x`` and ``y``, and returns
    the mean squared logarithmic error of its predictions on that same data.

    The returned value is the squared form of RMSLE; since the square root is
    monotonic, minimizing either one selects the same hyperparameters.
    """
    params = {
        'alpha': trial.suggest_float('alpha', 1e-8, 1, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', 1e-3, 1, log=True),
    }
    fitted = ElasticNet(**params).fit(x, y)
    return mean_squared_log_error(y, fitted.predict(x))

In [25]:
# Seed the sampler so the hyperparameter search gives the same trials on every
# Restart & Run All; without a seed each run explores different candidates.
# The objective returns an error value, so the study minimizes it.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials = 100)

[32m[I 2022-05-15 21:40:49,663][0m A new study created in memory with name: no-name-f482db05-7bcc-41f1-832c-f68f0159d8f6[0m
[32m[I 2022-05-15 21:40:49,675][0m Trial 0 finished with value: 0.04499789822993763 and parameters: {'alpha': 1.2360747354830308e-06, 'l1_ratio': 0.003453372749616612}. Best is trial 0 with value: 0.04499789822993763.[0m
[32m[I 2022-05-15 21:40:49,694][0m Trial 1 finished with value: 0.045509221864268604 and parameters: {'alpha': 0.009174106530026553, 'l1_ratio': 0.03591286553968817}. Best is trial 0 with value: 0.04499789822993763.[0m
[32m[I 2022-05-15 21:40:49,712][0m Trial 2 finished with value: 0.04558793648858625 and parameters: {'alpha': 0.009662291578815995, 'l1_ratio': 0.029771058162605665}. Best is trial 0 with value: 0.04499789822993763.[0m
[32m[I 2022-05-15 21:40:49,728][0m Trial 3 finished with value: 0.04499806570332884 and parameters: {'alpha': 2.9299606853857364e-08, 'l1_ratio': 0.014434271629454353}. Best is trial 0 with value: 0.0449

In [26]:
# Hyperparameters of the best trial in the study
print(study.best_params)

{'alpha': 0.0020517131926672673, 'l1_ratio': 0.0021104131881332167}


In [30]:
# Refit ElasticNet with the best hyperparameters found by Optuna
# and report RMSLE on the data it was fitted on.
model_optuna = ElasticNet(**study.best_params)  # keys are 'alpha' and 'l1_ratio'
model_optuna.fit(x, y)
print("RMSLE: {0:.5}".format(rmsle_err(y, model_optuna.predict(x))))

RMSLE: 0.21183


In [None]:
# TASK
# Build ElasticNet models on different subsets of the weather features and determine which subset
# reaches the lowest RMSLE after hyperparameter optimization with Optuna