# Полиномиальная регрессия

In [None]:
!pip install optuna

In [30]:
import optuna
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import mean_squared_log_error

## получение данных

In [31]:
data = pd.read_csv('/content/sample_data/energy_data_2.csv')
print(data.head())

             timestamp  meter_reading  air_temperature  cloud_coverage  \
0  2016-01-30 08:00:00        43.6839              8.3             0.0   
1  2016-01-31 05:00:00        37.5408             12.8             0.0   
2  2016-01-31 17:00:00        52.5571             20.6             0.0   
3  2016-04-08 14:00:00        59.3827             21.7             2.0   
4  2016-05-01 19:00:00       448.0000             31.1             0.0   

   dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_speed  \
0              6.1                0.0              1019.0         2.1   
1             10.0                0.0              1021.9         0.0   
2             11.7                0.0              1020.9         1.5   
3             14.4                0.0              1015.1         3.1   
4             17.2                0.0              1016.1         4.1   

   wind_direction_sin  wind_direction_cos  air_temperature1  hour  
0        5.877853e-01        8.090170e-01       

## получение факторов второго порядка

In [32]:
columns_iterate = data.columns
columns = list(data.columns)
for column1 in columns_iterate:
  for column2 in columns_iterate:
    if (column1 not in ['timestamp', 'meter_reading'] and 
        column2 not in ['timestamp', 'meter_reading']): 
        c = column1 + '_' + column2 
        data[c] = np.multiply(data[column1], data[column2]) 
        columns.append(c)
columns.remove('timestamp')
columns.remove('meter_reading')

  


## нормализация данных

In [33]:
data_norm = MinMaxScaler().fit_transform(data[columns])

# Модель линейной регрессии

In [34]:
def rmsle_err (y, y_pred):
  return((np.log(1 + y) - np.log(1 + y_pred))**2).mean()**0.5

In [35]:
x = data_norm
y = data['meter_reading']
model = LinearRegression().fit(x, y)
print('RMSLE: {0:.5}'.format(rmsle_err(y, model.predict(x))))

RMSLE: 0.18903


# Оптимизация гиперпараметров

## использование Optuna

In [39]:
def objective (trial):
  alpha = trial.suggest_float('alpha', 1e-8, 1, log = True)
  l1_ratio = trial.suggest_float('l1_ratio', 1e-3, 1, log = True)
  regressor_obj = ElasticNet(alpha = alpha, l1_ratio = l1_ratio, max_iter = 1000)
  regressor_obj.fit(x, y)
  y_pred = regressor_obj.predict(x)
  return mean_squared_log_error(y, y_pred)

In [41]:
study = optuna.create_study()
study.optimize(objective, n_trials = 100)

[32m[I 2022-05-22 10:23:50,158][0m A new study created in memory with name: no-name-d3221010-1913-47de-b685-7ade4381c47d[0m
[32m[I 2022-05-22 10:23:50,204][0m Trial 0 finished with value: 0.0499277605231467 and parameters: {'alpha': 0.3003485783923512, 'l1_ratio': 0.002412119675239047}. Best is trial 0 with value: 0.0499277605231467.[0m
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
[32m[I 2022-05-22 10:23:51,302][0m Trial 1 finished with value: 0.03762718867086002 and parameters: {'alpha': 2.1371804577260404e-07, 'l1_ratio': 0.017598527847859084}. Best is trial 1 with value: 0.03762718867086002.[0m
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
[32m[I 2022-05-22 10:23:51,935][0m Trial 2 finished with value: 0.03772653043874018 and parameters: {'alpha': 7.9144707339489e-05, 'l1_ratio': 0.6776127636653178}. Best is trial 1 with value: 0.03762718867086002.[0m
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
[32m[I 

In [42]:
model_optuna = ElasticNet(alpha = study.best_params['alpha'], 
                          l1_ratio = study.best_params['l1_ratio'], max_iter = 1000).fit(x, y)
print('RMSLE: {0:.5}'.format(rmsle_err(y, model_optuna.predict(x))))

RMSLE: 0.19397


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
