In [1]:
import pandas as pd
import numpy as np
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

### Работа с данными

In [2]:
X_data = pd.read_csv('X_data.csv', sep=';')
Y_train = pd.read_csv('Y_train.csv', sep=';', header=None)
Y_subs = pd.read_csv('Y_submit.csv', sep=';', header=None)

In [3]:
X_data.rename(columns={"Unnamed: 0":'date'}, inplace=True)

In [4]:
X_data.head()

Unnamed: 0,date,T_data_1_1,T_data_1_2,T_data_1_3,T_data_2_1,T_data_2_2,T_data_2_3,T_data_3_1,T_data_3_2,T_data_3_3,T_data_4_1,T_data_4_2,T_data_4_3,T_data_5_1,T_data_5_2,T_data_5_3,H_data,AH_data
0,2015-01-01 00:00:00,212,210,211,347,353,347,474,473,481,346,348,355,241,241,243,167.85,9.22
1,2015-01-01 00:01:00,212,211,211,346,352,346,475,473,481,349,348,355,241,241,243,162.51,9.22
2,2015-01-01 00:02:00,212,211,211,345,352,346,476,473,481,352,349,355,242,241,242,164.99,9.22
3,2015-01-01 00:03:00,213,211,211,344,351,346,477,473,481,355,349,355,242,241,242,167.34,9.22
4,2015-01-01 00:04:00,213,211,211,343,350,346,478,473,482,358,349,355,243,241,242,163.04,9.22


In [5]:
Y_train.rename(columns={0:'date', 1:'target'}, inplace=True)

In [6]:
Y_train.head()

Unnamed: 0,date,target
0,2015-01-04 00:05:00,392
1,2015-01-04 01:05:00,384
2,2015-01-04 02:05:00,393
3,2015-01-04 03:05:00,399
4,2015-01-04 04:05:00,400


In [7]:
Y_subs.rename(columns={0:"date",1:"target"}, inplace=True)

In [8]:
Y_subs.head()

Unnamed: 0,date,target
0,2018-05-04 00:05:00,420
1,2018-05-04 01:05:00,420
2,2018-05-04 02:05:00,420
3,2018-05-04 03:05:00,420
4,2018-05-04 04:05:00,420


In [9]:
df = X_data.merge(Y_train, on='date')

In [10]:
df.head()

Unnamed: 0,date,T_data_1_1,T_data_1_2,T_data_1_3,T_data_2_1,T_data_2_2,T_data_2_3,T_data_3_1,T_data_3_2,T_data_3_3,T_data_4_1,T_data_4_2,T_data_4_3,T_data_5_1,T_data_5_2,T_data_5_3,H_data,AH_data,target
0,2015-01-04 00:05:00,277,326,273,322,335,352,505,501,670,326,379,337,231,236,242,153.77,7.9,392
1,2015-01-04 01:05:00,277,253,272,320,333,355,500,501,687,337,396,335,234,242,230,158.27,6.96,384
2,2015-01-04 02:05:00,262,218,260,326,336,330,505,499,443,347,399,332,243,251,240,153.36,7.29,393
3,2015-01-04 03:05:00,243,238,252,327,329,308,520,498,540,342,387,334,257,258,246,153.21,7.11,399
4,2015-01-04 04:05:00,236,238,245,323,320,318,522,501,524,343,371,344,264,263,265,195.71,7.97,400


In [11]:
X = df.drop(['date','target'], axis=1)
y = df.target

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=127)

### Обучение модели

На данный момент CatBoost считается одним из лучших алгоритмов в задачах регрессии. Попробуем его и LightGBM.

In [13]:
model_catboost = CatBoostRegressor(iterations = 2000,
                           learning_rate=0.1,
                           depth = 10,
                           loss_function='RMSE',
                           reg_lambda=7,
                           bootstrap_type='Bernoulli',
                           subsample=0.8,
                           random_state=322,
                           verbose=False
                          )

In [14]:
model_catboost.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7fc6c91177d0>

In [15]:
preds = model_catboost.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, preds))
print("R2: ", r2_score(y_test, preds))

MAE:  9.41164328236063
R2:  0.9257392105892747


In [16]:
model_lgbm = LGBMRegressor(boosting_type='gbdt',
                       colsample_bytree=0.8,
                       learning_rate=0.05,
                       n_estimators=2300,
                       num_leaves=14,
                       subsample=0.6,
                       reg_lambda=7.5,
                       metric='mae',
                       random_state=0,
                       )

In [17]:
model_lgbm.fit(X_train, y_train)

LGBMRegressor(colsample_bytree=0.8, learning_rate=0.05, metric='mae',
              n_estimators=2300, num_leaves=14, random_state=0, reg_lambda=7.5,
              subsample=0.6)

In [18]:
preds = model_lgbm.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, preds))
print("R2: ", r2_score(y_test, preds))

MAE:  9.846704445010033
R2:  0.9210279950470792


In [19]:
model_xgb = XGBRegressor(eta = 0.1,
                         max_depth = 7,
                        subsample = 1)

In [20]:
model_xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eta=0.1, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.100000001, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
preds = model_xgb.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, preds))
print("R2: ", r2_score(y_test, preds))

MAE:  10.025015055408252
R2:  0.9177363353392795


Составим ансамбль из трёх моделей с коэфициентами 0.5, 0.3 и 0.2 соотственно результатам на валидационном наборе

### Загрузим submission

In [22]:
x_submit = X[X.index.isin(Y_subs.index)]

In [23]:
x_submit.head()

Unnamed: 0,T_data_1_1,T_data_1_2,T_data_1_3,T_data_2_1,T_data_2_2,T_data_2_3,T_data_3_1,T_data_3_2,T_data_3_3,T_data_4_1,T_data_4_2,T_data_4_3,T_data_5_1,T_data_5_2,T_data_5_3,H_data,AH_data
0,277,326,273,322,335,352,505,501,670,326,379,337,231,236,242,153.77,7.9
1,277,253,272,320,333,355,500,501,687,337,396,335,234,242,230,158.27,6.96
2,262,218,260,326,336,330,505,499,443,347,399,332,243,251,240,153.36,7.29
3,243,238,252,327,329,308,520,498,540,342,387,334,257,258,246,153.21,7.11
4,236,238,245,323,320,318,522,501,524,343,371,344,264,263,265,195.71,7.97


In [24]:
preds_catboost = model_catboost.predict(x_submit)
preds_lgbm = model_lgbm.predict(x_submit)
preds_xgb = model_xgb.predict(x_submit)
preds = 0.5 * preds_catboost + 0.3 * preds_lgbm + 0.2 * preds_xgb
preds

array([392.96219366, 391.80028545, 399.34068333, ..., 418.05458666,
       430.40211916, 439.9435781 ])

In [25]:
Y_subs.target = preds
Y_subs

Unnamed: 0,date,target
0,2018-05-04 00:05:00,392.962194
1,2018-05-04 01:05:00,391.800285
2,2018-05-04 02:05:00,399.340683
3,2018-05-04 03:05:00,383.645535
4,2018-05-04 04:05:00,402.281265
...,...,...
5803,2018-12-31 19:05:00,384.105214
5804,2018-12-31 20:05:00,396.347789
5805,2018-12-31 21:05:00,418.054587
5806,2018-12-31 22:05:00,430.402119


In [26]:
Y_subs.to_csv('submissions_severstal.csv')