In [138]:
import pandas as pd
import numpy
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [168]:
data = pd.read_csv('gb_dfs_concat_dropna.csv')

data = data.sort_values(['month', 'day', 'hour'])

In [None]:
# mean_delay
# trafic
# count

In [163]:
def make_time_features(data, timestamps=[24, 12, 6, 5, 4, 3, 2, 1], target='mean_delay'):
    targets = []
    features = []
    for i in range(data.shape[0]):
        temp_arr = []
        for s in timestamps:
            temp = data.shift(s)[target].loc[i]
            temp_arr.append(temp)
            
        targets.append(data[target].loc[i])
        features.append(temp_arr)
    
    res_df = pd.DataFrame(features)
    res_df['target_{}'.format(target)] = targets
    
    rename_dict = {}
    for i in range(len(timestamps)):
        rename_dict[i] = 'lag_{}_hours'.format(timestamps[i])
    res_df = res_df.rename(columns=rename_dict)
    
    return res_df

In [164]:
def make_mean_features(data, timestamps=[24, 12, 6, 5, 4, 3, 2], target='mean_delay'):
    target_column = data['target_{}'.format(target)]
    features = []
    for timestamp in timestamps:
        column = [np.nan] * (timestamp-1)
        for i in range(timestamp, len(data)):
            data_slice = target_column[i-timestamp:i]
            column.append(np.mean(data_slice))
        features.append(column)
        
    res_df = pd.DataFrame(np.array(features).T, columns=['mean_{}_hours'.format(timestamps[j])
                          for j in range(len(timestamps))])
    #res_df['target_{}'.format(target)] = target_column
    return res_df

In [179]:
def make_data(data, target='mean_delay'):
    lags = make_time_features(data, target=target)
    X = make_mean_features(lags, target=target)
    X = pd.concat((X, lags), axis=1)
    y = X['target_{}'.format(target)]
    X = X.fillna(0).drop('target_{}'.format(target), axis=1)
    return X, y

In [182]:
data_mean_delay = make_data(data)
data_count = make_data(data, target='count')
data_mean_input_trafic = make_data(data, target='mean_input_trafic')
data_mean_output_trafic = make_data(data, target='mean_output_trafic')

In [128]:
class TargetForecasting(object):
    def __init__(self, estimator='cat', **params):
        if estimator == 'cat':
            self.estimator = CatBoostRegressor(**params)
        elif estimator == 'rf':
            self.estimator = RandomForestRegressor(**params)
        elif estimator == 'lin':
            self.estimator = LinearRegression(**params)
        
    def fit(self, X, y, **fit_params):
        self.estimator.fit(X, y, **fit_params)
    
    def predict(self, X):
        return self.estimator.predict(X)

In [185]:
def make_model(data_target):
    X, y = data_target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = TargetForecasting(estimator='rf', n_estimators=40)
    model.fit(X_train, y_train)
    print('MAE:', mean_absolute_error(y_test, model.predict(X_test)))

In [187]:
make_model(data_mean_delay)
make_model(data_count)
make_model(data_mean_input_trafic)
make_model(data_mean_output_trafic)

MAE: 0.01685603499301018
MAE: 6780.501704545456
MAE: 61317128.29828878
MAE: 258767596.0361783
