In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
from tqdm.notebook import tqdm
from itertools import product

In [None]:
# Load the long-format training table. Its index labels are underscore-separated
# strings; the pieces are unpacked into explicit columns below.
train_df = pd.read_parquet('data/train.parquet')
index_split = train_df.index.str.split('_')

# First piece -> row_num, second piece -> row_type, last piece -> integer timestamp.
# row_id (row_num + '_' + row_type) names one series.
train_df = train_df.assign(
    row_num=index_split.str.get(0),
    row_type=index_split.str.get(1),
    timestamp=index_split.str.get(-1).astype(int),
    row_id=lambda frame: frame['row_num'] + '_' + frame['row_type'],
)

# Wide layout: one row per row_id, one column per timestamp holding 'Values'.
time_line_df = train_df.pivot(index='row_id', columns='timestamp', values='Values')
# Assign the labels as a plain list so the columns index ends up unnamed;
# melt() then falls back to its default 'variable' column name.
time_line_df.columns = [f'tr_{col}' for col in time_line_df.columns]

In [None]:
# Preview the raw table together with the row_num / row_type / timestamp / row_id
# columns derived from its index.
train_df.head()

In [None]:
# Preview the wide table: one row per row_id, one 'tr_<timestamp>' column per timestamp.
time_line_df.head()

In [None]:
# Window sizes, counted in samples (i.e. in columns of the wide time-line table).
# NOTE(review): 288 samples per day suggests 5-minute sampling -- confirm against the data source.
day_period = 288
# A week is seven day-periods; used as the first target position and as the weekly stride.
week_period = day_period * 7
# Number of immediately preceding samples exposed as lag features.
n_prev_values = 20

In [None]:
def create_target_df(tl_df, start_time, step_time):
    """Turn a strided selection of time-line columns into a long target table.

    Parameters
    ----------
    tl_df : pandas.DataFrame
        Wide table whose index is named ``row_id`` and whose column labels
        look like ``'<prefix>_<integer>'`` (e.g. ``'tr_42'``).
    start_time : int
        Position of the first column to keep.
    step_time : int
        Stride, in column positions, between kept columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id``, ``time`` and ``target``, one row per
        (row_id, kept column) pair. ``time`` is the integer parsed from the
        column label, not the column position.
    """
    sampled_columns = tl_df.iloc[:, start_time::step_time]

    # Keep the row_id index while melting, then expose it as a regular column.
    long_df = sampled_columns.melt(ignore_index=False, value_name='target')
    long_df = long_df.reset_index()

    # The integer after the first underscore of the column label is the time.
    label_parts = long_df['variable'].str.split('_')
    long_df['time'] = label_parts.str.get(1).astype(int)

    return long_df[['row_id', 'time', 'target']]

# Targets: every 5th column, starting at position `week_period` so that each target
# has at least one earlier column at the same weekly offset.
target_df = create_target_df(time_line_df, week_period, 5)

In [None]:
# Preview the long target table (row_id, time, target).
target_df.head()

In [None]:
def get_prev_weeks_mean(tl_df, week_period):
    """Average, per row, the earlier columns that share a weekly offset.

    For every column position ``i >= week_period`` the columns at positions
    ``i % week_period, i % week_period + week_period, ...`` strictly below
    ``i`` are averaged row-wise.

    Parameters
    ----------
    tl_df : pandas.DataFrame
        Wide table with one column per time step.
    week_period : int
        Number of column positions that make up one week.

    Returns
    -------
    pandas.DataFrame
        The per-position frames stacked with ``pd.concat``: the reset index of
        ``tl_df``, ``mean_prev_weeks`` and ``time`` (the column position).
        Index values repeat across positions.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no position satisfies ``i >= week_period``.
    """
    def _weekly_mean_at(position):
        # Same offset within the week, previous weeks only (slice stops before `position`).
        same_offset_history = tl_df.iloc[:, position % week_period:position:week_period]
        frame = same_offset_history.mean(axis=1).reset_index()
        # The unnamed mean Series becomes column 0 after reset_index().
        frame = frame.rename(columns={0: 'mean_prev_weeks'})
        frame['time'] = position
        return frame

    # Iterate over every position so the progress bar spans the whole table;
    # positions inside the first week have no history and are filtered out.
    per_position_frames = [
        _weekly_mean_at(position)
        for position in tqdm(range(tl_df.shape[1]))
        if position >= week_period
    ]

    return pd.concat(per_position_frames)

# Weekly-offset baseline for every column position from `week_period` onwards.
prev_week_mean = get_prev_weeks_mean(time_line_df, week_period)
# Default (inner) merge: only (row_id, time) pairs present on both sides survive.
# NOTE(review): `time` here is a column position, while target_df's `time` is parsed from the
# column label -- the join lines up only if labels equal positions; confirm.
# Re-running this cell without rebuilding target_df merges the feature in a second time.
target_df = target_df.merge(prev_week_mean, on=['row_id', 'time'])
target_df.head()

In [None]:
def get_prev_values(tl_df, n_prev_values):
    """Collect, for each column position, the values just before it as lag features.

    For every column position ``i >= n_prev_values`` the ``n_prev_values``
    columns immediately before ``i`` are relabelled so that ``'<k>_prev'``
    holds the value ``k`` positions before ``i``.

    Parameters
    ----------
    tl_df : pandas.DataFrame
        Wide table with one column per time step.
    n_prev_values : int
        Number of preceding columns to expose.

    Returns
    -------
    pandas.DataFrame
        The per-position frames stacked with ``pd.concat``: the reset index of
        ``tl_df``, ``'<n_prev_values>_prev' ... '1_prev'`` and ``time`` (the
        column position). Index values repeat across positions.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no position satisfies ``i >= n_prev_values``.
    """
    # Oldest lag first, matching the left-to-right order of the sliced window.
    lag_labels = [f'{lag}_prev' for lag in range(n_prev_values, 0, -1)]

    def _lags_at(position):
        window = tl_df.iloc[:, position - n_prev_values:position]
        frame = window.set_axis(lag_labels, axis=1).reset_index()
        frame['time'] = position
        return frame

    # Iterate over every position so the progress bar spans the whole table;
    # positions without a full window are filtered out.
    per_position_frames = [
        _lags_at(position)
        for position in tqdm(range(tl_df.shape[1]))
        if position >= n_prev_values
    ]

    return pd.concat(per_position_frames)

# Lag features ('<n_prev_values>_prev' ... '1_prev') for every column position that has a full window.
prev_values = get_prev_values(time_line_df, n_prev_values)
# Default (inner) merge on (row_id, time); `time` in prev_values is a column position.
# Re-running this cell without rebuilding target_df merges the lag columns in a second time.
target_df = target_df.merge(prev_values, on=['row_id', 'time'])
target_df.head()

In [None]:
# Persist the assembled target/feature table; the modelling part reloads it from this path.
target_df.to_parquet('data/target.parquet')

# Train/test split

In [None]:
# Reload the saved target/feature table so the modelling cells can run without
# recomputing the features.
target_df = pd.read_parquet('data/target.parquet')

In [None]:
# Chronological split: rows whose time lies within `test_size` of the latest time
# go to the test set, everything earlier is used for training.
test_size = week_period
threshold = target_df['time'].max() - test_size

# Note: `train_df` is rebound here and no longer refers to the raw table loaded at the top.
train_df = target_df.loc[lambda frame: frame['time'] < threshold].copy()
test_df = target_df.loc[lambda frame: frame['time'] >= threshold].copy()

# Simple model

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Identifier of the single series modelled below; row_id values have the
# form '<row_num>_<row_type>'.
row_id = '0_0'

In [None]:
# Placeholder column for predictions; rows stay NaN until a model has predicted for them.
test_df['predict'] = np.nan

In [None]:
# Columns that are identifiers, the target, or the weekly baseline -- none of them
# is fed to the model, which only sees the lag features.
non_feature_cols = ['row_id', 'time', 'target', 'mean_prev_weeks']

# Training rows of the selected series.
ids = train_df['row_id'] == row_id
X_train = train_df.loc[ids].drop(columns=non_feature_cols)
y_train = train_df.loc[ids, 'target']

# `ids` is rebound to the test-set mask; later cells index test_df with it.
ids = test_df['row_id'] == row_id
X_test = test_df.loc[ids].drop(columns=non_feature_cols + ['predict'])
y_test = test_df.loc[ids, 'target']

In [None]:
# Compare the true target with the weekly-offset baseline on the test window
# of the selected series; each column becomes one labelled line.
for column in ('target', 'mean_prev_weeks'):
    sns.lineplot(data=test_df.loc[ids], x='time', y=column, label=column)

plt.show()

In [None]:
# Ridge regression with scikit-learn's default settings, fitted on the lag
# features of the selected series only.
model = Ridge()
model.fit(X_train, y_train)
# Predictions for the test rows of the same series, in the row order of X_test.
y_pred = model.predict(X_test)

In [None]:
# Write the predictions into the rows of the selected series; `ids` is the
# test-set mask built when X_test was selected.
test_df.loc[ids, 'predict'] = y_pred

In [None]:
# Overlay the true target, the ridge prediction and the weekly-offset baseline
# for the selected series; each column becomes one labelled line.
for column in ('target', 'predict', 'mean_prev_weeks'):
    sns.lineplot(data=test_df.loc[ids], x='time', y=column, label=column)

plt.show()

In [None]:
# Fitted ridge weights, one per feature column of X_train, in X_train's column order.
model.coef_