In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
from tqdm.notebook import tqdm
from itertools import product

In [4]:
train_df = pd.read_parquet('data/train.parquet')
index_split = train_df.index.str.split('_')
train_df['row_num'] = index_split.str.get(0)
train_df['row_type'] = index_split.str.get(1)
train_df['timestamp'] = index_split.str.get(-1).astype(int)
train_df['row_id'] = train_df['row_num'] + '_' + train_df['row_type']

In [5]:
train_df = train_df.reset_index()[['row_id', 'timestamp', 'Values']].rename(columns={'Values': 'value'})
train_df.head()

Unnamed: 0,row_id,timestamp,value
0,0_0,0,133.0
1,0_0,1,114.0
2,0_0,2,140.0
3,0_0,3,106.0
4,0_0,4,117.0


In [6]:
n_predicted_values = 100

for i in tqdm(range(1, n_predicted_values + 1)):
    target_df = train_df[['row_id', 'timestamp', 'value']]\
        .copy()\
        .rename(columns={'value': f'target_{i}'})
    
    target_df['timestamp'] -= i

    train_df = train_df.merge(target_df, on=['row_id', 'timestamp'])

train_df.head()

  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,row_id,timestamp,value,target_1,target_2,target_3,target_4,target_5,target_6,target_7,...,target_91,target_92,target_93,target_94,target_95,target_96,target_97,target_98,target_99,target_100
0,0_0,0,133.0,114.0,140.0,106.0,117.0,110.0,130.0,112.0,...,454.0,440.0,478.0,472.0,400.0,445.0,454.0,461.0,483.0,449.0
1,0_0,1,114.0,140.0,106.0,117.0,110.0,130.0,112.0,115.0,...,440.0,478.0,472.0,400.0,445.0,454.0,461.0,483.0,449.0,457.0
2,0_0,2,140.0,106.0,117.0,110.0,130.0,112.0,115.0,93.0,...,478.0,472.0,400.0,445.0,454.0,461.0,483.0,449.0,457.0,457.0
3,0_0,3,106.0,117.0,110.0,130.0,112.0,115.0,93.0,90.0,...,472.0,400.0,445.0,454.0,461.0,483.0,449.0,457.0,457.0,462.0
4,0_0,4,117.0,110.0,130.0,112.0,115.0,93.0,90.0,93.0,...,400.0,445.0,454.0,461.0,483.0,449.0,457.0,457.0,462.0,467.0


In [11]:
day_period = 288
week_period = 288 * 7

In [15]:
max_prev_weeks = 4
for i in tqdm(range(1, max_prev_weeks + 1)):
    prev_week = train_df[['row_id', 'timestamp', 'value']]\
        .rename(columns={'value': f'prev_week_{i}'})\
        .copy()
    
    prev_week['timestamp'] += week_period * i
    train_df = train_df.merge(
        prev_week,
        on=['row_id', 'timestamp'],
        how='left'
    )

train_df.head()

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,row_id,timestamp,value,target_1,target_2,target_3,target_4,target_5,target_6,target_7,...,target_95,target_96,target_97,target_98,target_99,target_100,prev_week_1,prev_week_2,prev_week_3,prev_week_4
0,0_0,0,133.0,114.0,140.0,106.0,117.0,110.0,130.0,112.0,...,400.0,445.0,454.0,461.0,483.0,449.0,,,,
1,0_0,1,114.0,140.0,106.0,117.0,110.0,130.0,112.0,115.0,...,445.0,454.0,461.0,483.0,449.0,457.0,,,,
2,0_0,2,140.0,106.0,117.0,110.0,130.0,112.0,115.0,93.0,...,454.0,461.0,483.0,449.0,457.0,457.0,,,,
3,0_0,3,106.0,117.0,110.0,130.0,112.0,115.0,93.0,90.0,...,461.0,483.0,449.0,457.0,457.0,462.0,,,,
4,0_0,4,117.0,110.0,130.0,112.0,115.0,93.0,90.0,93.0,...,483.0,449.0,457.0,457.0,462.0,467.0,,,,


In [19]:
prev_week_columns = [f'prev_week_{i}' for i in range(1, max_prev_weeks + 1)]
train_df['prev_week_mean'] = train_df[prev_week_columns].mean(axis=1)
train_df.head()

Unnamed: 0,row_id,timestamp,value,target_1,target_2,target_3,target_4,target_5,target_6,target_7,...,target_96,target_97,target_98,target_99,target_100,prev_week_1,prev_week_2,prev_week_3,prev_week_4,prev_week_mean
2016,0_0,2016,124.0,115.0,105.0,116.0,95.0,127.0,117.0,106.0,...,424.0,421.0,443.0,474.0,419.0,133.0,,,,133.0
2017,0_0,2017,115.0,105.0,116.0,95.0,127.0,117.0,106.0,96.0,...,421.0,443.0,474.0,419.0,445.0,114.0,,,,114.0
2018,0_0,2018,105.0,116.0,95.0,127.0,117.0,106.0,96.0,107.0,...,443.0,474.0,419.0,445.0,442.0,140.0,,,,140.0
2019,0_0,2019,116.0,95.0,127.0,117.0,106.0,96.0,107.0,97.0,...,474.0,419.0,445.0,442.0,419.0,106.0,,,,106.0
2020,0_0,2020,95.0,127.0,117.0,106.0,96.0,107.0,97.0,83.0,...,419.0,445.0,442.0,419.0,445.0,117.0,,,,117.0


In [20]:
number_of_prev_values = 20

for i in tqdm(range(number_of_prev_values)):
    prev_value = train_df[['row_id', 'timestamp', 'value']]\
        .copy()\
        .rename(columns={'value': f'prev_value_{i}'})
    
    prev_value['timestamp'] += i

    train_df = train_df.merge(prev_value, on=['row_id', 'timestamp'])

train_df.head()

  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,row_id,timestamp,value,target_1,target_2,target_3,target_4,target_5,target_6,target_7,...,prev_value_10,prev_value_11,prev_value_12,prev_value_13,prev_value_14,prev_value_15,prev_value_16,prev_value_17,prev_value_18,prev_value_19
0,0_0,2206,385.0,379.0,400.0,403.0,382.0,424.0,396.0,453.0,...,426.0,451.0,456.0,480.0,453.0,434.0,422.0,430.0,403.0,440.0
1,0_0,2207,379.0,400.0,403.0,382.0,424.0,396.0,453.0,487.0,...,422.0,426.0,451.0,456.0,480.0,453.0,434.0,422.0,430.0,403.0
2,0_0,2208,400.0,403.0,382.0,424.0,396.0,453.0,487.0,491.0,...,407.0,422.0,426.0,451.0,456.0,480.0,453.0,434.0,422.0,430.0
3,0_0,2209,403.0,382.0,424.0,396.0,453.0,487.0,491.0,443.0,...,475.0,407.0,422.0,426.0,451.0,456.0,480.0,453.0,434.0,422.0
4,0_0,2210,382.0,424.0,396.0,453.0,487.0,491.0,443.0,453.0,...,402.0,475.0,407.0,422.0,426.0,451.0,456.0,480.0,453.0,434.0


In [21]:
train_df.to_parquet('data/train.parquet')

# Train test split 

In [None]:
target_df = pd.read_parquet('data/target.parquet')

In [None]:
test_size = week_period
threshold = target_df['time'].max() - test_size
train_df = target_df[target_df['time'] < threshold].copy()
test_df = target_df[target_df['time'] >= threshold].copy()

# Simple model

In [None]:
from sklearn.linear_model import Ridge

In [None]:
row_id = '0_0'

In [None]:
test_df['predict'] = np.nan

In [None]:
ids = train_df['row_id'] == row_id
X_train = train_df[ids].drop(['row_id', 'time', 'target', 'mean_prev_weeks'], axis=1)
y_train = train_df.loc[ids, 'target']

ids = test_df['row_id'] == row_id
X_test = test_df[ids].drop(['row_id', 'time', 'target', 'mean_prev_weeks', 'predict'], axis=1)
y_test = test_df.loc[ids, 'target']

In [None]:
sns.lineplot(test_df.loc[ids], x='time', y='target', label='target')
sns.lineplot(test_df.loc[ids], x='time', y='mean_prev_weeks', label='mean_prev_weeks')

plt.show()

In [None]:
model = Ridge()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
test_df.loc[ids, 'predict'] = y_pred

In [None]:
sns.lineplot(test_df.loc[ids], x='time', y='target', label='target')
sns.lineplot(test_df.loc[ids], x='time', y='predict', label='predict')
sns.lineplot(test_df.loc[ids], x='time', y='mean_prev_weeks', label='mean_prev_weeks')

plt.show()

In [None]:
model.coef_