In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import hyperdash as hd

%load_ext autoreload

In [None]:
# Attribute columns (cf. attr_cols): 'store_nbr', 'n_city', 'n_state', 'n_type', 'cluster', 'item_nbr', 'n_family', 'class', 'perishable'

In [2]:
# Read the scaling statistics file. The first CSV column becomes the index; after
# transposing, the column labelled 0 (i.e. the original row labelled 0) is unpacked
# into exactly two values.
# NOTE(review): presumably the mean and std used to scale unit sales — confirm
# against the script that writes data/mean_std.csv.
unit_mean, unit_std = pd.read_csv('data/mean_std.csv', index_col=0).T[0]

In [3]:
# Store attributes: every listed column is read as an unsigned byte.
df_stores = pd.read_csv(
    'data/num_stores.csv.gz',
    dtype=dict.fromkeys(
        ['store_nbr', 'n_city', 'n_state', 'n_type', 'cluster'],
        np.uint8,
    ),
)

# Item attributes, each with its own compact integer dtype.
df_items = pd.read_csv(
    'data/num_items.csv.gz',
    dtype=dict([
        ('item_nbr', np.int32),
        ('n_family', np.uint8),
        ('class', np.int16),
        ('perishable', np.int8),
    ]),
)

# Shift every categorical code so that its smallest value becomes 0.
for stores_col in ('n_city', 'n_state', 'n_type', 'cluster'):
    df_stores[stores_col] = df_stores[stores_col].sub(df_stores[stores_col].min())

for items_col in ('n_family', 'class', 'perishable'):
    df_items[items_col] = df_items[items_col].sub(df_items[items_col].min())

In [4]:
%%time
# Load the full time-series table. The first column is parsed as a date and every
# other column gets an explicit compact dtype to keep memory usage down.
df = pd.read_csv(
    'data/ts.csv.gz',
    parse_dates=[0],
    #nrows=1000000,
    dtype=dict(
        [
            ('item_nbr', np.int32),
            ('store_nbr', np.int8),
            ('unit_sales_scaled', np.float32),
        ]
        # Binary calendar / promotion flags.
        + [(flag, np.int8) for flag in ('onpromotion', 'holiday', 'weekend', 'waged_day')]
        # Day-of-week indicator columns dow_0 .. dow_6.
        + [('dow_{}'.format(day), np.int8) for day in range(7)]
    ),
)
df.head()

CPU times: user 6min 38s, sys: 12.2 s, total: 6min 51s
Wall time: 6min 25s


In [8]:
# Show column dtypes and the memory footprint of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212277456 entries, 0 to 212277455
Data columns (total 15 columns):
date                 datetime64[ns]
store_nbr            int8
item_nbr             int32
unit_sales_scaled    float32
onpromotion          int8
holiday              int8
weekend              int8
waged_day            int8
dow_0                int8
dow_1                int8
dow_2                int8
dow_3                int8
dow_4                int8
dow_5                int8
dow_6                int8
dtypes: datetime64[ns](1), float32(1), int32(1), int8(12)
memory usage: 5.5 GB


In [9]:
# Keep only rows dated strictly after 2015-06-01 (the comparison is exclusive),
# then run the garbage collector and re-inspect the frame.
# Note: `df` is rebound in place, so re-running this cell operates on the
# already-filtered frame.
df = df.loc[df['date'] > '2015-06-01']
gc.collect()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179149968 entries, 152 to 212277455
Data columns (total 15 columns):
date                 datetime64[ns]
store_nbr            int8
item_nbr             int32
unit_sales_scaled    float32
onpromotion          int8
holiday              int8
weekend              int8
waged_day            int8
dow_0                int8
dow_1                int8
dow_2                int8
dow_3                int8
dow_4                int8
dow_5                int8
dow_6                int8
dtypes: datetime64[ns](1), float32(1), int32(1), int8(12)
memory usage: 6.0 GB


In [10]:
%%time
# Per-day columns: everything from the 4th column of `df` onward (the first three
# are date, store_nbr and item_nbr according to the df.info() output).
ts_columns = df.columns[3:]
      
# Per-(store, item) attribute columns that do not vary by date; they come from
# df_stores / df_items via the merges below.
attr_cols = [
    'store_nbr', 'n_city', 'n_state', 'n_type', 'cluster',
    'item_nbr', 'n_family', 'class',
    'perishable'
]

# One row per (store_nbr, item_nbr); one column per (ts column, date) pair.
# No aggfunc is passed, so pivot_table's default aggregation is applied.
df_pivot = df.pivot_table(
    index=['store_nbr', 'item_nbr'],
    columns=['date'],
    values=ts_columns
).reset_index()

# Attach item attributes (default inner join on item_nbr).
df_pivot = df_pivot.merge(df_items, on='item_nbr')
# Re-create a plain 'store_nbr' column from the ('store_nbr', '') column so the
# store merge below can join on it.
# NOTE(review): relies on how pandas labels columns after merging a two-level
# frame with a single-level one — confirm on the pandas version in use.
df_pivot['store_nbr'] = df_pivot[('store_nbr', '')]
# Attach store attributes (default inner join on store_nbr).
df_pivot = df_pivot.merge(df_stores, on='store_nbr')



CPU times: user 13min 8s, sys: 2min 44s, total: 15min 53s
Wall time: 12min 25s


In [11]:
# Show the shape, dtypes and memory footprint of the pivoted frame.
df_pivot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 217944 entries, 0 to 217943
Columns: 9875 entries, item_nbr to cluster
dtypes: float32(822), int16(1), int64(4), int8(9043), uint8(5)
memory usage: 2.5 GB


In [17]:
def get_date_cols(date, history=20, skip=0, predict_days=1, ts_columns=ts_columns,
                  predict_day=None):
    """Build the ``(column, date)`` keys selecting one example anchored at ``date``.

    Parameters
    ----------
    date : str, pd.Timestamp or datetime-like
        Last day of the input window. Anything that is not already a
        ``pd.Timestamp`` is converted with ``pd.to_datetime``.
    history : int
        Number of consecutive days in the input window, ending on ``date``
        (inclusive).
    skip : int
        Unused; kept so existing calls keep working.
    predict_days : int
        Legacy name of the forecast offset. Only used when ``predict_day`` is
        not supplied.
    ts_columns : sequence of str
        Names of the per-day columns. The first entry is left out of
        ``y_day_attr_cols``.
    predict_day : int, optional
        Number of days after ``date`` of the single target day. This is the
        keyword every caller in the notebook uses; the original signature only
        declared ``predict_days`` while the body read ``predict_day``, so calls
        failed with ``TypeError``/``NameError``.

    Returns
    -------
    X_cols : list of (str, pd.Timestamp)
        ``history * len(ts_columns)`` keys ordered day-major, then by column.
    y_cols : list of (str, pd.Timestamp)
        Single key ``('unit_sales_scaled', target_day)``.
    y_day_attr_cols : list of (str, pd.Timestamp)
        Keys for ``ts_columns[1:]`` on the target day.
    """
    if predict_day is None:
        predict_day = predict_days

    if not isinstance(date, pd.Timestamp):
        date = pd.to_datetime(date)

    X_start_date = date - pd.Timedelta('{} days'.format(history - 1))
    y_start_date = date + pd.Timedelta('{} days'.format(predict_day))

    X_cols = [
        (elem, d)
        for d in pd.date_range(X_start_date, periods=history, freq='D')
        for elem in ts_columns
    ]

    # Exactly one target day is produced.
    target_days = pd.date_range(y_start_date, periods=1, freq='D')
    y_cols = [('unit_sales_scaled', d) for d in target_days]
    y_day_attr_cols = [(elem, d) for d in target_days for elem in ts_columns[1:]]

    return X_cols, y_cols, y_day_attr_cols



# Sanity check: with the default history of 20 days, the X selection should have
# 20 * len(ts_columns) columns.
X_cols, y_cols, y_day_attr_cols = get_date_cols('2017-08-15', predict_day=3)
    
df_pivot.loc[:, X_cols].shape

(217944, 240)

In [17]:
def get_random_train_test(df_pivot,
        date, window=3, size=2000, history=1, predict_day=1, epochs=2, freq=1,
        shuffle_dates=True, shuffle_indexes=True, attr_cols=attr_cols, ts_columns=ts_columns):
    """Generate training examples one row at a time.

    Rows of ``df_pivot`` are split into chunks of at most ``size`` rows. Each
    chunk is paired with every anchor date between ``date - window*freq`` days
    and ``date`` (inclusive, stepping ``freq`` days), and every pair is repeated
    ``epochs`` times. For each pair the feature arrays are built once and then
    yielded row by row.

    Parameters
    ----------
    df_pivot : pd.DataFrame
        Pivoted frame whose columns include the ``(column, date)`` keys built
        by ``get_date_cols`` and the plain ``attr_cols`` columns.
    date : str or datetime-like
        Last anchor date.
    window : int
        Number of ``freq``-day steps to go back from ``date``.
    size : int
        Maximum number of rows per chunk.
    history, predict_day : int
        Forwarded to ``get_date_cols``.
    epochs : int
        How many times each (chunk, date) pair is emitted.
    freq : int
        Step between anchor dates, in days.
    shuffle_dates : bool
        If true, anchor dates are permuted and the full list of (chunk, date)
        pairs is shuffled; otherwise pairs are emitted date by date.
    shuffle_indexes : bool
        If true, rows are assigned to chunks in random order; otherwise in
        positional order.
    attr_cols, ts_columns : sequence of str
        Attribute columns appended to each example / per-day column names.

    Yields
    ------
    tuple
        ``(X, y_day_attr, y, *attrs)`` for a single row, where ``X`` has shape
        ``(history, len(ts_columns))`` and ``attrs`` holds one value per entry
        of ``attr_cols``.

    Notes
    -----
    Changes from the original cell: a missing comma after ``freq=1`` made the
    definition a syntax error; ``'D'.format(freq)`` ignored ``freq``; the pair
    list is now shuffled through an index permutation because
    ``np.random.permutation`` cannot convert a ragged list of (array, date)
    pairs on current NumPy; and flag combinations that used to yield nothing
    are now supported.
    """
    num_items = df_pivot.shape[0]

    end_date = pd.to_datetime(date)
    start_window = end_date - pd.Timedelta('{} days'.format(window * freq))
    # Step by `freq` days so the window spans exactly window + 1 anchor dates.
    dates = pd.date_range(start_window, end_date, freq='{}D'.format(freq))

    # Random draws happen in the same order as before: dates first, then rows.
    anchor_dates = np.random.permutation(dates) if shuffle_dates else dates
    if shuffle_indexes:
        row_order = np.random.permutation(num_items)
    else:
        row_order = np.arange(num_items)

    # Stepping by `size` never produces an empty trailing chunk.
    chunks = [row_order[start:start + size] for start in range(0, num_items, size)]

    if shuffle_dates:
        patches = [
            (chunk, anchor)
            for _ in range(epochs)
            for chunk in chunks
            for anchor in anchor_dates
        ]
        # Shuffle by position; the pairs themselves are never turned into an array.
        patches = [patches[k] for k in np.random.permutation(len(patches))]
    else:
        patches = [
            (chunk, anchor)
            for anchor in anchor_dates
            for _ in range(epochs)
            for chunk in chunks
        ]

    n_ts = len(ts_columns)
    for indexes, anchor in patches:
        df_pivot_slice = df_pivot.iloc[indexes]
        X_cols, y_cols, y_day_attr_cols = get_date_cols(
            anchor, history=history, predict_day=predict_day, ts_columns=ts_columns
        )

        # get_date_cols orders keys day-major, so this reshape gives (rows, days, columns).
        X = np.array(
            df_pivot_slice.loc[:, X_cols]
        ).reshape([-1, history, n_ts])

        y_day_attr = np.array(
            df_pivot_slice.loc[:, y_day_attr_cols]
        )

        y = np.array(df_pivot_slice.loc[:, y_cols])

        features = [X, y_day_attr, y]
        features.extend(
            np.array(df_pivot_slice.loc[:, feature]) for feature in attr_cols
        )

        for i in range(len(indexes)):
            yield tuple(elem[i] for elem in features)

# Smoke test: window=0 gives a single anchor date; pull one example from the
# generator and print the shapes of its first four elements
# (X, y_day_attr, y, first attribute).
tmp = get_random_train_test(df_pivot, '2017-07-15', window=0, history=90, predict_day=5)
tmp1 = next(tmp)
print(tmp1[0].shape, tmp1[1].shape, tmp1[2].shape, tmp1[3].shape)

(90, 12) (11,) (1,) ()


In [147]:
# Advance the generator by 2000 examples, discarding them.
# NOTE(review): depends on `tmp` still being the training generator; a later cell
# rebinds `tmp` to the validation list.
for i in range(2000):
    next(tmp)

In [148]:
# Display the next example produced by the generator.
next(tmp)

(array([[-0.71689729,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.71689729,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.71689729,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [-0.71689729,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.71689729,  0.        ,  0.        , ...,  1.        ,
          0.        ,  0.        ],
        [-0.71689729,  0.        ,  0.        , ...,  0.        ,
          1.        ,  0.        ]]), array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]], dtype=int8), array([-0.71689729, -0.71689729, -0.71689729, -0.71689729, -0.71689729]), 36, 11, 6, 4, 9, 1960806, 30, 106)

In [20]:
def get_validation(df_pivot,
        date, history=1, predict_day=1, attr_cols=attr_cols, ts_columns=ts_columns):
    """Assemble the feature arrays for every row of ``df_pivot`` at one anchor date.

    The column keys come from ``get_date_cols(date, history, predict_day,
    ts_columns)``.

    Returns a list ``[X, y_day_attr, y, *attrs]`` where ``X`` is reshaped to
    ``(rows, history, len(ts_columns))``, ``y_day_attr`` and ``y`` are the
    selections for the target day, and ``attrs`` holds one array per entry of
    ``attr_cols``.
    """
    X_cols, y_cols, y_day_attr_cols = get_date_cols(
        date, history=history, predict_day=predict_day, ts_columns=ts_columns
    )

    def _as_array(columns):
        # Select the given column(s) for all rows and convert to a NumPy array.
        return np.array(df_pivot.loc[:, columns])

    history_block = _as_array(X_cols).reshape([-1, history, len(ts_columns)])

    features = [history_block, _as_array(y_day_attr_cols), _as_array(y_cols)]
    features.extend(_as_array(name) for name in attr_cols)
    return features

# Smoke test: build the validation arrays for one anchor date and print the
# shapes of the first four elements (X, y_day_attr, y, first attribute).
# Note: this rebinds `tmp`, which previously held the training generator.
tmp = get_validation(df_pivot, '2017-07-15', history=90, predict_day=5)
print(tmp[0].shape, tmp[1].shape, tmp[2].shape, tmp[3].shape)

(210654, 90, 12) (210654, 11) (210654, 1) (210654,)


In [59]:
%autoreload 2

# --- Run configuration -------------------------------------------------------
# Length of the input window (days) and forecast offset (days after the anchor).
history = 200
day_to_predict = 1

# Last anchor date used for training; anchors span `window` days back from it.
last_day_train = '2017-07-15'
window=21
epochs = 100
# Anchor date handed to get_validation, which forwards predict_day to
# get_date_cols; the validation target is therefore taken `day_to_predict` days
# after this date.
validation_day = pd.to_datetime(last_day_train) + pd.Timedelta('{} days'.format(day_to_predict))
batch_size = 2000
# NOTE(review): magic constant passed to m.train as sum_W; its meaning is not
# visible in this notebook — document the origin of 3574368 and of the /16.
sum_W = 3574368.0/16

print(validation_day)

# Generator of single training examples; it is handed to the model's build_graph.
batch_gen = get_random_train_test(
    df_pivot,
    last_day_train,
    window=window,
    history=history,
    size=batch_size,
    predict_day=day_to_predict,
    epochs=epochs
)

# Validation arrays for every row of df_pivot at the validation anchor date.
val_set = get_validation(df_pivot, validation_day, history=history, predict_day=day_to_predict)

# Project-local module; imported here so `%autoreload 2` above picks up edits.
from model import RNNModel

m = RNNModel(
    history=history,
    #n_days_predict=time_to_predict,
    clip_gradients=10.,
    starter_learning_rate=0.001,
    #starter_learning_rate=0.0005,
    n_layers_rnn=1,
    rnn_size_encoder=100,
    rnn_size_decoder=100,
)
# Progress marker printed before graph construction.
print(1)
m.build_graph(batch_gen)


# Close the hyperdash experiment left over from a previous run of this cell, if
# any; on the first run `hd_exp` does not exist yet, hence the NameError guard.
try:
    hd_exp.end()
except NameError:
    pass

hd_exp = hd.Experiment('RNN fav 2')

# unit_std is passed as `coef`.
# NOTE(review): presumably used to un-scale the loss — confirm in model.py.
m.train(val_set, coef=unit_std, sum_W=sum_W,
        report_every=100, validate_every=1000,
        hd_exp=hd_exp)

2017-07-16 00:00:00
1
g_step: 100 loss std/mean: 17.086471557617188 7.704306125640869
| Loss std:  17.086472 |
| Loss mean:   7.704306 |
g_step: 200 loss std/mean: 0.5421836376190186 4.720633506774902
| Loss std:   0.542184 |
| Loss mean:   4.720634 |
g_step: 300 loss std/mean: 0.4817917048931122 4.4661478996276855
| Loss std:   0.481792 |
| Loss mean:   4.466148 |
g_step: 400 loss std/mean: 0.5824440121650696 4.5725016593933105
| Loss std:   0.582444 |
| Loss mean:   4.572502 |
g_step: 500 loss std/mean: 0.4691454768180847 4.491879463195801
| Loss std:   0.469145 |
| Loss mean:   4.491879 |
g_step: 600 loss std/mean: 0.5033274292945862 4.711548328399658
| Loss std:   0.503327 |
| Loss mean:   4.711548 |
g_step: 700 loss std/mean: 0.4511660933494568 4.478338241577148
| Loss std:   0.451166 |
| Loss mean:   4.478338 |
g_step: 800 loss std/mean: 0.5127571821212769 4.584293365478516
| Loss std:   0.512757 |
| Loss mean:   4.584293 |
g_step: 900 loss std/mean: 0.5348780751228333 4.54532432