In [7]:
import tensorflow as tf
import numpy as np
import pandas as pd
import gc
from sklearn.metrics import mean_squared_error

In [2]:
unit_mean, unit_std = pd.read_csv('../RNN/data/mean_std.csv', index_col=0).T[0]

In [3]:
train_csv = pd.read_csv(
    '../input/train.csv',
    skiprows=122100000,
    dtype={
        'id': np.int64,
        'store_nbr': np.int64,
        'item_nbr': np.int64,
        'onpromotion': np.bool
    },
    converters={
        'unit_sales': lambda x: float(x) if float(x) > 0 else 0,
    },
    parse_dates=[1]
)
train_csv.columns = ['id', 'date', 'store_nbr', 'item_nbr', 'unit_sales_real', 'onpromotion']
train_csv['unit_sales_real'] = train_csv['unit_sales_real'].apply(lambda x : x if x > 0 else 0)

In [4]:
df_test = pd.read_csv(
    '../input/test.csv',
    parse_dates=[1],
    dtype={
        'item_nbr': np.int32,
        'store_nbr': np.int8,
        'unit_sales': np.float32,
    },
    converters={
        'onpromotion': lambda x: 1 if x == 'True' else 0
    }
)

df_stores = pd.read_csv(
    '../RNN/data/num_stores.csv.gz',
     dtype={
         'store_nbr': np.uint8,
         'n_city': np.uint32,
         'n_state': np.uint32,
         'n_type': np.uint32,
         'cluster': np.uint32
     }

)
df_items = pd.read_csv(
    '../RNN/data/num_items.csv.gz',
    dtype={
        'item_nbr': np.int32,
        'n_family': np.int32,
        'class': np.int32,
        'perishable': np.int8,
    }
)

df_items['weight'] = df_items['perishable'] * 0.25 + 1

for stores_col in ['n_city', 'n_state', 'n_type', 'cluster']:
    df_stores[stores_col] = df_stores[stores_col] - df_stores[stores_col].min()
    
for items_col in ['n_family', 'class', 'perishable']:
    df_items[items_col] = df_items[items_col] - df_items[items_col].min()

In [5]:
%%time
df = pd.read_csv(
    '../RNN/data/ts.csv.gz',
    parse_dates=[0],
    #nrows=1000000,
    dtype={
        'item_nbr': np.int32,
        'store_nbr': np.int8,
        'unit_sales': np.float32,
        'onpromotion': np.int8,
        'holiday': np.int8,
        'weekend': np.int8,
        'waged_day': np.int8,
        'dow_0': np.int8,
        'dow_1': np.int8,
        'dow_2': np.int8,
        'dow_3': np.int8,
        'dow_4': np.int8,
        'dow_5': np.int8,
        'dow_6': np.int8,
    }
)
df.head()

CPU times: user 6min 49s, sys: 11.6 s, total: 7min
Wall time: 6min 23s


In [8]:
df = df[
    df['date'] > '2016-01-01'
]
gc.collect()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132509952 entries, 366 to 212277455
Data columns (total 15 columns):
date                 datetime64[ns]
store_nbr            int8
item_nbr             int32
unit_sales_scaled    float64
onpromotion          int8
holiday              int8
weekend              int8
waged_day            int8
dow_0                int8
dow_1                int8
dow_2                int8
dow_3                int8
dow_4                int8
dow_5                int8
dow_6                int8
dtypes: datetime64[ns](1), float64(1), int32(1), int8(12)
memory usage: 4.9 GB


In [23]:
df[
    (df['date'] == '2017-08-16') &
    (df['store_nbr'] == 54) &
    #(df['item_nbr'] == 108797)
    (df['item_nbr'] == 2113343)
]

Unnamed: 0,date,store_nbr,item_nbr,unit_sales_scaled,onpromotion,holiday,weekend,waged_day,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
169590656,2017-08-16,54,2113343,-0.716897,0,0,0,0,0,0,1,0,0,0,0


In [9]:
%%time
ts_columns = df.columns[3:]
      
attr_cols = [
    'store_nbr', 'n_city', 'n_state', 'n_type', 'cluster',
    'item_nbr', 'n_family', 'class',
    'weight',
    #'perishable'
]

df_pivot = df.pivot_table(
    index=['store_nbr', 'item_nbr'],
    columns=['date'],
    values=ts_columns
).reset_index()

df_pivot = df_pivot.merge(df_items, on='item_nbr')
df_pivot['store_nbr'] = df_pivot[('store_nbr', '')]
df_pivot = df_pivot.merge(df_stores, on='store_nbr')



CPU times: user 10min 6s, sys: 2min 11s, total: 12min 17s
Wall time: 9min 35s


In [10]:
def get_date_cols(date, history=20, predict_days=16, ts_columns=ts_columns, skip=0):
                  #date, days=1, attr_cols=attr_columns_wo_means, ts_cols=ts_columns, attr=True):
    
    if type(date) != pd.Timestamp:
        date = pd.to_datetime(date)
        
    X_start_date = date - pd.Timedelta('{} days'.format(history-1))
    #X_end_date = date
    y_start_date = date + pd.Timedelta('{} days'.format(skip+1))
    #y_end_date = date + pd.Timedelta('{} days'.format(predict_days))

    X_cols, y_cols, y_day_attr_cols = [], [], []
    
    for d in pd.date_range(X_start_date, periods=history, freq='D'):
        for elem in ts_columns:
            X_cols.append((elem, d))
            
    for d in pd.date_range(y_start_date, periods=predict_days, freq='D'):
        y_cols.append(('unit_sales_scaled', d))
        for elem in ts_columns[1:]:
            y_day_attr_cols.append((elem, d))
            
    return X_cols, y_cols, y_day_attr_cols



X_cols, y_cols, y_day_attr_cols = get_date_cols('2017-08-15', predict_days=16)
    
df_pivot.head().loc[:, y_cols]

Unnamed: 0,"(unit_sales_scaled, 2017-08-16 00:00:00)","(unit_sales_scaled, 2017-08-17 00:00:00)","(unit_sales_scaled, 2017-08-18 00:00:00)","(unit_sales_scaled, 2017-08-19 00:00:00)","(unit_sales_scaled, 2017-08-20 00:00:00)","(unit_sales_scaled, 2017-08-21 00:00:00)","(unit_sales_scaled, 2017-08-22 00:00:00)","(unit_sales_scaled, 2017-08-23 00:00:00)","(unit_sales_scaled, 2017-08-24 00:00:00)","(unit_sales_scaled, 2017-08-25 00:00:00)","(unit_sales_scaled, 2017-08-26 00:00:00)","(unit_sales_scaled, 2017-08-27 00:00:00)","(unit_sales_scaled, 2017-08-28 00:00:00)","(unit_sales_scaled, 2017-08-29 00:00:00)","(unit_sales_scaled, 2017-08-30 00:00:00)","(unit_sales_scaled, 2017-08-31 00:00:00)"
0,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471
1,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471
2,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471
3,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471
4,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471,-0.69471


In [11]:
def get_validation(df_pivot,
        date, history=1, predict_days=16, attr_cols=attr_cols, ts_columns=ts_columns, skip=0):
    
    X_cols, y_cols, y_day_attr_cols = get_date_cols(
        date, history=history, predict_days=predict_days, ts_columns=ts_columns, skip=skip
    )

    X = np.array(
        df_pivot.loc[:, X_cols]
    ).reshape([-1, history, len(ts_columns)])
    
    y_day_attr = np.array(
        df_pivot.loc[:, y_day_attr_cols]
    ).reshape([-1, predict_days, len(ts_columns)-1])

    y = np.array(df_pivot.loc[:, y_cols])
    features = [X, y_day_attr, y]
    for feature in attr_cols:
        features.append(
            np.array(df_pivot.loc[:, feature])
        )

    return features

tmp = get_validation(df_pivot, '2017-08-15', history=90, predict_days=5)
print(len(tmp))
print(tmp[0].shape, tmp[1].shape, tmp[2].shape, tmp[3].shape)

12
(217944, 90, 12) (217944, 5, 11) (217944, 5) (217944,)


In [99]:
%%time

def compute_predictions(val_date, ckpt_dir, step=None):
    
    if step is None:
        checkpoint = tf.train.latest_checkpoint(ckpt_dir)
    else:
        checkpoint = ckpt_dir + '/model.ckpt-' + str(step)

    test_set = get_validation(df_pivot, val_date, history=400, predict_days=16)
    
    (                                                           
        b_X, b_y_day_attr, b_y,
        b_store_nbr, b_n_city, b_n_state, b_n_type,
        b_cluster, b_item_nbr, b_n_family, b_class,
        b_weights
    ) = test_set

    graph_path = checkpoint + '.meta'

    loaded_graph = tf.Graph()
    with loaded_graph.as_default():
        #with tf.Session(graph=loaded_graph) as sess:
        sess = tf.Session(graph=loaded_graph)
        saver = tf.train.import_meta_graph(graph_path)

        #saver.restore(sess, tf.train.latest_checkpoint(checkpoint))
        saver.restore(sess, checkpoint)


        t_X = loaded_graph.get_tensor_by_name('inputs/X:0')
        t_y_day_attr = loaded_graph.get_tensor_by_name('inputs/y_day_attr:0')
        t_y = loaded_graph.get_tensor_by_name('inputs/y:0')
        t_feat_store_nbr = loaded_graph.get_tensor_by_name('inputs/feat_store_nbr:0')
        t_feat_n_city = loaded_graph.get_tensor_by_name('inputs/feat_n_city:0')
        t_feat_n_state = loaded_graph.get_tensor_by_name('inputs/feat_n_state:0')
        t_feat_n_type = loaded_graph.get_tensor_by_name('inputs/feat_n_type:0')
        t_feat_cluster = loaded_graph.get_tensor_by_name('inputs/feat_cluster:0')
        t_feat_item_nbr = loaded_graph.get_tensor_by_name('inputs/feat_item_nbr:0')
        t_feat_n_family = loaded_graph.get_tensor_by_name('inputs/feat_n_family:0')
        t_feat_class = loaded_graph.get_tensor_by_name('inputs/feat_class:0')
        t_weights = loaded_graph.get_tensor_by_name('inputs/weights:0')
        t_encoder_dropout_kp = loaded_graph.get_tensor_by_name('enc_dropout:0')
        t_decoder_dropout_kp = loaded_graph.get_tensor_by_name('dec_dropout:0')
        t_conv_dropout_kp = loaded_graph.get_tensor_by_name('conv_dropout:0')
        t_predictions = loaded_graph.get_tensor_by_name('predictions:0')

        dataset = tf.data.Dataset.from_tensor_slices(
            (
                t_X,
                t_y_day_attr,
                t_y,
                t_feat_store_nbr,
                t_feat_n_city,
                t_feat_n_state,
                t_feat_n_type,
                t_feat_cluster,
                t_feat_item_nbr,
                t_feat_n_family,
                t_feat_class,
                t_weights
            )
        )
        dataset = dataset.prefetch(512*420)
        dataset = dataset.batch(512*20)

        iterator = tf.data.Iterator(
            loaded_graph.get_tensor_by_name('Iterator:0'),
            loaded_graph.get_operation_by_name('Iterator'),
            dataset.output_types,
            dataset.output_shapes
        )

        next_element = iterator.get_next()

        iterator = iterator.make_initializer(dataset)


        feed_dict = {
            t_X: b_X,
            t_y_day_attr: b_y_day_attr,
            t_y: b_y,
            t_feat_store_nbr: b_store_nbr,
            t_feat_n_city: b_n_city,
            t_feat_n_state: b_n_state,
            t_feat_n_type: b_n_type,
            t_feat_cluster: b_cluster,
            t_feat_item_nbr: b_item_nbr,
            t_feat_n_family: b_n_family,
            t_feat_class: b_class,
            t_weights: b_weights,
            t_encoder_dropout_kp: 1.,
            t_decoder_dropout_kp: 1.,
            t_conv_dropout_kp: 1.,
        }
        #sess.run(iterator)
        sess.run(iterator, feed_dict=feed_dict)
        #print(dir(tmp))
        predictions = []
        real_unit_sales = b_y


        while True:

            try:
                predictions.append(
                    sess.run(
                        t_predictions,
                        feed_dict=feed_dict
                    )
                )
            except tf.errors.OutOfRangeError:
                break

            except KeyboardInterrupt:
                print("Ctrl+C")
                break

        predicted_val = np.vstack(predictions)

        NWRMSLE = np.sqrt(mean_squared_error(
            (unit_std*predicted_val.reshape(-1,1) + unit_mean),
            (unit_std*real_unit_sales.reshape(-1,1) + unit_mean),
            np.tile((np.expand_dims(b_weights+.25+1, -1)), 16).reshape(-1,1)
        ))
        print(NWRMSLE)

        NWRMSLE_5 = np.sqrt(mean_squared_error(
            (unit_std*predicted_val[:,:5].reshape(-1,1) + unit_mean),
            (unit_std*real_unit_sales[:,:5].reshape(-1,1) + unit_mean),
            np.tile((np.expand_dims(b_weights+.25+1, -1)), 5).reshape(-1,1)
        ))

        print(NWRMSLE_5)
        #tmp = sess.run(t_predictions, feed_dict=feed_dict)
        
    return predicted_val

    
val_date = '2017-07-30'
val_date = '2017-08-15'

ckpt_dir = '../RNN/log/2018-01-03_20-05-17'
predicted_val = compute_predictions(val_date, ckpt_dir)

INFO:tensorflow:Restoring parameters from ../RNN/log/2018-01-03_20-05-17/model.ckpt-48000
1.19380524622
1.26183324342
CPU times: user 4min 28s, sys: 1min 7s, total: 5min 35s
Wall time: 4min 47s


```
 8000 - 0.525397366651 - 0.515445371994
16000 - 0.518722911751 - 0.509224939791
44000 - 0.511786064374 - 0.503474613307

```

In [103]:
date_range = pd.date_range('2017-07-31', '2017-08-15')
date_range = pd.date_range('2017-08-16', '2017-08-31')

df_prognosis = pd.DataFrame(
    predicted_val,
    columns=date_range
)

df_prognosis['item_nbr'] = df_pivot['item_nbr']

df_prognosis['store_nbr'] = df_pivot['store_nbr']

df_prognosis = df_prognosis.melt(
    id_vars=['item_nbr', 'store_nbr'],
    value_vars=date_range
)

df_prognosis = df_prognosis.merge(
    df_items[['item_nbr', 'perishable']],
    on='item_nbr'
)

df_prognosis.rename(
    columns={'variable':'date', 'value': 'unit_sales_scaled'},
    inplace=True
)

#df_prognosis['perishable'] = df_pivot['perishable']
#df_prognosis['W'] = df_prognosis['perishable'].apply(lambda x: 1.25 if x else 1)

df_prognosis['unit_sales'] = np.expm1(
    (df_prognosis['unit_sales_scaled'] * unit_std) + unit_mean
)

#df_prognosis['unit_sales'] = df_prognosis['unit_sales'].apply(lambda x: x if x>=0 else 0)

out = df_test.merge(
    df_prognosis,
    on=['date', 'item_nbr', 'store_nbr'],
    how='left'
)[['id', 'unit_sales']]
out.fillna(0, inplace=True)
#out.to_csv('14-03-06.csv.gz', compression='gzip', index=False)
print('Done')

Done


In [None]:
#! kg submit 14-03-02.csv.gz -m 'val 5d 0.511119'

In [91]:
tmp = df_prognosis.merge(
    train_csv[['date', 'store_nbr', 'item_nbr','unit_sales_real']],
    on=['date', 'item_nbr', 'store_nbr'],
    how='left'
)
tmp = tmp.fillna(0)

In [92]:
tmp.shape, df_prognosis.shape

((3487104, 7), (3487104, 6))

In [97]:
tmp.head()

Unnamed: 0,item_nbr,store_nbr,date,unit_sales_scaled,perishable,unit_sales,unit_sales_real
0,96995,1,2017-07-31,-0.588287,0,0.114318,2.0
1,96995,2,2017-07-31,-0.503795,0,0.214313,0.0
2,96995,3,2017-07-31,-0.427555,0,0.312223,0.0
3,96995,4,2017-07-31,-0.578194,0,0.125815,0.0
4,96995,5,2017-07-31,-0.416889,0,0.326536,0.0


In [98]:
np.sqrt(mean_squared_error(
    np.log1p(tmp['unit_sales']),
    np.log1p(tmp['unit_sales_real']),
    tmp['perishable']*.25 + 1
))

0.51119741740289248

In [72]:
np.sqrt(
    (
        np.sum(
            np.square(tmp['W']) * np.square(
                 np.log1p(tmp['unit_sales']) - np.log1p(tmp['unit_sales_real'])
            )
        )
    )/(    
        tmp['W'].sum() 
    )
)

KeyError: 'W'

In [199]:
tmp1 = tmp[
    tmp['date'] < '2017-08-05'
]

In [241]:
np.sqrt(
    (
        np.sum(
            tmp1['W'] * np.square(
                 np.log1p(tmp1['unit_sales']) - np.log1p(tmp1['unit_sales_real'])
            )
        )
    )/(    
        tmp1['W'].sum() 
    )
)

0.52558896751809192

In [103]:
np.sqrt(1.25)

1.2247448713915889

In [113]:
from sklearn import metrics

In [123]:
np.sqrt(
    metrics.mean_squared_error(
        np.log1p(tmp['unit_sales']),
        np.log1p(tmp['unit_sales_real']),
        tmp['W']
    )*tmp.shape[0] / tmp['W'].sum()
)

0.50259588274380929

In [124]:
np.sum(
    tmp1['W'] * np.square(
         np.log1p(tmp1['unit_sales']) - np.log1p(tmp1['unit_sales_real'])
    )
)

290669.19014115859

In [229]:
y     = np.log1p(tmp1['unit_sales_real'])
y_hat = np.log1p(tmp1['unit_sales'])
w     = tmp1['W']

np.sqrt(metrics.mean_squared_error(y, y_hat, w))

0.50873586678231009

In [143]:
metrics.mean_squared_error(
    np.log1p(tmp['unit_sales']),
    np.log1p(tmp['unit_sales_real']),
    tmp['W']
)*tmp['W'].sum()

957517.42717758799

In [144]:
y     = np.array([1,   2,   3,   4,   5  ])
y_hat = np.array([1.1, 2.1, 3.1, 4.1, 5.1])
w     = np.array([.5,   1,   1,   2,   2  ])

metrics.mean_squared_error(y, y_hat, w)*w.sum()

0.064999999999999752

In [145]:
(np.square(y - y_hat)*w).sum()

0.064999999999999752