In [10]:
import pandas as pd
import numpy as np
import pickle
import sklearn
import gc

In [2]:
df = pd.read_csv(
    '../LSTM/inputs/df_train_reduced_cleared.csv.gz',
    parse_dates=[0],
    usecols=list(range(1,9)),
    dtype={
        'item_nbr': np.int32,
        'store_nbr': np.int8,
        'unit_sales': np.float32,
        'onpromotion': np.int8,
        'holiday': np.int8,
        'dow': np.int8,
        'weekend': np.int8
    }
)

In [3]:
df.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,holiday,dow,weekend
0,2015-06-01,1,96995,0.0,0,0,0,0
1,2015-06-02,1,96995,0.0,0,0,1,0
2,2015-06-03,1,96995,0.0,0,0,2,0
3,2015-06-04,1,96995,0.0,0,0,3,0
4,2015-06-05,1,96995,0.0,0,0,4,0


In [5]:
df_items = pd.read_csv(
    '../input/items.csv',
    dtype={
        'item_nbr': np.int32,
        'class': np.int8,
        'perishable': np.int8,
    }
)
df_items.head()

Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,69,0
1,99197,GROCERY I,43,0
2,103501,CLEANING,-64,0
3,103520,GROCERY I,4,0
4,103665,BREAD/BAKERY,-104,1


In [26]:
df['waged_day'] = df['date'].apply(lambda x: 1 if x.day == 15 or x.day == x.daysinmonth else 0)

In [7]:
df = df.merge(df_items[['item_nbr', 'perishable']])

df['W'] = df['perishable'].apply(lambda x: 1.25 if x else 1)
df['unit_sales_scaled'] = np.log1p(df['unit_sales']) * np.sqrt(df['W'])

units_mean = df['unit_sales_scaled'].mean()
units_std = df['unit_sales_scaled'].std()
#units_min = df['unit_sales_scaled'].min()
#units_max = df['unit_sales_scaled'].max()
#df['unit_sales_scaled'] = (df['unit_sales_scaled'] - units_min)/(units_max-units_min)
df['unit_sales_scaled'] = (df['unit_sales_scaled'] - units_mean)/units_std

df.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,holiday,dow,weekend,perishable,W,unit_sales_scaled
0,2015-06-01,1,96995,0.0,0,0,0,0,0,1.0,-0.716897
1,2015-06-02,1,96995,0.0,0,0,1,0,0,1.0,-0.716897
2,2015-06-03,1,96995,0.0,0,0,2,0,0,1.0,-0.716897
3,2015-06-04,1,96995,0.0,0,0,3,0,0,1.0,-0.716897
4,2015-06-05,1,96995,0.0,0,0,4,0,0,1.0,-0.716897


In [33]:
df_dummies = pd.get_dummies(
    df[[
        'date', 'store_nbr', 'item_nbr', 'unit_sales_scaled',
        'onpromotion', 'holiday', 'dow', 'weekend', 'waged_day'
    ]],
    columns=['dow']
)

df_dummies.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales_scaled,onpromotion,holiday,weekend,waged_day,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,2015-06-01,1,96995,-0.716897,0,0,0,0,1,0,0,0,0,0,0
1,2015-06-02,1,96995,-0.716897,0,0,0,0,0,1,0,0,0,0,0
2,2015-06-03,1,96995,-0.716897,0,0,0,0,0,0,1,0,0,0,0
3,2015-06-04,1,96995,-0.716897,0,0,0,0,0,0,0,1,0,0,0
4,2015-06-05,1,96995,-0.716897,0,0,0,0,0,0,0,0,1,0,0


In [9]:
df_stores = pd.read_csv('../input/stores.csv')
df_stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [17]:
le_city = sklearn.preprocessing.LabelEncoder()
le_state = sklearn.preprocessing.LabelEncoder()
le_type = sklearn.preprocessing.LabelEncoder()
df_stores['n_city'] = le_city.fit_transform(df_stores['city'])
df_stores['n_state'] = le_state.fit_transform(df_stores['state'])
df_stores['n_type'] = le_type.fit_transform(df_stores['type'])
#df_stores['n_city'] =
df_stores.head()


Unnamed: 0,store_nbr,city,state,type,cluster,n_city,n_state,n_type
0,1,Quito,Pichincha,D,13,18,12,3
1,2,Quito,Pichincha,D,13,18,12,3
2,3,Quito,Pichincha,D,8,18,12,3
3,4,Quito,Pichincha,D,9,18,12,3
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4,21,14,3


In [22]:
le_family = sklearn.preprocessing.LabelEncoder()
df_items['n_family'] = le_family.fit_transform(df_items['family'])
df_items.head()

Unnamed: 0,item_nbr,family,class,perishable,n_family
0,96995,GROCERY I,69,0,12
1,99197,GROCERY I,43,0,12
2,103501,CLEANING,-64,0,7
3,103520,GROCERY I,4,0,12
4,103665,BREAD/BAKERY,-104,1,5


In [34]:
df_stores[['store_nbr', 'n_city', 'n_state', 'n_type', 'cluster']].to_csv(
    'data/num_stores.csv.gz', compression='gzip', index=None
)
df_items[['item_nbr', 'n_family', 'class', 'perishable']].to_csv(
    'data/num_items.csv.gz', compression='gzip', index=None
)
df_dummies.to_csv(
    'data/ts.csv.gz', compression='gzip', index=None
)