In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import math
import torch 
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [4]:
current = os.getcwd()
data_path = os.path.join(os.path.dirname(current),'data')
data_path
def prepare_df(data_path):
    df_dict = {}
    for root,folder,files in os.walk(data_path):
        for file in files:
            if file.endswith('.csv'):
                df_name = file.split('.')[0]+'_df'
                print(df_name)
                df_dict[df_name] = pd.read_csv(os.path.join(root,file))
    return df_dict

In [5]:
df_dict = prepare_df(data_path)

calendar_df
sales_train_validation_df
sample_submission_df
sell_prices_df


In [6]:
def describe_df(df_dict):
    for key in df_dict.keys():
        print('*'*80)
        print('df_name: ',key,df_dict[key].shape)
        if(len(df_dict[key].columns) < 15):
            print('columns: ',df_dict[key].columns)
            print('*'*80)

In [7]:
describe_df(df_dict)

********************************************************************************
df_name:  calendar_df (1969, 14)
columns:  Index(['date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'd',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI'],
      dtype='object')
********************************************************************************
********************************************************************************
df_name:  sales_train_validation_df (30490, 1919)
********************************************************************************
df_name:  sample_submission_df (60980, 29)
********************************************************************************
df_name:  sell_prices_df (6841121, 4)
columns:  Index(['store_id', 'item_id', 'wm_yr_wk', 'sell_price'], dtype='object')
********************************************************************************


In [8]:
cal_df = df_dict['calendar_df']
cal_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [9]:
cal_df['date'] = pd.to_datetime(cal_df['date'])
cal_df = cal_df[['date','wm_yr_wk']]
cal_df = cal_df.sort_values(by='date')
cal_df = cal_df[cal_df.date >= '2015-11-01']
cal_df.head()

Unnamed: 0,date,wm_yr_wk
1737,2015-11-01,11540
1738,2015-11-02,11540
1739,2015-11-03,11540
1740,2015-11-04,11540
1741,2015-11-05,11540


In [10]:
sell_df = df_dict['sell_prices_df']
sell_df = sell_df[['wm_yr_wk','sell_price','item_id']]
sell_df.head()

Unnamed: 0,wm_yr_wk,sell_price,item_id
0,11325,9.58,HOBBIES_1_001
1,11326,9.58,HOBBIES_1_001
2,11327,8.26,HOBBIES_1_001
3,11328,8.26,HOBBIES_1_001
4,11329,8.26,HOBBIES_1_001


In [11]:
item_sell_df = pd.DataFrame(cal_df.merge(sell_df,on='wm_yr_wk',how='left').groupby(['item_id'])['sell_price'].max().reset_index())
item_sell_df.head()

Unnamed: 0,item_id,sell_price
0,FOODS_1_001,2.24
1,FOODS_1_002,9.48
2,FOODS_1_003,3.23
3,FOODS_1_004,1.96
4,FOODS_1_005,3.54


In [12]:
sales_train_validation_df = df_dict['sales_train_validation_df']
sales_train_validation_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [13]:
df = item_sell_df.merge(sales_train_validation_df,on='item_id')

In [14]:
df.head()

Unnamed: 0,item_id,sell_price,id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,FOODS_1_001,2.24,FOODS_1_001_CA_1_validation,FOODS_1,FOODS,CA_1,CA,3,0,0,...,0,2,0,4,1,1,0,1,1,0
1,FOODS_1_001,2.24,FOODS_1_001_CA_2_validation,FOODS_1,FOODS,CA_2,CA,2,0,0,...,1,0,14,0,1,1,4,0,0,4
2,FOODS_1_001,2.24,FOODS_1_001_CA_3_validation,FOODS_1,FOODS,CA_3,CA,1,2,1,...,0,0,13,0,0,0,0,0,1,0
3,FOODS_1_001,2.24,FOODS_1_001_CA_4_validation,FOODS_1,FOODS,CA_4,CA,0,1,1,...,0,0,0,2,0,0,0,1,1,1
4,FOODS_1_001,2.24,FOODS_1_001_TX_1_validation,FOODS_1,FOODS,TX_1,TX,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [63]:
#df.to_csv('train_data.csv',index=False)

In [27]:
#total products
len(df.item_id.unique())

3049

In [18]:
ids = df['item_id']+'_'+df['store_id']+'_validation'

In [19]:
demand_mat = df.iloc[:,7:].values.astype(np.float64) * df.sell_price.values.astype(np.float64).reshape(30490,-1)

In [20]:
demand_mat.T.shape

(1913, 30490)

In [124]:
# Normalize data for the  feature_range=(-1, 1)

np_mat = demand_mat.T[:280]
scalar = MinMaxScaler(feature_range=(-1, 1))
normalized_mat = scalar.fit_transform(np_mat)
normalized_mat

array([[-0.33333333, -0.75      , -0.9       , ..., -1.        ,
        -1.        , -1.        ],
       [-1.        , -1.        , -0.8       , ..., -1.        ,
        -1.        , -0.33333333],
       [-1.        , -1.        , -0.9       , ..., -1.        ,
        -1.        , -1.        ],
       ...,
       [-0.33333333, -0.875     , -1.        , ..., -1.        ,
        -1.        , -1.        ],
       [-0.77777778, -1.        , -1.        , ..., -1.        ,
        -1.        ,  0.33333333],
       [-0.11111111, -0.75      ,  0.1       , ..., -1.        ,
        -1.        , -1.        ]])

In [40]:
train_mat = normalize_data(demand_mat.T[:280])
train_mat

array([[-0.33333333, -0.75      , -0.9       , ..., -1.        ,
        -1.        , -1.        ],
       [-1.        , -1.        , -0.8       , ..., -1.        ,
        -1.        , -0.33333333],
       [-1.        , -1.        , -0.9       , ..., -1.        ,
        -1.        , -1.        ],
       ...,
       [-0.33333333, -0.875     , -1.        , ..., -1.        ,
        -1.        , -1.        ],
       [-0.77777778, -1.        , -1.        , ..., -1.        ,
        -1.        ,  0.33333333],
       [-0.11111111, -0.75      ,  0.1       , ..., -1.        ,
        -1.        , -1.        ]])

In [116]:
past_days = 180
X = train_mat[:past_days,0].reshape(-1,1)
y = train_mat[past_days+1,0].reshape(-1,1)
X.shape

(180, 1)

In [117]:
y

array([[-0.55555556]])

In [85]:
train_data_normalized_X = torch.FloatTensor(X)
train_data_normalized_y = torch.FloatTensor(y)

In [86]:
train_data_normalized_X.shape

torch.Size([28, 1])

In [43]:
def split_sequence(sequence, n_steps_in, n_steps_out):
	X, y = list(), list()
	for i in range(len(sequence)):
		# find the end of this pattern
		end_ix = i + n_steps_in
		out_end_ix = end_ix + n_steps_out
		# check if we are beyond the sequence
		if out_end_ix > len(sequence):
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = sequence[i:end_ix], sequence[end_ix:out_end_ix]
		X.append(seq_x)
		y.append(seq_y)
	return np.array(X), np.array(y)

In [75]:
sequence, n_steps_in, n_steps_out = train_mat,60,28
#X,Y = split_sequence(sequence, n_steps_in, n_steps_out)

In [76]:
X.shape,Y.shape

((219, 60, 30490), (219, 2, 30490))

In [77]:
np.save('train_matrix.npy',train_mat)

# Build Model

In [105]:
class LSTM(nn.Module):
    def __init__(self, input_size, output_size,hidden_layer_size=100):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size, hidden_layer_size)

        self.linear = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = (torch.zeros(1,1,self.hidden_layer_size),
                            torch.zeros(1,1,self.hidden_layer_size))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        return predictions[-1]

In [107]:
train_data_normalized_X.shape[1]

1

In [109]:
input_size = train_data_normalized_X.shape[1]
future_days = 28
model = LSTM(input_size,future_days)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [110]:
print(model)

LSTM(
  (lstm): LSTM(1, 100)
  (linear): Linear(in_features=100, out_features=28, bias=True)
)


In [118]:
epochs = 150

for i in range(epochs):
    optimizer.zero_grad()
    model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size),
                        torch.zeros(1, 1, model.hidden_layer_size))
    y_pred = model(train_data_normalized_X)
    
    single_loss = loss_function(y_pred, train_data_normalized_y)
    single_loss.backward()
    optimizer.step()
    
    if i%25 == 1:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')


print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')

  return F.mse_loss(input, target, reduction=self.reduction)


epoch:   1 loss: 0.00000003
epoch:  26 loss: 0.00000000
epoch:  51 loss: 0.00000000
epoch:  76 loss: 0.00000000
epoch: 101 loss: 0.00000000
epoch: 126 loss: 0.00000000
epoch: 149 loss: 0.0000000000


In [136]:
y_pred

tensor([-0.5556, -0.5556, -0.5556, -0.5556, -0.5556, -0.5556, -0.5556, -0.5556,
        -0.5556, -0.5556, -0.5556, -0.5556, -0.5556, -0.5556, -0.5556, -0.5556,
        -0.5556, -0.5556, -0.5556, -0.5556, -0.5556, -0.5556, -0.5556, -0.5556,
        -0.5556, -0.5556, -0.5556, -0.5556], grad_fn=<SelectBackward>)

In [142]:
for idx,i in enumerate(y_pred.tolist()):
    print('forcast for day{}:{}'.format(idx,i))

forcast for day0:-0.5555555820465088
forcast for day1:-0.5555557012557983
forcast for day2:-0.5555556416511536
forcast for day3:-0.5555557012557983
forcast for day4:-0.5555557608604431
forcast for day5:-0.5555556416511536
forcast for day6:-0.5555556416511536
forcast for day7:-0.5555557012557983
forcast for day8:-0.5555557012557983
forcast for day9:-0.5555555820465088
forcast for day10:-0.5555556416511536
forcast for day11:-0.5555556416511536
forcast for day12:-0.5555555820465088
forcast for day13:-0.5555555820465088
forcast for day14:-0.5555556416511536
forcast for day15:-0.5555556416511536
forcast for day16:-0.5555555820465088
forcast for day17:-0.5555557012557983
forcast for day18:-0.5555556416511536
forcast for day19:-0.5555555820465088
forcast for day20:-0.5555557012557983
forcast for day21:-0.5555555820465088
forcast for day22:-0.5555556416511536
forcast for day23:-0.5555555820465088
forcast for day24:-0.5555555820465088
forcast for day25:-0.5555555820465088
forcast for day26:-0.5

In [147]:
1903/27

70.48148148148148