In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
sys.path.append("../src/utils")
from eda_tools import *

## Reading in Dataset

### Weekly Sales

In [2]:
sales = pd.read_csv("../walmart_sales/train.csv")
sales.head()

Unnamed: 0,Region,DMZ,Date,Weekly_Sales,IsHoliday,DMZ.1
0,1,1,5/2/2010,24924.5,False,1
1,1,1,12/2/2010,46039.49,True,1
2,1,1,19/2/10,41595.55,False,1
3,1,1,26/2/10,19403.54,False,1
4,1,1,5/3/2010,21827.9,False,1


In [3]:
# Converting dates
sales["Date"] = pd.to_datetime(sales["Date"])

# Dropping DMZ.1 column
sales = sales.drop(["DMZ.1"], axis=1)

sales.head()

Unnamed: 0,Region,DMZ,Date,Weekly_Sales,IsHoliday
0,1,1,2010-05-02,24924.5,False
1,1,1,2010-12-02,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-05-03,21827.9,False


In [4]:
missing_data(sales)

Unnamed: 0,Region,DMZ,Date,Weekly_Sales,IsHoliday
Total,117604,117604,117604,117604,117604
Missing,0,0,0,0,0
Percent Missing(%),0.0,0.0,0.0,0.0,0.0
Types,int64,int64,datetime64[ns],float64,bool


### Weekly Sales by Region

In [5]:
region_sales = pd.read_csv('../walmart_sales/train_region_v2.csv')
# Converting dates
region_sales["Date"] = pd.to_datetime(region_sales["Date"])

In [6]:
region_sales

Unnamed: 0,Region,Date,Weekly_Sales
0,1,2010-02-05,1643690.90
1,1,2010-02-12,1641957.44
2,1,2010-02-19,1611968.17
3,1,2010-02-26,1409727.59
4,1,2010-03-05,1554806.68
...,...,...,...
1711,12,2012-09-28,880415.67
1712,12,2012-10-05,979825.92
1713,12,2012-10-12,934917.47
1714,12,2012-10-19,960945.43


### Store information

In [7]:
store_features = pd.read_csv("../walmart_sales/stores.csv")
store_features.head()

Unnamed: 0,Region,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


### Other features

In [8]:
other_features = pd.read_csv("../walmart_sales/features.csv")
# Converting dates
other_features["Date"] = pd.to_datetime(other_features["Date"])

In [9]:
other_features.head()

Unnamed: 0,Region,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


In [10]:
# Drop MarkDown columns
other_features.drop(["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"], axis=1, inplace=True)

In [11]:
other_features.head()

Unnamed: 0,Region,Date,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,211.350143,8.106,False


### Merging

In [12]:
missing_data(region_sales)

Unnamed: 0,Region,Date,Weekly_Sales
Total,1716,1716,1716
Missing,0,0,0
Percent Missing(%),0.0,0.0,0.0
Types,int64,datetime64[ns],float64


In [13]:
# Merging other features and region sales
temp1 = pd.merge(region_sales, other_features, on=["Region", "Date"], how = "inner")
missing_data(temp1)

Unnamed: 0,Region,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday
Total,1716,1716,1716,1716,1716,1716,1716,1716
Missing,0,0,0,0,0,0,0,0
Percent Missing(%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Types,int64,datetime64[ns],float64,float64,float64,float64,float64,bool


In [14]:
# Merging region features
temp2 = pd.merge(temp1, store_features, on=["Region"], how="inner")
missing_data(temp2)

Unnamed: 0,Region,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Type,Size
Total,1716,1716,1716,1716,1716,1716,1716,1716,1716,1716
Missing,0,0,0,0,0,0,0,0,0,0
Percent Missing(%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Types,int64,datetime64[ns],float64,float64,float64,float64,float64,bool,object,int64


In [15]:
region = temp2.copy()

In [16]:
region.head()

Unnamed: 0,Region,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Type,Size
0,1,2010-02-05,1643690.9,42.31,2.572,211.096358,8.106,False,A,151315
1,1,2010-02-12,1641957.44,38.51,2.548,211.24217,8.106,True,A,151315
2,1,2010-02-19,1611968.17,39.93,2.514,211.289143,8.106,False,A,151315
3,1,2010-02-26,1409727.59,46.63,2.561,211.319643,8.106,False,A,151315
4,1,2010-03-05,1554806.68,46.5,2.625,211.350143,8.106,False,A,151315


In [18]:
# train_df = pd.DataFrame()
# test_df = pd.DataFrame()

# n = len(region)
# split = 0.2


# for r in pd.unique(region["Region"]):
#     temp = region.loc[region["Region"] == r, ]
#     temp_train = temp.iloc[:-(int(split*len(temp))), :]
#     temp_test = temp.iloc[-(int(split*len(temp))):, :]
#     print(len(temp), len(temp_train), len(temp_test))
#     train_df = pd.concat([train_df, temp_train], axis = 0)
#     test_df = pd.concat([test_df, temp_test], axis = 0)

# train_df.reset_index(drop=True)
# test_df.reset_index(drop=True)

train_df, test_df = train_test_split_timeseries(region, split=0.2, group="Region")


143 115 28
143 115 28
143 115 28
143 115 28
143 115 28
143 115 28
143 115 28
143 115 28
143 115 28
143 115 28
143 115 28
143 115 28


In [19]:
train_df[train_df["Region"] == 1].tail()

Unnamed: 0,Region,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Type,Size
110,1,2012-03-16,1677472.78,64.74,3.734,221.211813,7.348,False,A,151315
111,1,2012-03-23,1511068.07,65.93,3.787,221.286413,7.348,False,A,151315
112,1,2012-03-30,1649604.63,67.61,3.845,221.361012,7.348,False,A,151315
113,1,2012-04-06,1899676.88,70.43,3.891,221.435611,7.143,False,A,151315
114,1,2012-04-13,1621031.7,69.07,3.891,221.51021,7.143,False,A,151315


In [20]:
test_df[test_df["Region"]==1].head()

Unnamed: 0,Region,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Type,Size
115,1,2012-04-20,1521577.87,66.76,3.877,221.564074,7.143,False,A,151315
116,1,2012-04-27,1468928.37,67.23,3.814,221.617937,7.143,False,A,151315
117,1,2012-05-04,1684519.99,75.55,3.749,221.6718,7.143,False,A,151315
118,1,2012-05-11,1611096.05,73.77,3.688,221.725663,7.143,False,A,151315
119,1,2012-05-18,1595901.87,70.33,3.63,221.742674,7.143,False,A,151315


In [21]:
# # Exporting the data
# train_df.to_csv("../walmart_sales/train_payload2_azureml.csv", index=False)
# test_df.to_csv("../walmart_sales/test_payload2_azureml.csv", index=False)

In [100]:
train, test = train_test_split_timeseries(region, split=0.3, group="Region")

143 101 42
143 101 42
143 101 42
143 101 42
143 101 42
143 101 42
143 101 42
143 101 42
143 101 42
143 101 42
143 101 42
143 101 42


In [101]:
train

Unnamed: 0,Region,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Type,Size
0,1,2010-02-05,1643690.90,42.31,2.572,211.096358,8.106,False,A,151315
1,1,2010-02-12,1641957.44,38.51,2.548,211.242170,8.106,True,A,151315
2,1,2010-02-19,1611968.17,39.93,2.514,211.289143,8.106,False,A,151315
3,1,2010-02-26,1409727.59,46.63,2.561,211.319643,8.106,False,A,151315
4,1,2010-03-05,1554806.68,46.50,2.625,211.350143,8.106,False,A,151315
...,...,...,...,...,...,...,...,...,...,...
1669,12,2011-12-09,1189646.45,42.17,3.644,129.855533,12.890,False,B,112238
1670,12,2011-12-16,1293404.18,43.29,3.600,129.898065,12.890,False,B,112238
1671,12,2011-12-23,1617612.03,45.40,3.541,129.984548,12.890,False,B,112238
1672,12,2011-12-30,1111638.07,44.64,3.428,130.071032,12.890,True,B,112238
