In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Input locations, relative to the notebook's working directory.
file_dir = '../assets/combined_milk.csv'
time_dir = '../assets/time.csv'
zscore_dir = '../assets/Z_scores.csv'

## Load the three source tables.
raw = pd.read_csv(file_dir)
calendar = pd.read_csv(time_dir)
zscore = pd.read_csv(zscore_dir)

## Restrict to a single store. The explicit .copy() makes `sales` an independent
## frame, so the column assignments below never write into a filtered selection
## of `raw` (the pattern that triggers pandas' SettingWithCopyWarning).
sales = raw[raw['Store_ID'] == 236117].copy()

## Collapse the individual flags into one column each by taking the element-wise maximum.
sales['Display'] = np.maximum(sales['Display1'], sales['Display2'])
sales['Feature'] = np.maximum.reduce([sales['Feature1'], sales['Feature2'], sales['Feature3'], sales['Feature4']])


## Build rolling L8W Avg Sales & L7W Sum Sales
## Order rows by SKU, then week, so each SKU's rolling window walks its rows in Time_ID order.
## Reassign rather than sort in place, so the sort never mutates a frame that was
## derived by filtering another one.
sales = sales.sort_values(by=['SKU', 'Time_ID'])

## min_periods=1 lets the first rows of each SKU use however many observations exist so far.
## NOTE(review): both windows include the current row's Sales. If these columns are meant
## to be lagged predictors of the same row's Sales, a shift(1) may be needed - confirm.
sales['Lag8w_avg_sls'] = sales.groupby('SKU')['Sales'].transform(lambda x: x.rolling(window=8, min_periods=1).mean())
sales['Lag7w_sum_sls'] = sales.groupby('SKU')['Sales'].transform(lambda x: x.rolling(window=7, min_periods=1).sum())

## NOTE(review): despite the name, this stores the NEGATIVE natural log of Sales, and a
## row with Sales == 0 yields inf. Confirm that both are intended by the downstream consumer.
sales['Log_sls'] = -np.log(sales['Sales'])

## Calculate Price Discount (Discount Index) relative to a per-SKU, per-year reference price.
## Step 1: price floor = 95% of the highest price seen for each SKU x Year.
##         (This is 0.95 * max, not the 95th percentile of the price distribution.)
lb_prices = (
    sales.groupby(['SKU', 'Year'])['Price']
    .max()
    .mul(0.95)
    .reset_index(name='lb_price')
)
sales = sales.merge(lb_prices, on=['SKU', 'Year'], how='left')

## Step 2: reference price = median of the prices at or above that floor.
at_or_above_floor = sales['Price'] >= sales['lb_price']
med_prices = (
    sales.loc[at_or_above_floor, ['SKU', 'Year', 'Price']]
    .groupby(['SKU', 'Year'])['Price']
    .median()
    .reset_index(name='med_price')
)
sales = sales.merge(med_prices, on=['SKU', 'Year'], how='left')

## Step 3: discount index = reference price / actual price
##         (1.0 when the price equals the reference; above 1.0 when the price is lower).
sales['pc_disc'] = sales['med_price'] / sales['Price']

## Standardize the discount index per SKU: (value - Mean) / Std_deviation.
## `Mean` and `Std_deviation` are presumably supplied by the `zscore` table joined here - verify its schema.
sales = sales.merge(zscore, on=['SKU'], how='left')
sales['z_disc'] = (sales['pc_disc'] - sales['Mean']) / sales['Std_deviation']

## Keep only the output columns and expose the standardized discount as `Discount`.
keep_cols = ['SKU', 'Time_ID', 'Year', 'Sales', 'z_disc', 'Display', 'Feature', 'Log_sls', 'Lag8w_avg_sls', 'Lag7w_sum_sls']
sales = sales[keep_cols].rename(columns={'z_disc': 'Discount'})

## The price-floor helper table is not used past this point.
del lb_prices

In [2]:
## Persist the processed sales table and the per-SKU/Year median reference prices.
## index=False keeps the DataFrame index out of the CSV files.
sales.to_csv(path_or_buf='../assets/processed_sales.csv', index=False)
med_prices.to_csv(path_or_buf='../assets/prices.csv', index=False)

In [3]:
## Week lookup table: raw calendar column name -> name used in the exported file.
week_columns = {
    'IRI Week': 'Time_ID',
    'Calendar week starting on': 'Start_Date',
    'Calendar week ending on': 'End_Date',
}
## Select the three columns (in the order listed above) and rename them in one step.
cal_week = calendar[list(week_columns)].rename(columns=week_columns)

cal_week.to_csv('../assets/calendar_week.csv', index=False)
cal_week.head()

Unnamed: 0,Time_ID,Start_Date,End_Date
0,1114,1-Jan-01,7-Jan-01
1,1115,8-Jan-01,14-Jan-01
2,1116,15-Jan-01,21-Jan-01
3,1117,22-Jan-01,28-Jan-01
4,1118,29-Jan-01,4-Feb-01


In [4]:
## Event indicator columns taken from the calendar table, keyed by week.
event_columns = [
    'Halloween', 'Halloween_1',
    'Thanksgiving', 'Thanksgiving_1',
    'Christmas', 'Christmas_1',
    'NewYear',
    'President', 'President_1',
    'Easter', 'Easter_1',
    'Memorial', 'Memorial_1',
    '4thJuly', '4thJuly_1',
    'Labour', 'Labour_1',
]
## Rename the week key to Time_ID and replace missing values with 0.
events = (
    calendar[['IRI Week'] + event_columns]
    .rename(columns={'IRI Week': 'Time_ID'})
    .fillna(0)
)
events.to_csv('../assets/events.csv', index=False)
events.head()

Unnamed: 0,Time_ID,Halloween,Halloween_1,Thanksgiving,Thanksgiving_1,Christmas,Christmas_1,NewYear,President,President_1,Easter,Easter_1,Memorial,Memorial_1,4thJuly,4thJuly_1,Labour,Labour_1
0,1114,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1115,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1116,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1117,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1118,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
