In [1]:
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates

from matplotlib.ticker import StrMethodFormatter       # to set a comma format
from sklearn.preprocessing import MinMaxScaler


from collections import namedtuple, defaultdict

In [59]:
# Load the raw order history straight out of the zipped CSV.  Both the archive
# and the member handle are closed as soon as the frame is in memory, and the
# archive is not bound to the name `zip`, so the builtin stays usable later on.
with zipfile.ZipFile('data/archive.zip') as archive:
    with archive.open('Historical Product Demand.csv') as fp:
        df = pd.read_csv(fp)

# Rows lacking either a date or a demand value are discarded.
df = df.dropna(subset=['Date', 'Order_Demand'])

# Some demand values are wrapped in parentheses; strip them literally
# (regex=False keeps '(' from ever being read as a regex) before the cast.
# NOTE(review): parentheses may be accounting notation for negative quantities;
# the sign is discarded here exactly as before - confirm that this is intended.
df['Order_Demand'] = (
    df['Order_Demand']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .astype('int64')
)

# Parse the dates and order the rows chronologically.
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')

# Calendar helper columns: month name, year, and the first day of the month.
df['Month']  = df['Date'].dt.month_name()
df['Year']   = df['Date'].dt.year
df['Months'] = df['Date'].dt.to_period('M').dt.to_timestamp()

In [58]:
# Rows of warehouse 'Whse_A' only.  The explicit copy makes this an independent
# frame, so the in-place column drop performed on it further down cannot be
# treated by pandas as a write to a slice of `df` (SettingWithCopyWarning).
df_whse_a = df[df.Warehouse == 'Whse_A'].copy()

In [4]:
# Size of the Whse_A subset: row count plus the number of distinct products,
# warehouses and product categories it contains.
n_rows = len(df_whse_a)
n_products, n_warehouse, n_categories = (
    len(set(df_whse_a[column]))
    for column in ('Product_Code', 'Warehouse', 'Product_Category')
)

print(
    "\n"
    f"#rows:       {n_rows}\n"
    f"#products:   {n_products}\n"
    f"#categories: {n_categories}\n"
    f"#warehouse:  {n_warehouse}\n"
)


#rows:       142335
#products:   427
#categories: 25
#warehouse:  1



In [5]:
# Total Whse_A demand per calendar month, for months from 2012-01 up to (but not
# including) 2017-01, returned as a two-column frame (Months, Order_Demand).
x = (
    df_whse_a
    .loc[lambda frame: (frame.Months >= '2012-01-01') & (frame.Months < '2017-01-01')]
    .groupby('Months')['Order_Demand']
    .sum()
    .reset_index()
)

In [None]:
# Drop the warehouse and category columns, which the cells that follow do not read.
# Re-binding the name (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op rather than a
# KeyError.  The summary cell that counts warehouses and categories reads these
# columns, so it has to run before this one.
df_whse_a = df_whse_a.drop(columns=['Warehouse', 'Product_Category'], errors='ignore')

In [8]:
# filter for frequent products
threshold = 1000

# Product codes whose summed demand in df_whse_a is above the threshold.
# NOTE(review): the totals cover every year in df_whse_a, not only the
# 2012-2016 window selected below - confirm that this is intended.
frequent_products = set(
    df_whse_a
    .groupby('Product_Code')['Order_Demand']
    .sum()
    .loc[lambda totals: totals > threshold]
    .index
)

# Rows from 2012 through 2016 that belong to one of those products.
filtered_data = df_whse_a[
    (df_whse_a.Year >= 2012)
    & (df_whse_a.Year < 2017)
    & df_whse_a.Product_Code.isin(frequent_products)
]

In [10]:
# aggregation: summed demand per (product, date) pair
data_aggregated = (
    filtered_data
    .groupby(['Product_Code', 'Date'])['Order_Demand']
    .sum()
    .reset_index()
)

# weekly: one column per product, dates without orders count as 0, then weekly sums
data_pivot = (
    data_aggregated
    .pivot(index='Date', columns='Product_Code', values='Order_Demand')
    .fillna(0)
)
data_resampled_weekly = data_pivot.resample('W').sum()

# daily: put the weekly frame on a day-by-day index spanning its first to last
# label, forward-fill the gaps and divide by 7.
# NOTE(review): assuming pandas' default week-ending labels for 'W', each weekly
# total lands on its label day and the six days after it (a lag relative to the
# week it summarises) and the last week covers a single day - confirm intended.
daily_index = pd.date_range(
    start=data_resampled_weekly.index.min(),
    end=data_resampled_weekly.index.max(),
    freq='D',
)
data_resampled_daily = data_resampled_weekly.reindex(daily_index).ffill().div(7)

In [13]:
import numpy as np
from numpy.fft import fft

def spectrogram(audio, win = 7, step=1, normalize=True):
    """Sliding-window magnitude spectrogram of a 1-D signal.

    Frame ``k`` is the absolute value of the FFT of
    ``audio[k*step : k*step + win]`` multiplied by a Hann window of length
    ``win``.  Frames are produced for window ends ``win, win + step, ...`` that
    are strictly smaller than ``len(audio)``, so the final sample of the signal
    is never part of a frame.

    Parameters
    ----------
    audio : 1-D array-like
        Signal to analyse (converted with ``np.asarray``).
    win : int
        Window length in samples; only the first ``win // 2`` frequency bins
        of every frame are returned.
    step : int
        Hop between consecutive windows.
    normalize : bool
        When true, each frame has its mean subtracted and is divided by its
        standard deviation plus 1.0 (both computed over all ``win`` bins,
        before the bins are truncated).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_frames, win // 2)``.  If the signal is too short
        for a single frame an empty ``(0, win // 2)`` array is returned
        instead of failing on the 2-D slice of an empty array.
    """
    audio = np.asarray(audio)
    hanning = np.hanning(win)
    frames = []
    for i in range(win, len(audio), step):
        dft = np.abs(fft(audio[i - win: i] * hanning))
        if normalize:
            mu  = np.mean(dft)
            # The +1.0 keeps the divisor away from zero for flat frames.
            std = np.std(dft) + 1.0
            frames.append((dft - mu) / std)
        else:
            frames.append(dft)
    if not frames:
        # No complete window fits: keep the 2-D contract with zero rows.
        return np.empty((0, win // 2))
    return np.array(frames)[:, :win // 2]


In [60]:
def create_sequences(data, time_steps=1):
    """Cut ``data`` into overlapping windows paired with the element that follows.

    For every ``k`` in ``range(len(data) - time_steps)`` the window is
    ``data[k : k + time_steps]`` and its target is ``data[k + time_steps]``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the stacked windows and the stacked targets.
    """
    n_samples = len(data) - time_steps
    windows = [data[start:start + time_steps] for start in range(n_samples)]
    targets = [data[start + time_steps] for start in range(n_samples)]
    return np.array(windows), np.array(targets)

# English weekday name -> zero-based position in the week, Monday first.
WEEK_IDX = {
    day: position
    for position, day in enumerate((
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ))
}
def days_of_week(df, time_steps=1): 
    """Sliding windows of weekday codes taken from the index of ``df``.

    Each entry of ``df.index`` is turned into its ``WEEK_IDX`` code via
    ``day_name()``; window ``k`` then holds the codes at positions
    ``k .. k + time_steps - 1``.  One window is produced for every ``k`` in
    ``range(len(df.index) - time_steps)``.

    Returns
    -------
    numpy.ndarray
        The stacked windows of weekday codes.
    """
    codes = [WEEK_IDX[day] for day in df.index.day_name()]
    n_windows = len(codes) - time_steps
    windows = [codes[start:start + time_steps] for start in range(n_windows)]
    return np.array(windows)


# English month name -> zero-based position in the year, January first.
MONTH_IDX = {
    month: position
    for position, month in enumerate((
        'January',
        'February',
        'March',
        'April',
        'May',
        'June',
        'July',
        'August',
        'September',
        'October',
        'November',
        'December',
    ))
}
def month_of_year(df, time_steps=1): 
    """Sliding windows of month codes taken from the index of ``df``.

    Each entry of ``df.index`` is turned into its ``MONTH_IDX`` code via
    ``month_name()``; window ``k`` then holds the codes at positions
    ``k .. k + time_steps - 1``.  One window is produced for every ``k`` in
    ``range(len(df.index) - time_steps)``.

    Returns
    -------
    numpy.ndarray
        The stacked windows of month codes.
    """
    codes = [MONTH_IDX[month] for month in df.index.month_name()]
    n_windows = len(codes) - time_steps
    windows = [codes[start:start + time_steps] for start in range(n_windows)]
    return np.array(windows)


def spectrogram_features(data, time_steps=1, win=30):
    """Sliding windows over the per-column spectrograms of a 2-D array.

    A spectrogram is computed for every column of ``data`` with
    ``spectrogram(column, win=win)`` and the results are stacked along a new
    last axis, giving an array indexed as (frame, frequency bin, column).
    Window ``k`` then holds frames ``k .. k + time_steps - 1``; one window is
    produced for every ``k`` in ``range(n_frames - time_steps)``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; each column is analysed independently.
    time_steps : int
        Number of consecutive spectrogram frames per window.
    win : int
        FFT window length passed on to ``spectrogram``.  Defaults to 30, the
        value that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    numpy.ndarray
        The stacked windows, indexed as (window, time step, frequency bin,
        column).
    """
    _, n_columns = data.shape
    per_column = [spectrogram(data[:, col], win=win) for col in range(n_columns)]
    # Stacking on the last axis yields (frame, bin, column) directly, which is
    # what transposing each spectrogram, stacking and transposing again gave.
    stacked = np.stack(per_column, axis=-1)
    n_windows = len(stacked) - time_steps
    windows = [stacked[start:start + time_steps] for start in range(n_windows)]
    return np.array(windows)

In [61]:
# Build the model inputs in dependency order.  `scaled_data` has to exist
# before the spectrogram features are computed from it; computing the features
# first only works with leftover kernel state and fails on Restart & Run All.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data_resampled_daily)

time_steps = 30  # e.g., 30 days

# Windows of scaled demand and the value that follows each window.
X, y = create_sequences(scaled_data, time_steps)

# Auxiliary features, all using the same window length.
# NOTE(review): `spec` ends up with fewer samples than X (the recorded shapes
# are 1761 vs 1791) because the spectrogram itself consumes a window - align
# the arrays before combining them; confirm how this is handled downstream.
spec = spectrogram_features(scaled_data, time_steps=time_steps)
month_features = month_of_year(data_resampled_daily, time_steps=time_steps)
day_features = days_of_week(data_resampled_daily, time_steps=time_steps)

In [66]:
# Shapes of X, y, the month features, the weekday features and the spectrogram features, in that order.
tuple(arr.shape for arr in (X, y, month_features, day_features, spec))

((1791, 30, 339), (1791, 339), (1791, 30), (1791, 30), (1761, 30, 15, 339))