# 0.0 Modules, etc.

In [1]:
import numpy as np
import pandas as pd
import multiprocessing

import scipy
from scipy import spatial
from scipy.spatial import cKDTree

import sklearn as sk
from sklearn import svm
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import clone
from sklearn.externals.six.moves import xrange

import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

# init_notebook_mode()

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random

import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

import dask.dataframe as dd
#from chest import Chest

InteractiveShell.ast_node_interactivity = "all"
#InteractiveShell.ast_node_interactivity = "last"

# Magic function to make matplotlib inline
%matplotlib inline

%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

import dask.dataframe as ddf
import dask.array as da

In [2]:
# Use fully-qualified option names. The short forms ('max_columns', 'max_rows')
# are resolved by pattern matching and raise OptionError on pandas versions
# where more than one option matches (e.g. 'styler.render.max_columns').
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [None]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
# plotly.tools.set_credentials_file(username=..., api_key=...)  # credentials redacted -- never commit API keys; rotate the key that was previously stored here
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

# Put cufflinks into offline mode (called once, before any plotting below)
cf.go_offline()

# 1 Preprocessing

## 1.1 Wrangling

In [None]:
%%time
dat0 = pd.read_csv('seasonal_sales_indicators.csv',
                 delimiter = '~')

# len(dat0['article_number'].unique()) # 46573

dat = dat0.copy()

dat.columns = [x.replace('t_eu_ecom_dit_dsf_transaction_t.', '') for x in dat.columns] # tidy column names

dat = dat[dat['gross_demand_quantity'] != 0] 

Data

    (0) dat0
    (1) dat

In [None]:
# Feature subset for clustering
cluster_cols = ['article_number',
                'gross_demand_quantity', # mean, sum, std
                'fraction_of_full_price', # mean, std
                'markdown', # mean (INDICATOR)
                'sale', # mean (INDICATOR no/sale)
                'gross_sales_net_disc_gross_ret']

# Build the subset in one chain so the new column is added to a fresh frame
# (assigning into / dropping in place on a column slice of `dat` triggers
# SettingWithCopyWarning). sales_price = sales value per demanded unit.
dat = (
    dat[cluster_cols]
    .assign(sales_price=lambda d: d['gross_sales_net_disc_gross_ret'].divide(d['gross_demand_quantity']))
    .drop('gross_sales_net_disc_gross_ret', axis=1)
)

In [None]:
%%time

dat_grouped = dat.groupby(by = 'article_number')

dat = dat_grouped.agg(['mean', 'std']) # Need count
dat.drop([('markdown', 'std'),('sale', 'std')], axis = 1, inplace=True)
dat[('gross_demand_quantity','sum')] = dat_grouped['gross_demand_quantity'].sum()

dat.fillna(value=0, inplace = True) # Define single transaction article std to be 0
dat = dat.replace([np.inf, -np.inf, np.nan], 0)

## 1.2 Save/Load curated data

In [None]:
# Save/load tidied version
# The save is commented out, so this cell only works if 'dat.csv' already exists on disk.
# NOTE(review): 'dat.csv' is also the file name read (with a single header row) in the
# section 5/6 cells further down -- confirm the two datasets are not overwriting each other.

# dat.to_csv('dat.csv')

dat = pd.read_csv('dat.csv', low_memory=False, index_col = 0, header = [0,1]) # gotta encode multi-index

# 2 Clustering 

## 2.0 Baseline: feature-based

### 2.0.1 Sort

In [None]:
feature = 'sales_price'
stat = 'mean'

# Articles ranked by the chosen (feature, stat) column, largest first
dat_feature_sorted = dat.sort_values([(feature, stat)], ascending=False) # Sort

# # --- Feature cumulative sum ---
# dat_feature_sorted['cumulative_feature'] = dat_feature_sorted[feature][stat].cumsum()

# # --- Feature cumulative percent ---
# total = dat_feature_sorted[feature][stat].sum() 
# dat_feature_sorted['cumulative_pct_feature'] = dat_feature_sorted[feature][stat].cumsum()/total

# # --- Cumulative percent of articles ---
# t = pd.Series(range(1, 46573))/46572
# dat_feature_sorted['cumulative_pct_articles'] = t.values

# --- Add feature based cluster; i.e. top X are labelled non-basic ---
# The label vector is sized from the data instead of hard-coding 8483 + 38089,
# so it always has exactly one label per row of dat_feature_sorted.
n_articles = len(dat_feature_sorted)
n_non_basic = min(8483, n_articles)  # number of top-ranked articles labelled 1
f = pd.DataFrame(np.concatenate([np.ones(n_non_basic), np.zeros(n_articles - n_non_basic)]))

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the sorted feature table (zero mean, unit variance per column)
scaler = StandardScaler()
dat_feature_sorted_scaled = scaler.fit_transform(dat_feature_sorted)

# Fit a 2-component PCA on the standardised data ...
pca = PCA(n_components=2)
pca.fit(dat_feature_sorted_scaled)

# ... and project every article onto those two components
dat_pca = pd.DataFrame(pca.transform(dat_feature_sorted_scaled))


### 2.0.2 Plots

In [None]:
# --- 1 ---- PCA projection coloured by the feature-based label
fig, ax = plt.subplots(figsize=(16, 5))
points = ax.scatter(dat_pca[0],
                    dat_pca[1],
                    c = f[0],
                    alpha = 0.5)
fig.colorbar(points, ax=ax)
ax.set_title('Basic and Non-basic')
ax.set_ylabel('Principal Component 2')
ax.set_xlabel('Principal Component 1')


# Cumulative shares are computed locally: the 'cumulative_pct_*' columns are not
# stored on dat_feature_sorted (adding them there would leak into scaling/PCA).
feature_values = dat_feature_sorted[(feature, stat)]
cumulative_pct_feature = (feature_values.cumsum() / feature_values.sum()).values
cumulative_pct_articles = np.arange(1, len(feature_values) + 1) / len(feature_values)


# --- 2 ---- share of the feature total vs. share of articles (own figure)
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(cumulative_pct_articles,
        cumulative_pct_feature,
        linewidth = 3)

# ax.set_xlabel('Percent of Articles')
# ax.set_ylabel('Percent of Feature of Interest')

ax.set_title('Majority, from a Minority of Articles')
plt.rcParams["figure.figsize"] = [6,6]  # kept: default size for figures created after this cell


# --- 3 ---- same curve, interactive
import plotly.graph_objs as go

# NOTE(review): the trace is the cumulative curve, yet it is named '45 degree line' -- confirm the label.
trace1 = go.Scatter(
    x = cumulative_pct_articles,
    y = cumulative_pct_feature,
    mode = 'lines',
    name = '45 degree line'
)


data = [trace1]
iplot(data, filename='scatter-mode')

## 2.01 Scale (for ML approaches)

In [None]:
# SCALING: zero mean and unit variance
from sklearn.preprocessing import StandardScaler

# Fit on the aggregated article table and keep its labels on the scaled copy
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dat)
dat_scaled = pd.DataFrame(scaled_values, index=dat.index, columns=dat.columns)

## 2.1 k-means

### 2.1.1 Model

In [None]:
%%time

# K-means clustering  --------------------  --------------------
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 4)
kmeans.fit(dat_scaled)

Kmeans = pd.Series(kmeans.predict(dat_scaled)) # Cluster assignments

# Cluster counts
print(Kmeans.value_counts())
print()

In [None]:
# Collapse the cluster ids to a binary label: cluster 0 -> 0, every other cluster -> 1.
# NOTE(review): this presumes cluster 0 is the "basic" group -- verify against the
# value_counts() printed by the previous cell.
Kmeans = (Kmeans != 0)*1 # lone vector
Kmeans.value_counts()

# k = pd.DataFrame({'kmeans': Kmeans.values}, index = dat_scaled.index)


### 2.1.2 Plots

In [None]:
import matplotlib as mpl

In [None]:
mpl.rcParams.update(mpl.rcParamsDefault)

In [None]:
# Pairwise scatter plots of the volume/price features, coloured by the binary k-means label
matrix_cols = [
    ('gross_demand_quantity', 'mean'),
    ('gross_demand_quantity', 'std'),
    ('sales_price', 'mean'),
    ('gross_demand_quantity', 'sum'),
]

pd.plotting.scatter_matrix(dat[matrix_cols],
                           figsize=(10,10),
                           diagonal='kde',
                           c=Kmeans,
                           alpha=0.5)

In [None]:
# Two stacked panels: mean sale price against mean (top) and total (bottom)
# gross demand quantity, coloured by the binary k-means label.
fig, (ax_mean, ax_sum) = plt.subplots(2, 1)

price_mean = dat['sales_price']['mean']

ax_mean.scatter(dat['gross_demand_quantity']['mean'], price_mean, c=Kmeans, alpha=0.5)
ax_mean.set_title('Sale Price vs. Gross Demand Quantity')
ax_mean.set_ylabel('Sale Price')

ax_sum.scatter(dat['gross_demand_quantity']['sum'], price_mean, c=Kmeans, alpha=0.5)
ax_sum.set_ylabel('Sale Price')
ax_sum.set_xlabel('GDQ Sum')


### 2.1.3 Histograms

In [None]:
# Histograms of total demand and mean sale price, split by the binary k-means label.
# `dat` has no 'Kmeans' column: the labels live in the positional Series `Kmeans`
# (one entry per row of dat, in the same order), so masks are built from its values.
is_basic = Kmeans.values == 0
gdq_sum = dat[('gross_demand_quantity', 'sum')]
price_mean = dat[('sales_price', 'mean')]

# (values, bins, x-label, title) -- one entry per panel, thresholds as before
hist_panels = [
    # (a) gross_demand_quantity - basic
    (gdq_sum[is_basic & (gdq_sum.values < 10000)],
     100, 'Total Gross Demand Quantity', 'Basic Articles'),
    # (b) gross_demand_quantity - non-basic
    (gdq_sum[~is_basic & (gdq_sum.values < 5000)],
     50, 'Total Gross Demand Quantity', 'Non-basic Articles'),
    # (c) sales_price - basic
    (price_mean[is_basic & (price_mean.values < 200) & (price_mean.values > 0)],
     100, 'Sale Price', 'Basic Articles'),
    # (d) sales_price - non-basic
    (price_mean[~is_basic & (price_mean.values < 200)],
     100, 'Sale Price', 'Non-basic Articles'),
]

# One axes per histogram so the four distributions do not overlay each other
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
for ax, (values, n_bins, x_label, panel_title) in zip(axes.ravel(), hist_panels):
    values.hist(bins=n_bins, ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Quantity')
    ax.set_title(panel_title)

## 2.2 Agglomerative Clustering

In [None]:
%%time

# Agglomerative Clustering -------------
from sklearn.cluster import AgglomerativeClustering

agg = AgglomerativeClustering(n_clusters = 6)
agglom = pd.Series(agg.fit_predict(dat_scaled))

agglom.value_counts()

In [None]:
# Reclassify to a binary label: cluster 0 -> 0, any other cluster -> 1

agglom = agglom.ne(0) * 1 # lone vector
agglom.value_counts()

In [None]:
# Binary agglomerative labels re-indexed by article (same row order as dat_scaled)
a = pd.DataFrame(agglom.values, index=dat_scaled.index, columns=['agglom'])

In [None]:
# Two stacked panels on the standardised features, coloured by the binary agglomerative label
fig, (ax_mean, ax_sum) = plt.subplots(2, 1)

scaled_price = dat_scaled['sales_price']['mean']

ax_mean.scatter(dat_scaled['gross_demand_quantity']['mean'], scaled_price, c=agglom, alpha=0.25)
ax_mean.set_title('Agglomerative Classes: Sale Price vs. Gross Demand Qty (mean/sum)')
ax_mean.set_ylabel('Sale Price')

ax_sum.scatter(dat_scaled['gross_demand_quantity']['sum'], scaled_price, c=agglom, alpha=0.25)
ax_sum.set_ylabel('Sale Price')
ax_sum.set_xlabel('GDQ Sum')


## 2.3 DBSCAN 

DBSCAN: "Density-Based Spatial Clustering of Applications with Noise"

In [None]:
%%time

from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps = 0.145, min_samples = 5) # 3min 36s
dbs = pd.Series(dbscan.fit_predict(dat_scaled))

dbs.value_counts()

In [None]:
# Reclassify to a binary label: DBSCAN label -1 -> 1, every other label -> 0
dbs01 = dbs.eq(-1) * 1 # lone vector
dbs01.value_counts()

In [None]:
# Binary DBSCAN labels re-indexed by article.
# NOTE(review): this rebinds `dbs` from the raw label Series to a DataFrame, so
# re-running the `dbs01 = (dbs == -1)*1` cell afterwards operates on this frame instead.
dbs = pd.DataFrame({'dbs': dbs01.values}, index = dat_scaled.index)

In [None]:
# DBSCAN label on the same axes as the k-means / agglomerative plots.
# The aggregated `dat` has no 'net_sales' or 'sold_qty' columns, so the plot uses
# the demand-quantity sum and mean sale price that the sibling plots use.
# NOTE(review): confirm these are the intended axes for this figure.
plt.rcParams["figure.figsize"] = [16,16]  # set before plotting so it applies to this figure

plt.scatter(dat['gross_demand_quantity']['sum'],
            dat['sales_price']['mean'],
            c = dbs01,
            alpha = 0.15)
plt.xlabel('GDQ Sum')
plt.ylabel('Sale Price')
plt.colorbar()
plt.title('DBSCAN Classification')

## 2.4 Evaluation

### 2.4.1 Cross methods comparison

In [None]:
# k_a = pd.merge(k, a, left_index = True, right_index = True)
# f_k_a = pd.merge(f, k_a, left_index=True, right_index=True)
# fka_dbs = pd.merge(f_k_a, dbs, left_index=True, right_index=True)

# .groupby(['feat_based', 'kmeans', 'agglom']).size()

# table = pd.DataFrame(data = {'Ag': agglom, 'Km': Kmeans, 'DB': dbs01})
# table.groupby(['Ag', 'Km', 'DB']).size()
1 - 3924/46573 # 0.9157 kmeans-agglomerative agreement


### 2.4.2 Clustering Metrics

See https://scikit-learn.org/stable/modules/clustering.html for metric information (2.4.2 - __ )

In [None]:
# Standardised copy of the sorted feature table, keeping its row and column labels
sorted_scaled_values = StandardScaler().fit_transform(dat_feature_sorted)
dat_feature_sorted_scaled = pd.DataFrame(sorted_scaled_values,
                                         index=dat_feature_sorted.index,
                                         columns=dat_feature_sorted.columns)


In [None]:
# Internal clustering metrics for the four labelings.
# Label vectors are taken from the objects the cells above actually define:
#   f[0]         feature-based label (f has a single, unnamed column 0)
#   Kmeans       binary k-means label (the `k` frame is only built in commented-out code)
#   a['agglom']  binary agglomerative label
#   dbs['dbs']   binary DBSCAN label
from sklearn import metrics

feat_labels = f[0].values
kmeans_labels = Kmeans.values
agglom_labels = a['agglom'].values
dbscan_labels = dbs['dbs'].values

# scikit-learn renamed calinski_harabaz_score -> calinski_harabasz_score and later
# removed the old spelling; use whichever this installation provides.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None) or metrics.calinski_harabaz_score


#### Silhouette Coefficient

# higher better

metrics.silhouette_score(dat_feature_sorted_scaled, feat_labels) # feature based: 0.555

metrics.silhouette_score(dat_feature_sorted, feat_labels) # feature based: 0.59568
metrics.silhouette_score(dat_scaled, kmeans_labels) # k-means: 0.44598
metrics.silhouette_score(dat_scaled, agglom_labels) # agglomerative: 0.33
metrics.silhouette_score(dat_scaled, dbscan_labels) # DBSCAN: 0.25


#### Calinski-Harabasz Index

# higher better

calinski_harabasz(dat_feature_sorted_scaled, feat_labels) # 20816

calinski_harabasz(dat_feature_sorted, feat_labels) # 77433
calinski_harabasz(dat_scaled, kmeans_labels) # 7474
calinski_harabasz(dat_scaled, agglom_labels) # 5147
calinski_harabasz(dat_scaled, dbscan_labels) # 2311


#### Davies-Bouldin Index

# lower is better

metrics.davies_bouldin_score(dat_feature_sorted_scaled, feat_labels) # 0.779

metrics.davies_bouldin_score(dat_feature_sorted, feat_labels) # 0.44
metrics.davies_bouldin_score(dat_scaled, kmeans_labels) # 1.186
metrics.davies_bouldin_score(dat_scaled, agglom_labels) # 1.47
metrics.davies_bouldin_score(dat_scaled, dbscan_labels) # 2.51


# 3 Article Reference Data

In [None]:
# Columns read from the '~'-delimited article reference extract
ref_usecols = [
    'group_article', 'brand', 'sub_brand', 'season_create', 'season_active',
    'graphic', 'gender', 'age_group', 'retail_intro_date_global',
    'retail_exit_date_global', 'material_technology', 'pictogram_composition',
    'price_band', 'gender_age', 'construction_type', 'length_mes_uom_dim',
    'uom_dim', 'height_mes_uom_dim', 'width_mes_uom_dim', 'article_descr',
    'drop_season', 'uom_vol', 'uom_wgt', 'product_fit', 'material_way_type',
    'outer_sole_main_material', 'inner_sole_main_material', 'main_material_lining',
    'main_material_upper', 'dimension_uov', 'dimension_uom', 'carried_over_from',
    'drop_date', 'retail_exit_tgt_season', 'product_franchise', 'age_group_descr',
    'brand_descr', 'sub_brand_descr', 'lifecylce_status_prod_descr', 'brand_asset_descr',
    'rmh_retail_class_descr', 'rmh_retail_department_descr', 'rmh_retail_sub_class_descr',
    'rmh_retail_sub_dept_descr', 'rmh_category_descr', 'rmh_gender_descr',
    'rmh_retail_section_descr', 'rmh_product_division_descr', 'rmh_product_type_descr',
    'spm_color_first_descr', 'spm_color_second_descr', 'spm_color_third_descr',
    'spm_color_fourth_descr', 'product_franchise_descr',
]

# Columns discarded again right after loading
ref_unused_cols = [
    'uom_dim', 'drop_season', 'uom_vol', 'uom_wgt', 'material_way_type', 'inner_sole_main_material',
    'main_material_upper', 'outer_sole_main_material', 'main_material_lining', 'carried_over_from', 'drop_date',
    'graphic', 'pictogram_composition', 'article_descr', 'lifecylce_status_prod_descr', 'brand_asset_descr',
    'rmh_retail_class_descr', 'rmh_retail_sub_class_descr', 'rmh_retail_sub_dept_descr', 'spm_color_first_descr',
    'spm_color_second_descr', 'spm_color_third_descr', 'spm_color_fourth_descr',
    'length_mes_uom_dim', 'height_mes_uom_dim', 'width_mes_uom_dim',
]

dat_ref = pd.read_csv('article_reference_data_y2016_18.csv',
                      delimiter='~',
                      low_memory=False,
                      usecols=ref_usecols)
dat_ref = dat_ref.drop(ref_unused_cols, axis=1)


In [None]:
# dat_ref.to_csv('dat_ref.csv')

dat_ref = pd.read_csv('dat_ref.csv', low_memory=False, index_col = 0) 
# dat_ref = dat_ref[dat_ref['retail_intro_date_global'] != '0']

In [None]:
# Convert retail_intro_date_global to a date for adding color to FW_18 vs. FW_17 plot

from datetime import datetime

# Vectorised parse of YYYYMMDD values:
#   * astype(str) so integer-typed values (as read back from CSV) parse too
#   * errors='coerce' turns unparseable entries such as '0' into NaT instead of raising
#   * the result keeps dat_ref's own index, so rows cannot be misaligned
# The dtype guard makes the cell safe to re-run once the column is already datetime.
intro_dates = dat_ref['retail_intro_date_global']
if not pd.api.types.is_datetime64_any_dtype(intro_dates):
    dat_ref['retail_intro_date_global'] = pd.to_datetime(intro_dates.astype(str),
                                                         format='%Y%m%d',
                                                         errors='coerce')



In [None]:
dat_ref.dtypes

In [None]:
# dat_ref.dtypes
# Number of distinct values per reference column (plain loop: a set comprehension
# built only for its print side effect also displays a stray {None}).
for x in dat_ref.columns:
    print(x, '-->', len(dat_ref[x].unique()))

In [None]:
# Distinct values of every reference column (plain loop instead of a
# set comprehension used only for its print side effect)
for x in dat_ref.columns:
    print(x, '-->', dat_ref[x].unique())

# pd.to_datetime(dat2['season_create'])

# 4 Cluster Prediction

    *On hold, determine if clustering helps first

# 5 Prior season baseline 

### Note: key article -- C77124

In [None]:
# Rows for the key article. After the per-article aggregation `article_number`
# is the index of `dat`, not a column, so handle both layouts.
key_article = 'C77124'
dat[dat['article_number'] == key_article] if 'article_number' in dat.columns else dat[dat.index == key_article]

### Note: what is low volume? What is high volume?

    - Small: < 100 units
    - Large: > 30000 units

## 5.0 Andras Embeddings/distances, Exasol

In [None]:
import pyexasol
from pyexasol import ExaConnection

# Id: comischr
# Pwd: Comiskey021

In [None]:
%%time

C = pyexasol.connect(dsn='10.143.86.51:8583', user='comischr', password='Comiskey021', compression=True, encryption=True)

# dfAndrasFeatures = C.export_to_pandas("""select * from READ_SCV.ARTICLE_EMBEDDINGS""")
# dfAndrasDistances = C.export_to_pandas("""select * from READ_SCV.ARTICLE_DISTANCE""")

# dfAndreasDistances = C.export_to_pandas("""select * from READ_SCV.PPC_SIM_ARTICLEPAIR_EUCL_V""")

dfAndreasDistances = C.export_to_pandas("""select * from READ_SCV.PPC_SIM_ARTICLEPAIR_EUCL_V where EUCL_SCORE < 7.8 / 2""")


In [None]:
# dfAndrasDistances.to_csv('dfAndrasDistances.csv')

# Cached pairwise distances, without the saved index column and the cosine score
dfAndrasDistances = (
    pd.read_csv('dfAndrasDistances.csv')
    .drop(['Unnamed: 0', 'COSINE'], axis=1)
)

## 5.1 Similarity-based Predictions

In [None]:
# retain season introduced column

dat0 = pd.read_csv('sales_and_features1_mini.csv',
                 delimiter = '~',
                  low_memory = False
                     )

### Updated data

In [None]:
%%time

dat0 = pd.read_csv('sales_and_features.csv',
                 delimiter = '~',
                  low_memory = False,
                   usecols=['sl1.consumer_order_date', 
                            'sl1.sales_organization', 
                            'sl1.country',
                            'sl1.article_number', 
                            'sl1.brand',
                            'sl1.gross_demand_quantity', 
                            'sl1.sold_qty', 
                            'sl1.net_qty',
                            'sl1.net_sales', 
                            'sl1.std_margin',
                            'sl1.return_qty', 
                            'sl1.article_promotion_main_category_group', 
                            'fw_or_ss', 
                            'season'
                           ]
                     )

In [None]:
# Strip the 'sl1.' prefix from the column names and parse the order date
dat0.rename(columns=lambda col: col.replace('sl1.', ''), inplace=True) # tidy column names
order_dates = pd.to_datetime(dat0['consumer_order_date'])
dat0['consumer_order_date'] = order_dates

In [None]:
# Cached copy of the tidied transactions (the save is commented out, so the file must already exist).
# NOTE(review): the clustering section reads a file with this same name using a two-row
# header -- confirm which dataset 'dat.csv' currently holds before running this cell.
# dat0.to_csv('dat.csv')

dat = pd.read_csv('dat.csv', low_memory=False, index_col = 0) 

In [None]:
dat['consumer_order_date'] = pd.to_datetime(dat['consumer_order_date'])

In [None]:
dat.head()

### Wrangling

In [None]:
# %%time
# dat0 = pd.read_csv('seasonal_sales_indicators.csv', delimiter = '~')
# dat.columns = [x.replace('t_eu_ecom_dit_dsf_transaction_t.', '') for x in dat.columns] # tidy column names

dat = dat0.copy()


In [None]:
# Keep non-zero demand rows and discard the columns not needed for the season totals
unused_cols = ['sales_organization', 'country', 'brand', 'sold_qty', 'net_qty', 'net_sales',
               'std_margin', 'return_qty', 'article_promotion_main_category_group',
               'fw_or_ss']

nonzero_demand = dat['gross_demand_quantity'] != 0
dat = dat[nonzero_demand].drop(unused_cols, axis=1)
dat.head()

In [None]:
# Total gross demand per (article, season). The quantity column is selected BEFORE
# summing, so no other column (e.g. the datetime order date) is summed and discarded.
dat = (
    dat.groupby(by = ['article_number', 'season'])['gross_demand_quantity']
    .sum() # aggregation step
    .reset_index()
)

# Pairwise article distances (saved index column dropped; COSINE is dropped after the first merge below)
dfAndrasDistances = pd.read_csv('dfAndrasDistances.csv')
dfAndrasDistances.drop('Unnamed: 0', axis = 1, inplace = True)

In [None]:
# Data:
    # (1) dat
    # (2) dfAndrasDistances

In [None]:
# FW17 demand per article, attached to the first article (A1) of every pair
d = dat.loc[dat['season'] == 'FW17', ['article_number', 'gross_demand_quantity']]

kahuna = dfAndrasDistances.merge(d, left_on='A1', right_on='article_number')
kahuna = kahuna.drop(['article_number', 'COSINE'], axis=1)
kahuna = kahuna.rename(columns={'gross_demand_quantity': 'A1_FW17'})

In [None]:
# Same FW17 demand table (`d` from the previous cell), now attached to the second article (A2)
kahuna = kahuna.merge(d, left_on='A2', right_on='article_number')
kahuna = kahuna.drop(['article_number'], axis=1)
kahuna = kahuna.rename(columns={'gross_demand_quantity': 'A2_FW17'})


In [None]:
# FW18 demand per article, attached to A1 (the value the predictions are scored against)
d = dat.loc[dat['season'] == 'FW18', ['article_number', 'gross_demand_quantity']]

kahuna = kahuna.merge(d, left_on='A1', right_on='article_number')
kahuna = kahuna.drop('article_number', axis=1)
kahuna = kahuna.rename(columns={'gross_demand_quantity': 'A1_FW18'})

In [None]:
# For every A1, the 10 pairs with the smallest EUCLIDEAN value. The grouped nsmallest
# result is indexed by (A1, original kahuna row label); reset_index() exposes that row
# label as the column 'level_1'.
# NOTE(review): the name `neighbors` shadows the `from sklearn import neighbors` import at the top of the file.
neighbors = (
    pd.DataFrame(kahuna.groupby('A1')['EUCLIDEAN'].nsmallest(10)).
    reset_index()
        )

# Inner-join kahuna to that list on (A1, row label), which keeps only the selected pairs,
# then drop the helper columns contributed by `neighbors`.
# NOTE(review): dropping 'EUCLIDEAN_y' implies kahuna's own distance column is left
# under a suffixed name (presumably 'EUCLIDEAN_x') -- confirm before using it downstream.
kahuna = (
    pd.merge(kahuna, neighbors, left_on = ['A1', kahuna.index], right_on = ['A1', 'level_1']).
    drop(['level_1', 'EUCLIDEAN_y'], axis = 1)
)

In [None]:
# Prediction for A1's FW18 demand = mean FW17 demand of the A2 articles paired with it
mean_neighbour_fw17 = kahuna.groupby('A1')['A2_FW17'].mean()
preds = pd.DataFrame(mean_neighbour_fw17).reset_index()
preds = preds.rename(columns={'A2_FW17': 'A1_FW18_pred'})

# Attach the prediction to every pair row of its A1
kahuna = kahuna.merge(preds, left_on='A1', right_on='A1')

In [None]:
# One row per A1 with its actuals and prediction. The explicit .copy() makes this an
# independent frame, so the columns added to it in later cells (Pct_Err, bin, ...)
# do not raise SettingWithCopyWarning.
kahuna_skinny = kahuna[['A1', 'A1_FW17', 'A1_FW18', 'A1_FW18_pred']].drop_duplicates().copy()

In [None]:
# Per-article results joined to the article reference attributes (A1 <-> group_article)
kahuna_ref = kahuna_skinny.merge(dat_ref, left_on='A1', right_on='group_article')
# drop(['retail_intro_date_global'], axis = 1).
kahuna_ref = kahuna_ref.drop_duplicates()


kahuna_ref.shape
kahuna_ref.head()

### MAPEs

In [None]:
kahuna_skinny[(kahuna_skinny['A1_FW17'] > 500) & (kahuna_skinny['A1_FW18'] < 25)].shape

In [None]:
# ---- MAPE ---- Mean Absolute Percent Error ----
def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true : array-like
        Actual values; each one is used as the denominator of its own error.
    y_pred : array-like or scalar
        Predictions, broadcast against ``y_true``.

    Returns
    -------
    pandas.Series
        A one-element Series holding ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    # relative error of each prediction, measured against the actual value
    abs_pct_err = np.abs((actual - predicted) / actual)
    return pd.Series(np.mean(abs_pct_err) * 100)

In [None]:
mape(kahuna_skinny['A1_FW18'], kahuna_skinny['A1_FW18_pred']) # prediction: mean of similars

mape(kahuna_skinny['A1_FW18'], kahuna_skinny['A1_FW17']) # predcition: last year



# ---- SANITY CHECK ----
mape(kahuna_skinny['A1_FW18'][0:4], kahuna_skinny['A1_FW18_pred'][0:4]) # 2336.97

(np.abs(567.9 - 34)/34 + np.abs(744.8 - 52)/52 + np.abs(276.8 - 7)/7 + np.abs(269.1 - 10)/10)/4 * 100 # 2336.97

In [None]:
actual_fw18 = kahuna_skinny['A1_FW18']

# ---- Prediction: mean of similars ----
similars_abs_err = (kahuna_skinny['A1_FW18_pred'] - actual_fw18).abs()
kahuna_skinny['Pct_Err'] = similars_abs_err / actual_fw18 * 100
kahuna_skinny['Pct_Err'].describe() # overall MAPE

# ---- Prediction: last year ----
last_year_abs_err = (kahuna_skinny['A1_FW17'] - actual_fw18).abs()
kahuna_skinny['Pct_Err2'] = last_year_abs_err / actual_fw18 * 100


In [None]:
# ---- wAPE ---- weighted absolute percent error

tot_gdq = kahuna_skinny['A1_FW18'].sum() 
tot_gdq # 819215

kahuna_skinny['pct_gdq'] = kahuna_skinny['A1_FW18']/tot_gdq 
kahuna_skinny['pct_gdq'].sum() # 1.0

np.average(kahuna_skinny['Pct_Err'], weights = kahuna_skinny['pct_gdq']) # 140.99
np.average(kahuna_skinny['Pct_Err'], weights = kahuna_skinny['A1_FW18']) # 140.99

np.sum((kahuna_skinny['A1_FW18']*kahuna_skinny['Pct_Err']))/tot_gdq # 140.99
np.sum(kahuna_skinny['Pct_Err']*kahuna_skinny['pct_gdq']) # 140.99

# ---- SANITY CHECK ---- 

# wAPE by hand: 1704.466
(34*(np.abs(567.9 - 34)/34) + 52*(np.abs(744.8 - 52)/52) + 7*(np.abs(276.8 - 7)/7) + 10*(np.abs(269.1 - 10)/10))/103*100

# wAPE : 1704.466
np.sum(kahuna_skinny['Pct_Err'][0:4]*kahuna_skinny['pct_gdq'][0:4])/np.sum(kahuna_skinny['pct_gdq'][0:4]) # 1704.466

np.average(kahuna_skinny['Pct_Err'][0:4], weights = kahuna_skinny['pct_gdq'][0:4]) # 1704.466

# Sanity intact : )


In [None]:
# ---- BENCHMARK ----
kahuna_skinny['benchmark0'] = 0
kahuna_skinny['benchmark'] = 2

mape(kahuna_skinny['A1_FW18'], kahuna_skinny['benchmark']) # 92.36

kahuna_skinny['Pct_Err00'] = np.abs(kahuna_skinny['benchmark0'] - kahuna_skinny['A1_FW18'])/kahuna_skinny['A1_FW18'] * 100



In [None]:
# ---- BENCHMARK!! ---- MAPE biased toward under-prediction

# MAPE of predicting one constant for every article, for constants 0.0, 0.1, ..., 9.9
benchmark_preds = np.arange(0, 10, 0.1) # predictions
benchmark_mape = [mape(kahuna_skinny['A1_FW18'], constant) for constant in benchmark_preds] # calculated MAPE

benchmark_curve = pd.DataFrame(benchmark_mape, index=benchmark_preds)
ax = benchmark_curve.plot() # plot

ax.set_title('Benchmark Models -- MAPE by Prediction Constant')
ax.set_ylabel('MAPE')
ax.set_xlabel('Prediction Constant')

In [None]:
# --- MAPE by bin ---

# Volume bins on the FW18 actuals: (0, 100], (100, 10000], (10000, 30000]
bins = [0, 100, 10000, 30000]
kahuna_skinny['bin'] = pd.cut(kahuna_skinny['A1_FW18'].values, bins)

errors_by_bin = kahuna_skinny.groupby('bin')

# ---- Similarity-based prediction, by bin ----
errors_by_bin['Pct_Err'].describe()[['count', 'mean']]

# ---- Prediction: last year ---- MAPE by bins
errors_by_bin['Pct_Err2'].describe()[['count', 'mean']]

In [None]:
plt.subplot(2, 1, 1)
kahuna_skinny[kahuna_skinny['Pct_Err'] < 3000]['Pct_Err'].hist(bins = 100)

plt.subplot(2, 1, 2)
kahuna_skinny[(kahuna_skinny['Pct_Err'] < 3000) & (kahuna_skinny['A1_FW17'] < 21)]['Pct_Err'].hist(bins = 100)

In [None]:
plt.scatter(np.log(kahuna_skinny['A1_FW17']), 
            np.log(kahuna_skinny['A1_FW18']), 
            # c = dbs01,
            alpha = 0.15)
# plt.xlabel('FW17 net_qty')
# plt.ylabel('Abs Pct Error')

kahuna_skinny[['A1_FW17', 'A1_FW18']].corr()

# plt.rcParams["figure.figsize"] = [16,16]

### Binned MAPES by RMHs

In [None]:
%whos DataFrame

In [None]:
kahuna_ref.shape
kahuna_ref.head()

In [None]:
dat0['consumer_order_date'] == dat0['consumer_order_date'].min()

In [None]:
# Volume bins and percent error computed from kahuna_ref's OWN columns:
#  * binning kahuna_skinny's values would misalign (or fail on a length mismatch),
#    because kahuna_ref is a separate merge result with its own rows and order
#  * kahuna_ref was built before Pct_Err was added to kahuna_skinny, so the error
#    column has to be derived here
bins = [0, 100, 10000, 30000]
kahuna_ref = kahuna_ref.assign(
    bin=pd.cut(kahuna_ref['A1_FW17'].values, bins),
    Pct_Err=np.abs(kahuna_ref['A1_FW18_pred'] - kahuna_ref['A1_FW18']) / kahuna_ref['A1_FW18'] * 100,
)

# ---- Similarity-based prediction, by bin ----
kahuna_ref.groupby('bin')['Pct_Err'].describe()[['count', 'mean']]

In [None]:
# dat_ref.dtypes
# Number of distinct values per column (plain loop instead of a set comprehension
# used only for its print side effect)
for x in kahuna_ref.columns:
    print(x, '-->', len(kahuna_ref[x].unique()))

In [None]:
kahuna_ref.groupby(['product_franchise_descr', 'bin'])['Pct_Err'].describe()[['count', 'mean']]

# Conclusion: no discernable pattern in MAPE by RMH categories

## 5.3 SARIMAX

In [None]:
%%time
dat0 = pd.read_csv('seasonal_sales_indicators.csv',
                 delimiter = '~')

# dat_ref = pd.read_csv('dat_ref.csv', low_memory=False, index_col = 0) 

dat = dat0.copy()

dat.columns = [x.replace('t_eu_ecom_dit_dsf_transaction_t.', '') for x in dat.columns] # tidy column names
dat = dat[dat['gross_demand_quantity'] != 0] 
dat['consumer_order_date'] = pd.to_datetime(dat['consumer_order_date'])

datter = dat.merge(dat_ref, left_on = 'article_number', right_on = 'group_article', how = 'left')
datter.shape

{print(x, '-->', len(dat[x].unique())) for x in dat.columns}

## 5.4 Time Series EDA

In [None]:
# Daily gross demand per article. The quantity column is selected before summing so
# that no other column is aggregated (and non-numeric columns cannot break the sum).
dat_g = dat.groupby(by = ['article_number', 'consumer_order_date']) # data grouped

dat_aggs = dat_g['gross_demand_quantity'].sum().to_frame() # data aggregates

dat_aggs.reset_index(level = 'article_number', inplace=True)

# Weekly totals per article; selecting the column keeps the group key out of the sum
dat_aggs = (
    dat_aggs.groupby('article_number')
    .resample('W')['gross_demand_quantity']
    .sum() # 'aggregate' to weekly sums by article
    .reset_index()
)

# Convert article_number to column, gross_demand_quantity to data
dat_aggs_pivoted = dat_aggs.pivot(index = 'consumer_order_date', 
                                  columns = 'article_number', 
                                  values = 'gross_demand_quantity')

# dat.fillna(value=0, inplace = True) # Define single transaction article std to be 0
plt.rcParams["figure.figsize"] = [18,6]

# Weekly series of the first 20 articles
dat_aggs_pivoted[dat_aggs_pivoted.columns[0:20]].plot()


# S21490 (arbitrarily) for learning time series EDA

datS21 = (dat_aggs[dat_aggs['article_number'] == 'S21490'].
          set_index('consumer_order_date').
          drop('article_number', axis = 1)
         )

weekly = datS21.resample('W').sum() # 'aggregate' to weekly sums

weekly.head()
weekly.plot()

# 6 Generalized Autoregressive Scoring models

## 6.0 Mean of mids (monthly)

In [None]:
# Transactions: parse the order date, keep non-zero demand, drop unused columns
dat = pd.read_csv('dat.csv', low_memory=False, index_col=0)
dat['consumer_order_date'] = pd.to_datetime(dat['consumer_order_date'])

unused_cols = ['sales_organization', 'country', 'brand', 'sold_qty', 'net_qty', 'net_sales',
               'std_margin', 'return_qty', 'article_promotion_main_category_group',
               'fw_or_ss']
dat = dat[dat['gross_demand_quantity'] != 0].drop(unused_cols, axis=1)

# Snapshot of the filtered transactions
dat0 = dat.copy()

dat_ref = pd.read_csv('dat_ref.csv', low_memory=False, index_col=0)
dat_ref.head()
# {print(x, '-->', dat_ref[x].unique()) for x in dat_ref.columns}

# ---- To filter to shoes ----
# only the article id and its product type are needed, one row per combination
dat_ref = dat_ref[['group_article', 'rmh_product_type_descr']].drop_duplicates()
dat_ref.shape

In [None]:
# Use dat_ref to filter dat to shoes only
# str.contains(..., na=False) treats a missing product type as "not shoes";
# `'SHOES' in x` raises TypeError when x is NaN. regex=False keeps it a plain substring test.
is_shoe = dat_ref['rmh_product_type_descr'].str.contains('SHOES', na=False, regex=False)
dat_ref = dat_ref[is_shoe] # filter to shoes

# Inner merge reduces dat to shoe articles; the helper columns are dropped again
dat = (
    pd.merge(dat, dat_ref, left_on='article_number', right_on='group_article')
    .drop(['group_article', 'rmh_product_type_descr'], axis = 1)
)

In [None]:
# gross_demand_quantity by article, order date
# The column is selected before summing so only the quantity is aggregated
# (summing every column first is wasteful and fails on non-numeric columns).
dat_g = (
    dat.groupby(by = ['article_number', 'consumer_order_date'])['gross_demand_quantity']
    .sum()
    .to_frame() # data aggregates
)
dat_g.reset_index(level = 'article_number', inplace=True)

In [None]:
# weekly article_number gross_demand_quantity
dat_g = dat_g.groupby('article_number').resample('W').sum() 
dat_g.reset_index(level = 'article_number', inplace=True)

In [None]:
# Add columns: year, week (both taken from each row's timestamp in the index)
dat_g['year'] = dat_g.index.map(lambda ts: ts.year)
dat_g['week'] = dat_g.index.map(lambda ts: ts.week)

In [None]:
# ----- article gross_demand_quantity BY YEAR -----
# 'week' is excluded so that only the demand quantity is summed per (article, year)
yearly_input = dat_g.drop('week', axis=1)
yearly_totals = yearly_input.groupby(['article_number', 'year']).sum()
dat_by_year = yearly_totals.reset_index()

In [None]:
# Mid-volume articles: yearly gross_demand_quantity strictly between 100 and 10000
yearly_qty = dat_by_year['gross_demand_quantity']
is_mid_volume = (yearly_qty > 100) & (yearly_qty < 10000)

# ----- 2017 articles: 100 < gross_demand_quantity < 10000
mids2017 = dat_by_year[(dat_by_year['year'] == 2017) & is_mid_volume]

# ----- 2018 articles: 100 < gross_demand_quantity < 10000
mids2018 = dat_by_year[(dat_by_year['year'] == 2018) & is_mid_volume]

In [None]:
%%time

dat_g_2017mids = dat_g[[a in mids2017['article_number'].unique() for a in dat_g['article_number']]]
dat_g_2018mids = dat_g[[a in mids2018['article_number'].unique() for a in dat_g['article_number']]]

# pd.cut(np.array(dat_by_year['gross_demand_quantity']), [0, 100, 10000, 30000]).describe()

In [None]:
# Mean weekly demand by week number for the 2017 and 2018 mid-volume articles.
# Selecting the quantity column before .mean() avoids averaging 'year' (which was
# dropped again anyway) and the string article_number column, on which a
# frame-wide mean fails in recent pandas.
A = (dat_g_2017mids.
     groupby('week')['gross_demand_quantity'].
     mean().
     to_frame('2017mean')
    )
B = (dat_g_2018mids.
     groupby('week')['gross_demand_quantity'].
     mean().
     to_frame('2018mean')
    )
pd.merge(A, B, left_index=True, right_index=True).plot() # AWESOME PLOT
plt.title('Shoes: 100 < gross_demand_quantity < 10000')

In [None]:
# ----- article gross_demand_quantity BY YEAR -----
dat_by_year = (dat_g.drop('week', axis = 1).
               groupby(['article_number', 'year']).
               sum().
               reset_index()
              )

# ----- 2017 articles: 100 < gross_demand_quantity < 10000 
mids2017 = dat_by_year[
    (dat_by_year['year'] == 2017) 
    & 
    (dat_by_year['gross_demand_quantity'] > 100)
    &
    (dat_by_year['gross_demand_quantity'] < 10000)]

# ----- 2018 articles: 100 < gross_demand_quantity < 10000 
mids2018 = dat_by_year[
    (dat_by_year['year'] == 2018) 
    & 
    (dat_by_year['gross_demand_quantity'] > 100)
    &
    (dat_by_year['gross_demand_quantity'] < 10000)]

In [None]:
# Calculate MAPE of 2017/2018 articles against their own mean

mids2017['gross_demand_quantity'].mean()
mape(mids2017['gross_demand_quantity'], mids2017['gross_demand_quantity'].mean()) # 2017

mape(mids2018['gross_demand_quantity'], mids2018['gross_demand_quantity'].mean()) # 2018

### Time series plotting 

In [None]:
# .pivot(...) ----- for time series plotting
# One column per article, one row per index entry of dat_g, values = gross_demand_quantity.
# NOTE(review): despite the variable name this pivots the full `dat_g`, not
# `dat_g_2017mids` -- confirm which frame is intended.
dat_g_2017mids_pivot = dat_g.pivot(columns = 'article_number', values = 'gross_demand_quantity')

In [None]:
dat_g_2017mids_pivot['monthly_avg'] = dat_g_2017mids_pivot.apply(np.mean, axis = 1)

In [None]:
# 2017 residuals by article
# Every column minus the row-wise 'monthly_avg' column, in one vectorised subtraction
# (the result is the same as assigning column by column in a Python loop; the
# 'monthly_avg' column itself becomes 0 wherever it is defined).
resids = dat_g_2017mids_pivot.sub(dat_g_2017mids_pivot['monthly_avg'], axis=0)

In [None]:
# Plot of residuals (random subset)
# NOTE: unseeded, so a different subset is drawn on every run.

from random import sample 

plt.rcParams["figure.figsize"] = [12,6]
# Draw at most 10 columns: sample() raises ValueError when asked for more items than exist.
_resid_cols = list(resids.columns)
resids[sample(_resid_cols, min(10, len(_resid_cols)))].plot()

In [None]:
# Quick look at dat_g (original note: mid-range articles for 2017, data through 2019)
dat_g.head()
dat_g['year'].value_counts()

# Mean demand across articles for every distinct index value of dat_g
t = (
    dat_g
    .groupby(dat_g.index)['gross_demand_quantity']
    .mean()
    .to_frame()
)
t.head()
t.shape
t.plot()

In [None]:
# Linear regression on 2017 mid range articles (not sure what use this is):
# OLS of the mean-demand series in `t` on its row position plus an intercept.

import statsmodels.api as sm
# Position regressor 1..len(t). Derived from the frame instead of the hard-coded
# range(1, 107), which raised a length-mismatch error whenever t did not have 106 rows.
t['X'] = range(1, len(t) + 1)
y = t['gross_demand_quantity']
t.head()

reg = sm.OLS(y, sm.add_constant(t['X'])).fit()
reg.summary()

## 6.1 Mean baseline + similarity-based adjustment 
    --> annual

In [None]:
# Load the order-level data; the first CSV column becomes the index.
dat = pd.read_csv('dat.csv', index_col = 0, low_memory=False) # Wall time: 1min 47s
# Parse the order date strings into datetimes.
dat['consumer_order_date'] = dat['consumer_order_date'].pipe(pd.to_datetime)

In [None]:
# Keep rows with non-zero gross demand and discard the columns not used in this section.
_unused_cols = [
    'sales_organization', 'country', 'brand', 'sold_qty', 'net_qty', 'net_sales',
    'std_margin', 'return_qty', 'article_promotion_main_category_group',
    'fw_or_ss',
]
dat = (
    dat[dat['gross_demand_quantity'] != 0]
    .drop(columns=_unused_cols)
)

In [None]:
# Snapshot of the filtered frame so later cells can restore it with `dat0.copy()`.
# NOTE(review): taken after 'net_qty' (and the other unused columns) were dropped above,
# while later cells that start from `dat0.copy()` aggregate 'net_qty' — those need dat0
# reloaded from dat.csv first; confirm the intended execution order.
dat0 = dat.copy()

In [None]:
# Article reference table; the first CSV column becomes the index.
dat_ref = pd.read_csv('dat_ref.csv', low_memory=False, index_col = 0) 
# Debug helper (disabled): print the distinct values of every column.
# {print(x, '-->', dat_ref[x].unique()) for x in dat_ref.columns}

In [None]:
# ---- reference information used to filter down to shoes ----
# Keep the article id and its product-type description, one row per distinct pair.
dat_ref = dat_ref.loc[:, ['group_article', 'rmh_product_type_descr']].drop_duplicates()
dat_ref.shape

In [None]:
# Use dat_ref to filter dat to shoes only.
# str.contains(..., na=False) counts a missing description as "not shoes"; the previous
# `'SHOES' in x` test raised TypeError on NaN descriptions.
_is_shoe = dat_ref['rmh_product_type_descr'].str.contains('SHOES', regex=False, na=False)
dat_ref = dat_ref[_is_shoe] # filter to shoes

# Join on the distinct article ids only: an article listed under several shoe
# descriptions would otherwise have its rows in dat duplicated by the merge,
# inflating every demand total computed afterwards.
_shoe_articles = dat_ref[['group_article']].drop_duplicates()
dat = pd.merge(dat, _shoe_articles, left_on='article_number', right_on='group_article') # inner merge reduces dat to shoes
dat.drop(['group_article'], axis = 1, inplace = True) 

In [None]:
# gross_demand_quantity per article and order date.
# The demand column is selected before summing, so the aggregation does not touch (or
# fail on) the frame's other columns. Result: indexed by consumer_order_date, with
# columns article_number and gross_demand_quantity.
dat_g = (
    dat.groupby(by = ['article_number', 'consumer_order_date'])[['gross_demand_quantity']].
    sum().
    reset_index(level = 'article_number')
)

In [None]:
%%time

# Add columns: year, week (ISO week number of the order date).
# Vectorised datetime accessors replace the per-row list comprehensions.
_order_dates = dat['consumer_order_date'].dt
dat['year'] = _order_dates.year
# Series.dt.isocalendar() exists from pandas 1.1; older versions expose .dt.week instead.
# NOTE: isocalendar().week has dtype UInt32 rather than int64.
dat['week'] = (_order_dates.isocalendar().week if hasattr(_order_dates, 'isocalendar')
               else _order_dates.week)

# Wall time: 1min 33s (measured with the earlier per-row list comprehensions)

In [None]:
# Total gross_demand_quantity per article and year
dat_annual = (
    dat
    .loc[:, ['article_number', 'year', 'gross_demand_quantity']]
    .groupby(['article_number', 'year'])
    .sum()
    .reset_index()
)

In [None]:
# Summary statistics of per-article annual demand, one row per year.
dat_annual.groupby('year')['gross_demand_quantity'].describe()

In [None]:
# ----------------------------------------------------------

In [None]:
%%time
# filter to 2017/2018 articles with mid-range demand; for MAPE, plots
# NOTE(review): in notebook order this cell runs before `year`/`week` are added to dat_g
# and before `mids2017` is rebuilt further down — confirm the intended execution order.
m = mids2017['article_number'].unique()
# Vectorised membership test; `a in m` scanned the whole array once per row of dat_g.
dat_g_2017mids = dat_g[dat_g['article_number'].isin(m)]

In [None]:
# Rows of dat_g belonging to the 2018 mid-range articles.
m = mids2018['article_number'].unique()
# Vectorised membership test; `a in m` scanned the whole array once per row of dat_g.
dat_g_2018mids = dat_g[dat_g['article_number'].isin(m)]

In [None]:
# Add columns: year, week (ISO week number), derived from the datetime index of dat_g.
# Vectorised index attributes replace the per-row list comprehensions.
_dates = dat_g.index
dat_g['year'] = _dates.year
# DatetimeIndex.isocalendar() exists from pandas 1.1; older versions expose .week instead.
# Plain arrays are assigned so no label alignment happens on the (repeated) date index.
if hasattr(_dates, 'isocalendar'):
    dat_g['week'] = _dates.isocalendar()['week'].astype('int64').to_numpy()
else:
    dat_g['week'] = _dates.week

In [None]:
# ----- annual gross_demand_quantity per article -----
dat_by_year = (
    dat_g
    .drop(columns='week')
    .groupby(['article_number', 'year'])
    .sum()
    .reset_index()
)

# Shared "mid-range" test: strictly more than 100 and strictly fewer than 10000 units.
_qty = dat_by_year['gross_demand_quantity']
_mid = (_qty > 100) & (_qty < 10000)

# ----- 2017 mid-range articles
mids2017 = dat_by_year[(dat_by_year['year'] == 2017) & _mid]

# ----- 2018 mid-range articles
mids2018 = dat_by_year[(dat_by_year['year'] == 2018) & _mid]

In [None]:
# Preview the per-article annual totals.
dat_by_year.head()

In [None]:
# ----- Plot 2017 & 2018 mid-range article means -----
# Weekly mean demand per cohort. The demand column is selected *before* averaging: a
# frame-wide .mean() would also try to average `article_number` and `year`, which raises
# on non-numeric columns in recent pandas and (if numeric) adds meaningless lines.
A = (dat_g_2017mids.
     groupby('week')['gross_demand_quantity'].
     mean().
     to_frame('2017mean')
    )
B = (dat_g_2018mids.
     groupby('week')['gross_demand_quantity'].
     mean().
     to_frame('2018mean')
    )
# Inner join on the week index: only weeks present in both cohorts are plotted.
pd.merge(A, B, left_index=True, right_index=True).plot()
plt.title('Shoes: 100 < gross_demand_quantity < 10000')

In [None]:
# MAPE of each year's mid-range articles against that year's own mean demand
# (`mape` is the notebook's helper, defined elsewhere).
_qty_2017 = mids2017['gross_demand_quantity']
_qty_2018 = mids2018['gross_demand_quantity']

_qty_2017.mean()
mape(_qty_2017, _qty_2017.mean()) # 2017

mape(_qty_2018, _qty_2018.mean()) # 2018

In [None]:
# ----------------------------------------------------------

In [None]:
# dfAndrasDistances.to_csv('dfAndrasDistances.csv')

# Reload the saved article distances, without the CSV's index column and the COSINE column.
dfAndrasDistances = (
    pd.read_csv('dfAndrasDistances.csv')
    .drop(columns=['Unnamed: 0', 'COSINE'])
)

In [None]:
# Preview the article distance table.
dfAndrasDistances.head()

In [None]:
# Consider article X:
    # (1) article X existed in 2017
        # (a) 2018 prediction is mean line
    
    # (2) article X did not exist in 2017
        # (a) Find n most similar articles of those that did exist in 2017
        # (b) See how they relate to mean line
        # (c) Take... average of those?


In [None]:
%whos DataFrame

In [None]:
# Preview dat before it is aggregated by season in the next cell.
dat.head()

In [None]:
# Similarity-based FW18 forecast: an article's FW18 demand is predicted as the mean FW17
# demand of its 10 closest articles (smallest EUCLIDEAN value in dfAndrasDistances).

# Demand per article and season. The demand column is selected before summing, so the
# aggregation does not touch (or fail on) the frame's other columns.
dat = (
    dat.groupby(by = ['article_number', 'season'])[['gross_demand_quantity']].
    sum().
    reset_index()
)

# Fresh copy of the distance table (columns used below: A1, A2, EUCLIDEAN, COSINE)
dfAndrasDistances = pd.read_csv('dfAndrasDistances.csv')
dfAndrasDistances.drop('Unnamed: 0', axis = 1, inplace = True)

# FW17 demand of the first article of each pair. Inner merge: pairs whose A1 has no
# FW17 row are dropped.
d = dat[dat['season'] == 'FW17'][['article_number', 'gross_demand_quantity']]
kahuna = (
    pd.merge(dfAndrasDistances, d, left_on = 'A1', right_on = 'article_number').
    drop(['article_number', 'COSINE'], axis = 1).
    rename(columns = {'gross_demand_quantity': 'A1_FW17'})
)

# -----------------------------------------------------------------

# FW17 demand of the second article of each pair
kahuna = (
    pd.merge(kahuna, d, left_on = 'A2', right_on = 'article_number').
    drop(['article_number'], axis = 1).
    rename(columns = {'gross_demand_quantity': 'A2_FW17'})
    )
# -----------------------------------------------------------------

# FW18 demand of the first article (the actual value the prediction is compared with)
d = dat[dat['season'] == 'FW18'][['article_number', 'gross_demand_quantity']]
kahuna = (
    pd.merge(kahuna, d, left_on = 'A1', right_on = 'article_number').
    drop('article_number', axis = 1).
    rename(columns = {'gross_demand_quantity': 'A1_FW18'})
         )
# -----------------------------------------------------------------

# For every A1, the 10 rows with the smallest EUCLIDEAN value. After reset_index(),
# 'level_1' holds the row label of each selected row in kahuna.
# NOTE: this rebinds `neighbors`, shadowing the `sklearn.neighbors` module imported at
# the top of the notebook.
neighbors = (
    pd.DataFrame(kahuna.groupby('A1')['EUCLIDEAN'].nsmallest(10)).
    reset_index()
        )

# Keep only those nearest-neighbour rows by joining on (A1, row label)
kahuna = (
    pd.merge(kahuna, neighbors, left_on = ['A1', kahuna.index], right_on = ['A1', 'level_1']).
    drop(['level_1', 'EUCLIDEAN_y'], axis = 1)
)
# -----------------------------------------------------------------

# Prediction for A1 = mean FW17 demand of its retained neighbours
preds = (
    pd.DataFrame(kahuna.groupby('A1')['A2_FW17'].mean()).
    reset_index().
    rename(columns = {'A2_FW17': 'A1_FW18_pred'})
)

kahuna = pd.merge(kahuna, preds, left_on='A1', right_on='A1')
# -----------------------------------------------------------------

# One row per A1: FW17 actual, FW18 actual, FW18 prediction
kahuna_skinny = kahuna[['A1', 'A1_FW17', 'A1_FW18', 'A1_FW18_pred']].drop_duplicates()

# Past Performance EDA

### Historical raw MAPE

In [None]:
%%time

# Stock figures per article and season, read from the stock CSV (three columns only),
# returned with a plain integer index.
carryover = pd.read_csv(
    'Stock left and of season.csv',
    usecols=['season', 'article_number', 'ecom_available_stock'],
    index_col = 0,
    low_memory=False,
).reset_index()

In [None]:
# dat0 = pd.read_csv('dat.csv', low_memory=False, index_col = 0) # Wall time: 1min 47s

In [None]:
# Start again from the `dat0` snapshot.
# NOTE(review): the next cell reads 'net_qty' and drops raw columns, so dat0 is presumably
# expected to be the unmodified contents of dat.csv here — confirm.
dat = dat0.copy()

In [None]:
# Net quantity sold per article and season, over rows with non-zero gross demand.
# The earlier datetime parsing and column drops were removed: only 'net_qty' and the two
# grouping keys survive the aggregation, so that work had no effect on the result (and
# the drop failed with KeyError whenever dat0 no longer held the raw columns).
dat = dat[dat['gross_demand_quantity'] != 0] 

# Select 'net_qty' before summing so the other columns are never aggregated.
dat = (
    dat.groupby(by = ['article_number', 'season'])[['net_qty']].
    sum().
    reset_index()
) # aggregate by season



In [None]:
# Attach the stock figure to each article-season row (inner join on both keys).
dat = dat.merge(carryover, on=['article_number', 'season'])

In [None]:
# Some article-seasons carry several ecom_available_stock values by accident; keep the smallest.
dat1 = (
    dat
    .groupby(['article_number', 'season'])['ecom_available_stock']
    .min()
    .reset_index()
)

# Zero leftover stock => possibly understocked (unless the leftovers were sold off)
dat_understock = dat1.loc[dat1['ecom_available_stock'] == 0]



In [None]:
# Re-attach the remaining columns of dat to the min-stock rows (left join on all shared
# column names), drop exact duplicate rows, then exclude season 'SS19'.
dat2 = dat1.merge(dat, how = 'left').drop_duplicates()
_not_ss19 = dat2['season'] != 'SS19'
dat2 = dat2[_not_ss19]

In [None]:
# Keep rows where both the sold quantity and the leftover stock are non-zero.
_has_sales = dat2['net_qty'] != 0
_has_leftover = dat2['ecom_available_stock'] != 0
dat2 = dat2[_has_sales & _has_leftover]


In [None]:
# pct error = [forecast - actual]/actual = (net_qty + stock - net_qty)/net_qty
# i.e. leftover stock as a percentage of the quantity sold.
# assign() returns a new frame, so no SettingWithCopyWarning is raised when dat2 is a
# filtered slice of an earlier frame.
dat2 = dat2.assign(percent_error = dat2['ecom_available_stock'] / dat2['net_qty'] * 100)

In [None]:
# Bucket articles by sold quantity into (0, 100], (100, 1000], (1000, 30000];
# quantities outside (0, 30000] get a missing bin.
bins = [0, 100, 1000, 30000]
# assign() returns a new frame, so no SettingWithCopyWarning is raised when dat2 is a
# filtered slice of an earlier frame.
dat2 = dat2.assign(bin = pd.cut(np.array(dat2['net_qty']), bins))

# ---- Similarity-based prediction, by bin ----
# kahuna_ref.groupby('bin')['Pct_Err'].describe()[['count', 'mean']]

In [None]:
# Count, mean and median ('50%') of percent_error for every season / quantity-bin pair.
d = (
    dat2
    .groupby(['season', 'bin'])['percent_error']
    .describe()
    .loc[:, ['count', 'mean', '50%']]
)

In [None]:
# --- Put the rows in the season order given by `order`, then by bin ---

order = {'SS17': 0, 'FW17': 1, 'SS18': 2, 'FW18': 3, 'SS19': 4}
# Temporary sort key taken from the 'season' index level (KeyError for a season not in `order`)
d['order_id'] = [order[season] for season in d.index.get_level_values('season')]

d = d.sort_values(by = ['order_id', 'bin']).drop(columns='order_id')

In [None]:
# Display the table rounded to whole numbers; `d` itself is not modified.
d.round().astype('int')

### Understock EDA

In [36]:
# Stock figures per article and season, read from the stock CSV (three columns only).
carryover = pd.read_csv(
    'Stock left and of season.csv',
    usecols=['season', 'article_number', 'ecom_available_stock'],
    index_col = 0,
    low_memory=False,
).reset_index()

# Some article-seasons carry several ecom_available_stock values by accident:
# collapse these pseudo-duplicates to one row holding the smallest value.
carryover = (
    carryover
    .groupby(['article_number', 'season'])['ecom_available_stock']
    .min()
    .reset_index()
)

In [4]:
# Reload the raw order data into dat0; the first CSV column becomes the index.
dat0 = pd.read_csv('dat.csv', low_memory=False, index_col = 0) # Wall time: 1min 47s

  mask |= (ar1 == a)


In [39]:
# Work on a copy so dat0 keeps the data as loaded.
dat = dat0.copy()

In [40]:
# aggregate to article-seasonal net_qty
# 'net_qty' is selected before summing, so the aggregation does not touch (or fail on)
# the frame's other columns.
dat = (
    dat.groupby(by = ['article_number', 'season'])[['net_qty']].
    sum().
    reset_index()
) # aggregate by season

In [42]:
# Attach the ecom_available_stock figure from `carryover` (inner join on article and season)
dat = dat.merge(carryover, on=['article_number', 'season'])

In [54]:
# Zero leftover stock => possibly understocked (unless the leftovers were sold off)
_sold_out = dat['ecom_available_stock'] == 0
dat_understock = dat.loc[_sold_out]

In [55]:
# Drop the current season ('SS19') and the "small potatoes" items (net_qty of 100 or less)
_past_season = dat_understock['season'] != 'SS19'
_big_enough = dat_understock['net_qty'] > 100
dat_understock = dat_understock[_past_season & _big_enough]

In [57]:
# Number of understocked articles per season and sold-quantity bin.
bins = [0, 100, 200, 500, 1000, 2000, 5000, 10000, 30000]
# assign() returns a new frame, so no SettingWithCopyWarning is raised even though
# dat_understock is a filtered slice of dat. Quantities above 30000 get a missing bin.
dat_understock = dat_understock.assign(bin = pd.cut(np.array(dat_understock['net_qty']), bins))

d = pd.DataFrame(dat_understock.groupby(['season', 'bin'])['article_number'].describe()['count'])

# --- Sort by season ---

order = {'SS17': 0, 'FW17': 1, 'SS18': 2, 'FW18': 3, 'SS19': 4}
# Temporary sort key looked up from the 'season' index level (KeyError for a season not in `order`)
d['order_id'] = [order[i] for i in d.reset_index()['season']]

d.sort_values(by = ['order_id', 'bin'], inplace=True)
d.drop('order_id', axis = 1, inplace=True)

In [None]:
# dat_understock[dat_understock['season'] == 'SS17']['net_qty'].hist(bins = 50)

# plt.rcParams["figure.figsize"] = [12,6]
# dat_understock[dat_understock['net_qty'] <2500].groupby('season')['net_qty'].hist(bins = 50)

In [58]:
# Show the merged article-season frame, the understocked subset, and the per-season/bin
# counts (all three render because ast_node_interactivity is set to "all").
dat.head()
dat_understock.head()
d

Unnamed: 0,article_number,season,net_qty,ecom_available_stock
0,1354,FW17,1502,4728.0
1,1354,FW18,1805,2499.0
2,1354,SS17,276,5498.0
3,1354,SS18,1254,2932.0
4,1354,SS19,429,4558.0


Unnamed: 0,article_number,season,net_qty,ecom_available_stock,bin
136,33620,SS18,326,0.0,"(200, 500]"
211,87609,FW17,547,0.0,"(500, 1000]"
213,87609,SS18,243,0.0,"(200, 500]"
240,288193,FW18,6883,0.0,"(5000, 10000]"
242,288193,SS18,2540,0.0,"(2000, 5000]"


Unnamed: 0_level_0,Unnamed: 1_level_0,count
season,bin,Unnamed: 2_level_1
SS17,"(100, 200]",91
SS17,"(200, 500]",118
SS17,"(500, 1000]",31
SS17,"(1000, 2000]",7
SS17,"(2000, 5000]",8
FW17,"(100, 200]",1977
FW17,"(200, 500]",2092
FW17,"(500, 1000]",787
FW17,"(1000, 2000]",281
FW17,"(2000, 5000]",63
