In [None]:
# Modules -- 

import numpy as np
import pandas as pd
import multiprocessing

import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random

import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

# import dask.dataframe as dd

InteractiveShell.ast_node_interactivity = "all"
# InteractiveShell.ast_node_interactivity = "last"

# Magic function to make matplotlib inline
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

# import dask.dataframe as ddf
# import dask.array as da

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 800)

import scipy

import statsmodels.api as sm
# from statsmodels.tsa.seasonal import seasonal_decompose
# from statsmodels.tsa.tsatools import detrend

import datetime as dt

from sklearn.tree import DecisionTreeClassifier

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import json


# Data

In [None]:
dat0 = pd.read_csv('data/ch4k.csv')
ref_dat0 = pd.read_csv('data/Article reference data.csv', low_memory = False, error_bad_lines = False, 
                       usecols = ['article_no', 'model_no', 'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 
                                  'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc',
                                  'brand_desc', 'bus_unit_desc', 'rmh_cat_desc'])

# Remove clearance transactions!!
# dat0['clearance'] = dat0.clearance.fillna(0) 
# dat0['net_qty'] = (1 - dat0.clearance)*dat0.net_qty

# Magic

### Wrangling 

In [None]:
dat = dat0.copy()

dat = dat[(dat.net_qty > 0) & (dat.season.isin(['SS17', 'SS18', 'SS19']))]


In [None]:
SS19 = pd.read_excel('data/ecom_SS19.xlsx').dropna()
SS19.shape

SS19 = SS19[SS19.carryover_FW18 == 'NO'] # new articles only


In [None]:
# Within article price, cost, margin averages 
dat[['price', 'cost', 'margin']] = (
    dat.
    groupby('article_number', group_keys=False)[['price', 'cost', 'margin']].
    transform(lambda x: np.nanmean(x).round(2))
             )


# Remove articles where 'price = NA' in all transactions
dat = dat[~dat.price.isna()]


In [None]:
# Correct -- 

dat = dat.sort_values(['article_number', 'country', 'year', 'week'])

# ---- Replace NAs and zeros (w/ no impact replacements) ----
dat['buy_availability'] = dat.buy_availability.fillna(1) # assume full availability
dat['buy_availability'] = np.where(dat.buy_availability == 0, 1, dat.buy_availability) # replace 0


# ---- Smooth buy_availability ----
def roll(df):
    return df.rolling(window = 5, min_periods = 1, center = True).mean()

dat['buy_availability'] = dat.groupby(['article_number', 'country'])['buy_availability'].apply(roll)


# ---- Correct ----
dat['corr_net_qty'] = (dat.net_qty / dat.buy_availability).round()


# ---- Aggregate to season ----
dat['corr_season_net_qty'] = dat.groupby(['article_number', 'season'])['corr_net_qty'].transform(lambda x: np.sum(x))
dat['corr_season_net_qty'] = np.where(dat.corr_season_net_qty > dat.season_net_qty, dat.corr_season_net_qty, dat.season_net_qty) # only if >

dat['avg_buy_availability'] = dat.groupby(['article_number', 'season'])['buy_availability'].transform(lambda x: np.nanmean(x).round(3))



In [None]:
# Time series shenanigans: alternative correction -- 

# pd.DataFrame([5, 5,5,5,5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5]).ewm(com = 1).mean().plot()

# pd.DataFrame([5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5]).rolling(window = 5, center = True).mean().plot()

# from statsmodels.tsa.arima_model import ARIMA


# yw.loc[:,'YEAR'] = [str(x)[:-2] for x in yw.year]
# yw.loc[:,'WEEK'] = [str(x)[:-2] for x in yw.week]

# yw.loc[:,'date'] = [dt.datetime.strptime(x[0] + '-' + x[1] + '-1', "%Y-%W-%w") for x in zip(yw.YEAR, yw.WEEK)]

# arimax = sm.tsa.statespace.SARIMAX(yX.net_qty,
#                                    order = (1,0,1),
#                                    seasonal_order = (0,0,0,0),
#                                    exog = yX.drop('net_qty', axis = 1),
#                                    enforce_stationarity=False, 
#                                    enforce_invertibility=False,
#                                    missing = 'drop').fit()



In [None]:
# Keep first season only
dat = (dat.sort_values(['article_number', 'season']).drop_duplicates(subset = 'article_number'))


dat = dat[['article_number', 'brand', 'season', 'season_net_qty', 'corr_season_net_qty', 'avg_buy_availability', 
           'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 'franchise', 
           'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc', 
           'price', 'cost', 'margin']]




In [None]:
dat = dat.set_index('article_number')

dat = dat[dat.corr_season_net_qty > 100]


In [None]:
81.5*(40/37) + 81.5*0.09

In [None]:
95/57

In [None]:
89.5*1.1

In [None]:
# art = np.random.choice(dat.index, size = 500, replace = False)

# Just articles new in SS19, with season_net_qty > 200
art = set(SS19.article_number).intersection(set(dat.index))


### Hiearachical Sample (HS)

In [None]:
# hierarchical sampling -- 

from itertools import permutations 
from itertools import combinations

d = {}
cats = ['sports_cat_desc', 'rmh_cat_desc', 'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc']

# Create tidy dataframe, add results to it

for a in art:
    p = dat.loc[a, 'price']
    
    net_qtys = pd.Series()
    
    dat_p = dat[(dat.price >= 0.9*p) & (dat.price <= 1.1*p)]
    
    for c in cats: 
        dat_a = dat_p[dat_p[c] == dat_p.loc[a, c]] # filter to that level of that category        
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    for c2 in combinations(cats, 2):
        dat_a = dat_p[(dat_p[c2[0]] == dat_p.loc[a, c2[0]]) &
                      (dat_p[c2[1]] == dat_p.loc[a, c2[1]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
        
    for c3 in combinations(cats, 3):
        dat_a = dat_p[(dat_p[c3[0]] == dat_p.loc[a, c3[0]]) &
                      (dat_p[c3[1]] == dat_p.loc[a, c3[1]]) &
                      (dat_p[c3[2]] == dat_p.loc[a, c3[2]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    for c4 in combinations(cats, 4):
        dat_a = dat_p[(dat_p[c4[0]] == dat_p.loc[a, c4[0]]) &
                      (dat_p[c4[1]] == dat_p.loc[a, c4[1]]) &
                      (dat_p[c4[2]] == dat_p.loc[a, c4[2]]) &
                      (dat_p[c4[3]] == dat_p.loc[a, c4[3]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    for c4 in combinations(cats, 5):
        dat_a = dat_p[(dat_p[c4[0]] == dat_p.loc[a, c4[0]]) &
                      (dat_p[c4[1]] == dat_p.loc[a, c4[1]]) &
                      (dat_p[c4[2]] == dat_p.loc[a, c4[2]]) &
                      (dat_p[c4[3]] == dat_p.loc[a, c4[3]]) &
                      (dat_p[c4[4]] == dat_p.loc[a, c4[4]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    for c4 in combinations(cats, 6):
        dat_a = dat_p[(dat_p[c4[0]] == dat_p.loc[a, c4[0]]) &
                      (dat_p[c4[1]] == dat_p.loc[a, c4[1]]) &
                      (dat_p[c4[2]] == dat_p.loc[a, c4[2]]) &
                      (dat_p[c4[3]] == dat_p.loc[a, c4[3]]) &
                      (dat_p[c4[4]] == dat_p.loc[a, c4[4]]) &
                      (dat_p[c4[5]] == dat_p.loc[a, c4[5]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    d[a] = {
        'mean': net_qtys.mean(),
        'max': net_qtys.max(),
        '50': np.percentile(net_qtys, 50),
        '70': np.percentile(net_qtys, 70),
        '80': np.percentile(net_qtys, 80),
        '90': np.percentile(net_qtys, 90),
        'length': len(net_qtys)
           }
    if len(d) % 50 == 0:
        print(len(d))
        
        

In [None]:
dat_art = dat[dat.index.isin(art)] # SS19-ers only


In [None]:
# Profit fcn -- 
def P(d, margin, cost, b):
    if d > b:    # CANNOT satisfy demand
        return b*margin
    
    elif d <= b: # CAN satisfy demand
        return d*margin - (b - d)*cost
    
    else:
        print('Error')

### Save/load HSing results

In [None]:
# save d
# import json

# json_i = json.dumps(d)
# f = open("d.json", "w")
# f.write(json_i)
# f.close()

In [None]:
with open('data/d.json') as json_file:
    d = json.load(json_file)


# {k: d[k] for k in sorted(d.keys())[:2]}
# {k: d_test[k] for k in sorted(d_test.keys())[:2]}

In [None]:
{k: d[k] for k in sorted(d.keys())[:2]}

### Back to business

In [None]:
pct_7 = '70'
pct_8 = '80'
pct_9 = '90'

preds = pd.DataFrame([(a, d[a][pct_7], d[a][pct_8], d[a][pct_9], d[a]['length']) for a in d.keys()]).round()
preds.columns = ('article_number', 'pctl_7', 'pctl_8', 'pctl_9', 'length')
preds = preds.set_index('article_number')

In [None]:
preds.head()

In [None]:
# -- logistic regression add-on ----- 

# dat_art0 = pd.merge(dat_art, preds, left_index = True, right_index = True
#                    ).merge(log_reg, left_index=True, right_index=True)

# dat_art0.columns

# # Logistic regression angle for <500 articles
# dat_art0['pred'] = np.where(dat_art0['>500'], dat_art0['pctl_u'], dat_art0['pctl_l'])


In [None]:
dat_art.shape
preds.shape

In [None]:
dat_art2 = pd.merge(dat_art, preds, left_index=True, right_index=True, how = 'left') # dat_art + HierSamp preds

dat_art2 = dat_art2[['season', 'season_net_qty', 'corr_season_net_qty', 'art_desc', 'sports_cat_desc', 
                     'rmh_cat_desc', 'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc', 
                     'price', 'margin', 'cost', 'pctl_7', 'pctl_8', 'pctl_9', 'length']] # .sort_values('APE', ascending = False)

dat_art2.shape

In [None]:
dat_art4 = pd.merge(dat_art2, SS19[['article_number', 'Ecom_FC_RMA']], left_index = True, right_on = 'article_number', how = 'left').set_index('article_number')

In [None]:
dat_art5 = dat_art4.copy()


In [None]:

dat_art5[(dat_art5.corr_season_net_qty > dat_art5.season_net_qty) &
         (dat_art5.corr_season_net_qty > dat_art5.Ecom_FC_RMA) & 
         (dat_art5.corr_season_net_qty < dat_art5.pctl_9)][['season', 'season_net_qty', 'corr_season_net_qty', 'pctl_7', 'pctl_8', 'pctl_9', 'Ecom_FC_RMA', 'art_desc',
       'sports_cat_desc', 'franchise', 
       'prod_type_desc', 'price', 'margin',
       'cost', 'length']]#[:5]



In [None]:
 
dat_art5[dat_art5.index == 'EE3708']


In [None]:
# HSample -- just one article -- 
demo = {}

a = 'EE3708'
p = dat.loc[a, 'price']

net_qtys = pd.Series()

dat_p = dat[(dat.price >= 0.9*p) & (dat.price <= 1.1*p)]

for c in cats: 
    dat_a = dat_p[dat_p[c] == dat_p.loc[a, c]] # filter to that level of that category        
    net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    nq1 = net_qtys.copy()

for c2 in combinations(cats, 2):
    dat_a = dat_p[(dat_p[c2[0]] == dat_p.loc[a, c2[0]]) &
                  (dat_p[c2[1]] == dat_p.loc[a, c2[1]])]
    net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    nq2 = net_qtys.copy()


for c3 in combinations(cats, 3):
    dat_a = dat_p[(dat_p[c3[0]] == dat_p.loc[a, c3[0]]) &
                  (dat_p[c3[1]] == dat_p.loc[a, c3[1]]) &
                  (dat_p[c3[2]] == dat_p.loc[a, c3[2]])]
    net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    nq3 = net_qtys.copy()


for c4 in combinations(cats, 4):
    dat_a = dat_p[(dat_p[c4[0]] == dat_p.loc[a, c4[0]]) &
                  (dat_p[c4[1]] == dat_p.loc[a, c4[1]]) &
                  (dat_p[c4[2]] == dat_p.loc[a, c4[2]]) &
                  (dat_p[c4[3]] == dat_p.loc[a, c4[3]])]
    net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    nq4 = net_qtys.copy()

for c4 in combinations(cats, 5):
    dat_a = dat_p[(dat_p[c4[0]] == dat_p.loc[a, c4[0]]) &
                  (dat_p[c4[1]] == dat_p.loc[a, c4[1]]) &
                  (dat_p[c4[2]] == dat_p.loc[a, c4[2]]) &
                  (dat_p[c4[3]] == dat_p.loc[a, c4[3]]) &
                  (dat_p[c4[4]] == dat_p.loc[a, c4[4]])]
    net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    nq5 = net_qtys.copy()

for c4 in combinations(cats, 6):
    dat_a = dat_p[(dat_p[c4[0]] == dat_p.loc[a, c4[0]]) &
                  (dat_p[c4[1]] == dat_p.loc[a, c4[1]]) &
                  (dat_p[c4[2]] == dat_p.loc[a, c4[2]]) &
                  (dat_p[c4[3]] == dat_p.loc[a, c4[3]]) &
                  (dat_p[c4[4]] == dat_p.loc[a, c4[4]]) &
                  (dat_p[c4[5]] == dat_p.loc[a, c4[5]])]
    net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    nq6 = net_qtys.copy()

demo[a] = {
    'mean': net_qtys.mean(),
    'max': net_qtys.max(),
    '50': np.percentile(net_qtys, 50),
    '70': np.percentile(net_qtys, 70),
    '80': np.percentile(net_qtys, 80),
    '90': np.percentile(net_qtys, 90),
    'length': len(net_qtys)
       }

In [None]:
# combinatorics -- 

for i in combinations(cats, 2):
    print(i)

def factorial(x):
    fact = 1
    for i in range(1,x+1): 
        fact = fact * i
    return fact

def comb(n, r):
    return factorial(n)/(factorial(n-r)*factorial(r))

comb(7,4)


one = nq1.shape[0]
one

two = nq2.shape[0] - nq1.shape[0]
two

three = nq3.shape[0] - two - one
three

four = nq4.shape[0] - three - two - one
four

five = nq5.shape[0] - four - three - two - one
five

six = nq6.shape[0] -  five -  four - three - two - one
six

In [None]:
net_qtys.shape
type(net_qtys)

net_qtys.hist(bins = 40, figsize=(7, 5))
plt.xlabel('Net Quantity', size = 20)
plt.title('Pseudo-Empirical Distribution', size = 20)
# plt.axvline(x = 2326, color = 'red', linewidth = 5)

pass;

In [None]:
#P(row['season_net_qty'], row['margin'], row['cost'], row['pred'])

dat_art5['eCom_profit'] = dat_art5.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['Ecom_FC_RMA']), axis=1)
dat_art5['DAA_profit7']  = dat_art5.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pctl_7']), axis=1)
dat_art5['DAA_profit8']  = dat_art5.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pctl_8']), axis=1)
dat_art5['DAA_profit9']  = dat_art5.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pctl_9']), axis=1)


# dat_art4['DAA_profit_pred']  = dat_art4.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pred']), axis=1)
# dat_art4['DAA_profit_m']  = dat_art4.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['mean']), axis=1)

In [None]:
    
l = np.array([0, 2000, 5000, 10000, 75000])
dat_art5.loc[:,'net_bins'] = pd.cut(dat_art['season_net_qty'], bins = l)

dat_art5[['DAA_profit7', 'DAA_profit8', 'DAA_profit9', 'eCom_profit']].sum().round()
(dat_art5.groupby('net_bins')[['DAA_profit7', 'DAA_profit8', 'DAA_profit9', 'eCom_profit']].sum()/1000000).round(3)

### Profit w/ logistic regression add-on 

In [None]:
# Model fitting further down

dat_art00 = pd.merge(dat_art0, SS19[['article_number', 'Ecom_FC_RMA']], left_index = True, right_on = 'article_number', how = 'left').drop('article_number', axis = 1)

dat_art00['DAA_profit_pctl']  = dat_art00.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pctl_u']), axis=1)
dat_art00['eCom_profit'] = dat_art00.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['Ecom_FC_RMA']), axis=1)
dat_art00['DAA_profit']  = dat_art00.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pred']), axis=1)

dat_art00[['DAA_profit', 'eCom_profit', 'DAA_profit_pctl']].sum().round()

# Using pctl_l = 50 for predicted <500 unit articles:
# --> eCom makes ~18m, we lose ~5m
# DAA_profit         108593975.0
# eCom_profit        131718527.0
# DAA_profit_pctl    106169547.0

# Using pctl_l = 70 for predicted <500 unit articles:
# --> eCom makes ~18m, we... MAKE MONEY!!
# DAA_profit         119232632.0
# eCom_profit        131718527.0
# DAA_profit_pctl    106169547.0

In [None]:
# Logistic regression: profitability BY corrected net_qty bins
    
l = np.array([0, 500, 1000, 2000, 5000, 10000, 75000])
dat_art00.loc[:,'corr_seas_net_qty_bins'] = pd.cut(dat_art00['corr_season_net_qty'], bins = l)

dat_art00.groupby('corr_seas_net_qty_bins')[['DAA_profit', 'eCom_profit', 'DAA_profit_pctl']].sum().round()/1000000

In [None]:
# Profit summary stats

pct
dat_art4[(dat_art4.corr_season_net_qty > 500)][['DAA_profit_pct', 'eCom_profit']].sum().round()

# '90' # empirical percentile used
# DAA_profit_pct     113660189.0
# eCom_profit        112574442.0
# dtype: float64



In [None]:
# -- EDA --

dat_art4['diff']  = dat_art4.apply(lambda row: row['pred_pct'] - row['corr_season_net_qty'], axis=1)


In [None]:
# -- EDA --

l = np.array([0, 500, 1000, 2000, 5000, 10000, 75000])
dat_art4.loc[:,'prediction_bins'] = pd.cut(dat_art4['pred_pct'], bins = l)

# p = np.array([0, 50, 100, 150, 200, 300, 1000])
# dat_mini.loc[:,'price_bin'] = pd.cut(dat_mini.price, bins = p)

dat_art4.groupby('prediction_bins')[['DAA_profit_pct', 'eCom_profit']].apply(np.mean).round()

In [None]:
# -- EDA --

dat_art4.groupby('prediction_bins')[['price']].apply(np.mean).round()

In [None]:
# -- EDA --

x = 'corr_season_net_qty'
y = 'pred_pct'

plt.rcParams["figure.figsize"] = [12,6]
plt.scatter(dat_art4[x], dat_art4[y], alpha = 0.1)

plt.xlabel(x)
plt.ylabel(y)

x = np.linspace(0, 20000,100)
y = x
plt.plot(x, y, '-b')


In [None]:
# -- EDA --

plt.hist(net_qtys, bins = [0, 100, 250, 500, 750, 1000, 1250, 1500, 1750, 2000, 5000], density = True)

pass;

In [None]:
# Empirical distributions

import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

ecdf = ECDF(net_qtys)

ecdf([100, 500, 1000, 3000])

np.percentile(net_qtys, 75)
net_qtys.describe()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer 


## decision tree example

In [None]:
cancer = load_breast_cancer() 

print(cancer.keys())
print()

print(cancer.data.shape)
print()

print(
{n: v for n, v in zip(cancer.target_names, np.bincount(cancer.target))})
print()

print(cancer.feature_names)



In [None]:
type(cancer.data)
type(cancer.target)

In [None]:
cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42
)

In [None]:
type(X_train)

In [None]:
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

print("Accuracy on training set", tree.score(X_train, y_train))
print("Accuracy on test set:", tree.score(X_test, y_test).round(3))

## decision tree article forecasting

### load and wrangle

In [None]:
dat0 = pd.read_csv('data/ch4k.csv')
ref_dat0 = pd.read_csv('data/Article reference data.csv', low_memory = False, error_bad_lines = False, 
                       usecols = ['article_no', 'model_no', 'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 
                                  'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc',
                                  'brand_desc', 'bus_unit_desc', 'rmh_cat_desc'])

# Remove clearance transactions
# dat0['clearance'] = dat0.clearance.fillna(0) 
# dat0['net_qty'] = (1 - dat0.clearance)*dat0.net_qty

In [417]:
dat = dat0.copy()

In [418]:
dat = dat[(dat.net_qty > 0) & (dat.season.isin(['SS17', 'SS18', 'SS19']))]  

In [None]:
dat.shape
dat[dat.margin.isna()].shape

# Why missing SO MANY price/cost/margins?

In [None]:
# SS19 = pd.read_excel('data/ecom_SS19.xlsx').dropna()
# SS19.shape

# SS19 = SS19[SS19.carryover_FW18 == 'NO'] # new articles only


In [None]:
# Within article price, cost, margin averages
# dat[['price', 'cost', 'margin']] = (
#     dat.
#     groupby('article_number', group_keys=False)[['price', 'cost', 'margin']].
#     transform(lambda x: np.nanmean(x).round(2))
#              )


# Remove articles where 'price = NA' in all transactions
# dat = dat[~dat.price.isna()]


In [None]:
dat = dat.sort_values(['article_number', 'country', 'year', 'week'])

# # ---- Replace NAs and zeros (w/ no impact replacements) ----
# dat['buy_availability'] = dat.buy_availability.fillna(1) # assume full availability
# dat['buy_availability'] = np.where(dat.buy_availability == 0, 1, dat.buy_availability) # replace 0 

In [None]:
# ---- Smooth buy_availability ----
# def roll(df):
#     return df.rolling(window = 5, min_periods = 1, center = True).mean()

# # dat['buy_availability3'] = dat.groupby(['article_number', 'country'])['buy_availability3'].apply(roll)
# dat['buy_availability'] = dat.groupby(['article_number', 'country'])['buy_availability'].apply(roll)

# # ---- Corrected net_qty ----
# dat['corr_net_qty'] = (dat.net_qty / dat.buy_availability).round()

In [None]:
# corr_season_net_qty = pd.DataFrame(dat.groupby(['article_number', 'season'], group_keys = False)['corr_net_qty'].apply(sum)).reset_index()
# dat['avg_buy_availability'] = dat.groupby(['article_number', 'season'])['buy_availability3'].transform(lambda x: np.nanmean(x).round(3))

# dat['corr_season_net_qty'] = dat.groupby(['article_number', 'season'])['corr_net_qty'].transform(lambda x: np.sum(x))
# dat['corr_season_net_qty'] = np.where(dat.corr_season_net_qty > dat.season_net_qty, dat.corr_season_net_qty, dat.season_net_qty) # only if >

# dat['avg_buy_availability'] = dat.groupby(['article_number', 'season'])['buy_availability'].transform(lambda x: np.nanmean(x).round(3))




In [419]:
dat['season_net_qty'] = dat.groupby(['article_number', 'brand', 'season'])['net_qty'].transform(sum)

dat = dat[dat.season_net_qty > 100]

In [421]:
dat = (dat[['article_number', 'brand', 'season', 
       'season_net_qty', 'model_no', 'art_desc',
       'sports_cat_desc', 'rmh_cat_desc', 'franchise', 'gender_desc',
       'age_group_desc', 'prod_grp_desc', 'prod_type_desc']].
       sort_values(['article_number', 'season']).
       drop_duplicates(subset = 'article_number')
      )

In [434]:
dat.isna().sum()
dat = dat[~dat.art_desc.isna()]

article_number     0
brand              0
season             0
season_net_qty     0
model_no           0
art_desc           0
sports_cat_desc    0
rmh_cat_desc       0
franchise          0
gender_desc        0
age_group_desc     0
prod_grp_desc      0
prod_type_desc     0
dtype: int64

In [435]:
dat = dat.set_index('article_number')


In [None]:
# art = np.random.choice(dat.index, size = 500, replace = False)

# Just articles new in SS19, with season_net_qty > 200
art = set(SS19.article_number).intersection(set(dat.index))


In [446]:
dat.shape
len(dat.index.unique())

(30512, 12)

30512

### One-hot-encoding

In [464]:
from sklearn.model_selection import train_test_split

# subset = np.random.choice(dat.index, size = 15000, replace = False)
# dat_tree = dat[dat.index.isin(subset)].copy()

dat_tree = dat.copy()

dat_tree = dat_tree[['season_net_qty', 'sports_cat_desc',
                'rmh_cat_desc', 'franchise', 'gender_desc', 'age_group_desc',
                'prod_grp_desc', 'prod_type_desc']]

dat_tree = pd.get_dummies(dat_tree)

# p = np.array([0, 50, 100, 150, 200, 300, 1000])
# dat_mini.loc[:,'price_bin'] = pd.cut(dat_mini.price, bins = p)

In [472]:
dat_tree.shape

(30512, 381)

### Linear Regression

In [486]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X = dat_tree.drop('season_net_qty', axis = 1)
y = dat_tree.season_net_qty

X_train, X_test, y_train, y_test = train_test_split(X.values, y, random_state=42)

linreg = LinearRegression()
linreg.fit(X, y)

linreg.score(X, y).round(3)

r2_score(y, linreg.predict(X)).round(3)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

0.197

0.197

### Logistic Regression

In [473]:
# Logistic regression on corr_net_qty > 500
from sklearn.linear_model import LogisticRegression

X = dat_tree.drop('season_net_qty', axis = 1)
y = dat_tree.season_net_qty > 500

X_train, X_test, y_train, y_test = train_test_split(X.values, y, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

logreg.score(X_train, y_train).round(3)
logreg.score(X_test, y_test).round(3)

# Could use this in combination with HS method to identify < 500 articles, then use lower percentile



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

0.76

0.763

In [480]:
X_test.shape
pd.crosstab(logreg.predict(X_test), y_test)

(5300 + 519)/(7628)

(7628, 380)

season_net_qty,False,True
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
False,5300,1438
True,371,519


0.7628474042999476

In [None]:
log_reg = pd.DataFrame(data = logreg.predict(dat_tree_dummies.loc[:, 'price':]), index = dat_tree.index, columns = ['>500'])



### Decision Tree Regression

In [493]:
from sklearn.tree import DecisionTreeRegressor

X = dat_tree.drop('season_net_qty', axis = 1)
y = dat_tree.season_net_qty

X_train, X_test, y_train, y_test = train_test_split(X.values, y, random_state=42)

tree = DecisionTreeRegressor().fit(X_train, y_train)

tree.score(X_train, y_train)
tree.score(X_test, y_test)
tree.score(X, y)

DecisionTreeRegressor().fit(X, y).score(X, y)

0.25803565547226226

0.11769572881049484

0.23704042783307422

0.24334923896388805

### Decision Tree Classifier

In [494]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [499]:
X = dat_tree.drop('season_net_qty', axis = 1)
y = dat_tree.season_net_qty > 500

X_train, X_test, y_train, y_test = train_test_split(X.values, y, random_state=42)


In [500]:
tree = DecisionTreeClassifier(random_state = 0)
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [501]:
tree.score(X_test, y_test)

0.7597011012060828

In [502]:
y.mean()

0.2609465128474043

In [None]:
# tree_preds = pd.DataFrame(data = tree.predict(X), index = dat_tree_dummies.index, columns = ['>500'])

# Appendix

In [None]:
dat_art5[dat_art5.pred_90 < dat_art5.season_net_qty].sum()

# season_net_qty         227889.0
# pred_80                129573.0
# Ecom_FC_RMA            252412.0
# DAA_profit_pred_80    4725638.0
# eCom_profit           6433454.0
# diff                  2266284.0
# dtype: float64

# Comments: 
    # when we under-forecast... we DRAMATICALLY under-forecast
    # HUGE difference in profit

dat_art5[dat_art5.pred_90 >= dat_art5.season_net_qty].sum()

# season_net_qty         179877.0
# pred_80                354269.0
# Ecom_FC_RMA            310015.0
# DAA_profit_pred_80    4232213.0
# eCom_profit           4550809.0
# diff                  2017816.0
# dtype: float64

# Comments: 
    # in sum, we just slightly over-forecast
    # Tiny difference in profit
    

In [None]:
## buy_availability correction

dat_roll = dat0.copy()

dat_roll = dat_roll[(dat_roll.net_qty > 0) & (dat_roll.season.isin(['SS17', 'SS18', 'SS19']))] # No buy_availability before '17


dat_roll = dat_roll[['article_number', 'country', 'season', 'year', 'week', 'net_qty', 'season_net_qty', 'art_desc', 'sports_cat_desc',
           'rmh_cat_desc', 'franchise', 'gender_desc', 'age_group_desc',
           'prod_grp_desc', 'prod_type_desc', 'price', 'margin', 'cost', 'buy_availability']]

dat_roll = dat_roll[dat_roll.season_net_qty > 200]

# dat_roll[['price', 'margin', 'cost']] = dat_roll[['price', 'margin', 'cost']] # .fillna(0).astype('int')



dat_roll = (
    dat_roll[['article_number', 'country', 'season', 'year', 'week', 'net_qty', 'buy_availability']].
    sort_values(['article_number', 'country', 'year', 'week'])
) 


dat_roll['buy_availability2'] = dat_roll.buy_availability.fillna(1) # assume full availability
dat_roll['buy_availability2'] = np.where(dat_roll.buy_availability2 == 0, 1, dat_roll.buy_availability2) # replace 0 

pd.crosstab(index = [dat_roll.season, dat_roll.buy_availability2], columns = 'counts')

rollers = np.random.choice(dat_roll.article_number, size = 5, replace = False)

dat_roll2 = dat_roll[dat_roll.article_number.isin(rollers)].copy()
dat_roll2 = dat_roll2.sort_values(['article_number', 'country', 'season', 'year', 'week'])

dat_roll2

# Smooth buy_availability

# Function
def roll(df):
    return df.rolling(window = 5, min_periods = 1, center = True).mean()

dat_roll2['buy_availability3'] = dat_roll2.groupby(['article_number', 'country'])['buy_availability2'].apply(roll)
# dat_roll2['buy_availability3'] = np.where(dat_roll2.buy_availability3 > 0.15, dat_roll2.buy_availability3, 0.15)


# Corrected net_qty
dat_roll2['corr_net_qty'] = (dat_roll2.net_qty / dat_roll2.buy_availability3).round()


In [None]:
# for pct in ['pred', 'pred_50', 'pred_60', 'pred_70', 'pred_75', 'pred_80', 'pred_90', 'pred_98']:
#     col = 'DAA_profit' + '_' + pct
#     dat_art4[col]  = dat_art4.apply(lambda row: P(row['season_net_qty'], row['margin'], row['cost'], row[pct]), axis=1)
    
                          