In [1]:
import numpy as np
import pandas as pd
import multiprocessing

import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

# init_notebook_mode()

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random

import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

import dask.dataframe as dd
#from chest import Chest

InteractiveShell.ast_node_interactivity = "all"
#InteractiveShell.ast_node_interactivity = "last"

# Magic function to make matplotlib inline
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

import dask.dataframe as ddf
import dask.array as da

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 800)

import scipy

import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.tsatools import detrend

import datetime as dt

from sklearn.tree import DecisionTreeClassifier

# Data

In [2]:
dat0 = pd.read_csv('data/ch4k_df.csv')
ref_dat0 = pd.read_csv('data/Article reference data.csv', low_memory = False, error_bad_lines = False, 
                       usecols = ['article_no', 'model_no', 'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 
                                  'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc',
                                  'brand_desc', 'bus_unit_desc', 'rmh_cat_desc'])

# Remove clearance transactions!!
dat0['clearance'] = dat0.clearance.fillna(0) 
dat0['net_qty'] = (1 - dat0.clearance)*dat0.net_qty

# Magic

In [3]:
dat = dat0.copy()

# dat = dat[(dat.net_qty > 0) & (dat.season.isin(['FW15', 'FW16', 'FW17', 'FW18', 'FW19']))].round()

dat = dat[(dat.net_qty > 0) & (dat.season.isin(['SS17', 'SS18', 'SS19']))] # 'SS15', 'SS16', 

In [4]:
SS19 = pd.read_excel('data/ecom_SS19.xlsx').dropna()
SS19.shape

SS19 = SS19[SS19.carryover_FW18 == 'NO'] # new articles only


(7604, 3)

In [5]:
# Within article price, cost, margin averages
dat[['price', 'cost', 'margin']] = (
    dat.
    groupby('article_number', group_keys=False)[['price', 'cost', 'margin']].
    transform(lambda x: np.nanmean(x).round(2))
             )


  """


In [6]:
# Remove articles where 'price = NA' in all transactions
dat = dat[~dat.price.isna()]


In [None]:
dat.buy_availability.isna().sum()
(dat.buy_availability == 0).sum()

In [7]:
dat = dat.sort_values(['article_number', 'country', 'year', 'week'])

# ---- Replace NAs and zeros (w/ no impact replacements) ----
dat['buy_availability'] = dat.buy_availability.fillna(1) # assume full availability
dat['buy_availability'] = np.where(dat.buy_availability == 0, 1, dat.buy_availability) # replace 0 

In [8]:
# ---- Smooth buy_availability ----
def roll(df):
    return df.rolling(window = 5, min_periods = 1, center = True).mean()

# dat['buy_availability3'] = dat.groupby(['article_number', 'country'])['buy_availability3'].apply(roll)
dat['buy_availability'] = dat.groupby(['article_number', 'country'])['buy_availability'].apply(roll)

# ---- Corrected net_qty ----
dat['corr_net_qty'] = (dat.net_qty / dat.buy_availability).round()

In [9]:
# corr_season_net_qty = pd.DataFrame(dat.groupby(['article_number', 'season'], group_keys = False)['corr_net_qty'].apply(sum)).reset_index()
# dat['avg_buy_availability'] = dat.groupby(['article_number', 'season'])['buy_availability3'].transform(lambda x: np.nanmean(x).round(3))

dat['corr_season_net_qty'] = dat.groupby(['article_number', 'season'])['corr_net_qty'].transform(lambda x: np.sum(x))
dat['corr_season_net_qty'] = np.where(dat.corr_season_net_qty > dat.season_net_qty, dat.corr_season_net_qty, dat.season_net_qty) # only if >

dat['avg_buy_availability'] = dat.groupby(['article_number', 'season'])['buy_availability'].transform(lambda x: np.nanmean(x).round(3))



In [10]:

dat = (
    dat.
    sort_values(['article_number', 'season']).
    drop_duplicates(subset = 'article_number')
)

In [11]:
dat = dat[['article_number', 'brand', 'season', 'season_net_qty', 'corr_season_net_qty', 'avg_buy_availability', 
           'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 'franchise', 
           'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc', 
           'price', 'cost', 'margin']]




In [12]:
dat = dat.set_index('article_number')


In [13]:
dat.shape
dat = dat[dat.corr_season_net_qty > 100]
dat.shape

(30615, 16)

(15842, 16)

In [14]:
# art = np.random.choice(dat.index, size = 500, replace = False)

# Just articles new in SS19, with season_net_qty > 200
art = set(SS19.article_number).intersection(set(dat.index))


In [None]:
pd.crosstab(index = dat.season, columns = 'count')

In [15]:
from itertools import permutations 
from itertools import combinations

d = {}
cats = ['sports_cat_desc', 'rmh_cat_desc', 'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc']

# Create tidy dataframe, add results to it

for a in art:
    p = dat.loc[a, 'price']
    
    net_qtys = pd.Series()
    
    dat_p = dat[(dat.price >= 0.9*p) & (dat.price <= 1.1*p)]
    
    for c in cats: 
        dat_a = dat_p[dat_p[c] == dat_p.loc[a, c]] # filter to that level of that category        
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    for c2 in combinations(cats, 2):
        dat_a = dat_p[(dat_p[c2[0]] == dat_p.loc[a, c2[0]]) &
                      (dat_p[c2[1]] == dat_p.loc[a, c2[1]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
        
    for c3 in combinations(cats, 3):
        dat_a = dat_p[(dat_p[c3[0]] == dat_p.loc[a, c3[0]]) &
                      (dat_p[c3[1]] == dat_p.loc[a, c3[1]]) &
                      (dat_p[c3[2]] == dat_p.loc[a, c3[2]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    for c4 in combinations(cats, 4):
        dat_a = dat_p[(dat_p[c4[0]] == dat_p.loc[a, c4[0]]) &
                      (dat_p[c4[1]] == dat_p.loc[a, c4[1]]) &
                      (dat_p[c4[2]] == dat_p.loc[a, c4[2]]) &
                      (dat_p[c4[3]] == dat_p.loc[a, c4[3]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    for c4 in combinations(cats, 5):
        dat_a = dat_p[(dat_p[c4[0]] == dat_p.loc[a, c4[0]]) &
                      (dat_p[c4[1]] == dat_p.loc[a, c4[1]]) &
                      (dat_p[c4[2]] == dat_p.loc[a, c4[2]]) &
                      (dat_p[c4[3]] == dat_p.loc[a, c4[3]]) &
                      (dat_p[c4[4]] == dat_p.loc[a, c4[4]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    for c4 in combinations(cats, 6):
        dat_a = dat_p[(dat_p[c4[0]] == dat_p.loc[a, c4[0]]) &
                      (dat_p[c4[1]] == dat_p.loc[a, c4[1]]) &
                      (dat_p[c4[2]] == dat_p.loc[a, c4[2]]) &
                      (dat_p[c4[3]] == dat_p.loc[a, c4[3]]) &
                      (dat_p[c4[4]] == dat_p.loc[a, c4[4]]) &
                      (dat_p[c4[5]] == dat_p.loc[a, c4[5]])]
        net_qtys = net_qtys.append(dat_a.corr_season_net_qty)
    
    d[a] = {
        'mean': net_qtys.mean(),
        'max': net_qtys.max(),
        '50': np.percentile(net_qtys, 50),
        '70': np.percentile(net_qtys, 70),
        '80': np.percentile(net_qtys, 80),
        '90': np.percentile(net_qtys, 90),
        'length': len(net_qtys)
           }
    if len(d) % 50 == 0:
        print(len(d))
        
        

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350


In [16]:
dat_art = dat[dat.index.isin(art)] # SS19-ers only


In [17]:
# Profit
def P(d, margin, cost, b):
    if d > b:    # CANNOT satisfy demand
        return b*margin
    
    elif d <= b: # CAN satisfy demand
        return d*margin - (b - d)*cost
    
    else:
        print('Error')

#### Save d

In [25]:
# save d
import json

json_i = json.dumps(d)
f = open("d.json", "w")
f.write(json_i)
f.close()

554979

In [None]:
with open('d.json') as json_file:
    d_test = json.load(json_file)

# d.keys() == d_test.keys()

# {k: d[k] for k in sorted(d.keys())[:2]}

# {k: d_test[k] for k in sorted(d_test.keys())[:2]}

In [133]:
pct_l = '80'
pct_u = '90'

preds = pd.DataFrame([(a, d[a][pct_l], d[a][pct_u], d[a]['length']) for a in d.keys()]).round()
preds.columns = ('article_number', 'pctl_l', 'pctl_u', 'length')
preds = preds.set_index('article_number')

In [98]:
dat_art.head()
preds.head()

Unnamed: 0_level_0,brand,season,season_net_qty,corr_season_net_qty,avg_buy_availability,art_desc,sports_cat_desc,rmh_cat_desc,franchise,gender_desc,age_group_desc,prod_grp_desc,prod_type_desc,price,cost,margin
article_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AC7438,adidas,SS19,614.0,954.0,0.696,BASELINE CMF INF,TENNIS,CORE SPORTS INSPIRED,GRAND COURT,UNISEX,INFANT,SHOES,SHOES - LOW (NON FOOTBALL),35.0,6.92,28.08
AC7548,adidas,SS19,124.0,124.0,0.998,UltraBOOST X S.,RUNNING,STELLA,ULTRA BOOST,WOMEN,ADULT,SHOES,SHOES - LOW (NON FOOTBALL),226.24,43.24,182.99
AC7980,adidas,SS19,229.0,233.0,0.985,TERREX SWIFT R2,OUTDOOR,OUTDOOR,NOT APPLICABLE,MEN,ADULT,SHOES,SHOES - LOW (NON FOOTBALL),120.1,24.03,96.07
AC8206,adidas,SS19,101.0,112.0,0.843,COPA 19.1 TF,FOOTBALL/SOCCER,FOOTBALL GENERIC,COPA,MEN,ADULT,SHOES,FOOTBALL SHOES (TURF),127.76,28.55,99.21
AH2131,adidas,SS19,282.0,321.0,0.957,SoleCourt M,TENNIS,TENNIS,NOT APPLICABLE,MEN,ADULT,SHOES,SHOES - LOW (NON FOOTBALL),159.35,41.46,117.89


Unnamed: 0_level_0,pctl_l,pctl_u,length
article_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DV2820,663.0,1990.0,9820
DX2461,358.0,1247.0,9467
DZ0053,303.0,1413.0,4980
DB3415,342.0,1665.0,13740
BB7216,273.0,1328.0,15197


In [134]:
dat_art0 = pd.merge(
    dat_art, preds,left_index = True, right_index = True
).merge(log_reg, left_index=True, right_index=True)

In [114]:
dat_art0.columns

Index(['brand', 'season', 'season_net_qty', 'corr_season_net_qty',
       'avg_buy_availability', 'art_desc', 'sports_cat_desc', 'rmh_cat_desc',
       'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc',
       'prod_type_desc', 'price', 'cost', 'margin', 'pctl_l', 'pctl_u',
       'length', '>500'],
      dtype='object')

In [135]:
# Logistic regression angle for <500 articles
dat_art0['pred'] = np.where(dat_art0['>500'], dat_art0['pctl_u'], dat_art0['pctl_l'])


In [50]:
dat_art2 = pd.merge(dat_art, preds, left_index=True, right_index=True,how = 'left')
dat_art2 = dat_art2[['season', 'season_net_qty', 'corr_season_net_qty', 'mean', 'pred_pct', 'length', 'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc', 'price', 'margin', 'cost']] # .sort_values('APE', ascending = False)
dat_art4 = pd.merge(dat_art2, SS19[['article_number', 'Ecom_FC_RMA']], left_index = True, right_on = 'article_number', how = 'left').drop('article_number', axis = 1)


In [51]:
#P(row['season_net_qty'], row['margin'], row['cost'], row['pred'])

dat_art4['eCom_profit'] = dat_art4.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['Ecom_FC_RMA']), axis=1)
dat_art4['DAA_profit_pct']  = dat_art4.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pred']), axis=1)

# dat_art4['DAA_profit_pred']  = dat_art4.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pred']), axis=1)
# dat_art4['DAA_profit_m']  = dat_art4.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['mean']), axis=1)



In [None]:
# Profit w/ logistic regression component
dat_art00 = pd.merge(dat_art0, SS19[['article_number', 'Ecom_FC_RMA']], left_index = True, right_on = 'article_number', how = 'left').drop('article_number', axis = 1)

dat_art00['DAA_profit_pctl']  = dat_art00.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pctl_u']), axis=1)
dat_art00['eCom_profit'] = dat_art00.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['Ecom_FC_RMA']), axis=1)
dat_art00['DAA_profit']  = dat_art00.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['pred']), axis=1)

dat_art00[['DAA_profit', 'eCom_profit', 'DAA_profit_pctl']].sum().round()

# Using pctl_l = 50 for predicted <500 unit articles:
# --> eCom makes ~18m, we lose ~5m
# DAA_profit         108593975.0
# eCom_profit        131718527.0
# DAA_profit_pctl    106169547.0

# Using pctl_l = 70 for predicted <500 unit articles:
# --> eCom makes ~18m, we... MAKE MONEY!!
# DAA_profit         119232632.0
# eCom_profit        131718527.0
# DAA_profit_pctl    106169547.0

In [148]:
# Logistic regression approach profitability BY corrected net_qty bins
    
l = np.array([0, 500, 1000, 2000, 5000, 10000, 75000])
dat_art00.loc[:,'corr_seas_net_qty_bins'] = pd.cut(dat_art00['corr_season_net_qty'], bins = l)

dat_art00.groupby('corr_seas_net_qty_bins')[['DAA_profit', 'eCom_profit', 'DAA_profit_pctl']].sum().round()/1000000

Unnamed: 0_level_0,DAA_profit,eCom_profit,DAA_profit_pctl
corr_seas_net_qty_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(0, 500]",10.309045,19.144084,-7.490642
"(500, 1000]",25.977298,24.947239,23.091974
"(1000, 2000]",27.74125,27.226527,30.655019
"(2000, 5000]",28.50987,26.968363,31.618651
"(5000, 10000]",8.543577,10.53431,9.340675
"(10000, 75000]",18.704977,22.898003,18.953869


In [52]:
pct
dat_art4[(dat_art4.corr_season_net_qty > 500)][['DAA_profit_pct', 'eCom_profit']].sum().round()

# '90' # empirical percentile used
# DAA_profit_pct     113660189.0
# eCom_profit        112574442.0
# dtype: float64



'90'

DAA_profit_pct    113660189.0
eCom_profit       112574442.0
dtype: float64

In [None]:
dat_art4['diff']  = dat_art4.apply(lambda row: row['pred_pct'] - row['corr_season_net_qty'], axis=1)


In [None]:
dat_art4.columns

In [None]:
l = np.array([0, 500, 1000, 2000, 5000, 10000, 75000])
dat_art4.loc[:,'prediction_bins'] = pd.cut(dat_art4['pred_pct'], bins = l)

# p = np.array([0, 50, 100, 150, 200, 300, 1000])
# dat_mini.loc[:,'price_bin'] = pd.cut(dat_mini.price, bins = p)

dat_art4.groupby('prediction_bins')[['DAA_profit_pct', 'eCom_profit']].apply(np.mean).round()

In [None]:
dat_art4.groupby('prediction_bins')[['price']].apply(np.mean).round()

In [None]:
d500['diff'].hist()

In [None]:
x = 'corr_season_net_qty'
y = 'pred_pct'

plt.rcParams["figure.figsize"] = [12,6]
plt.scatter(dat_art4[x], dat_art4[y], alpha = 0.1)

plt.xlabel(x)
plt.ylabel(y)

x = np.linspace(0, 20000,100)
y = x
plt.plot(x, y, '-b')


In [None]:
# Histogram

plt.hist(net_qtys, bins = [0, 100, 250, 500, 750, 1000, 1250, 1500, 1750, 2000, 5000], density = True)

pass;

In [None]:
# Empirical distributions

import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

ecdf = ECDF(net_qtys)

ecdf([100, 500, 1000, 3000])

np.percentile(net_qtys, 75)
net_qtys.describe()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer 


## decision tree example

In [None]:
cancer = load_breast_cancer() 

print("cancer.keys(): \n{}".format(cancer.keys()))
print()

print("Shape of cancer data: {}".format(cancer.data.shape))
print()

print("Sample counts per class:\n{}".format(
{n: v for n, v in zip(cancer.target_names, np.bincount(cancer.target))}))
print()

print("Feature names:\n{}".format(cancer.feature_names))



In [None]:
type(cancer.data)
type(cancer.target)

In [None]:


cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42
)

In [None]:
type(X_train)

In [None]:
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train))) 
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

## decision tree article forecasting

### load and wrangle

In [None]:
dat0 = pd.read_csv('data/ch4k_df.csv')
ref_dat0 = pd.read_csv('data/Article reference data.csv', low_memory = False, error_bad_lines = False, 
                       usecols = ['article_no', 'model_no', 'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 
                                  'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc',
                                  'brand_desc', 'bus_unit_desc', 'rmh_cat_desc'])

# Remove clearance transactions!!
dat0['clearance'] = dat0.clearance.fillna(0) 
dat0['net_qty'] = (1 - dat0.clearance)*dat0.net_qty

In [None]:
dat = dat0.copy()

# dat = dat[(dat.net_qty > 0) & (dat.season.isin(['FW15', 'FW16', 'FW17', 'FW18', 'FW19']))].round()

dat = dat[(dat.net_qty > 0) & (dat.season.isin(['SS17', 'SS18', 'SS19']))] # 'SS15', 'SS16', 

In [None]:
SS19 = pd.read_excel('data/ecom_SS19.xlsx').dropna()
SS19.shape

SS19 = SS19[SS19.carryover_FW18 == 'NO'] # new articles only


In [None]:
# Within article price, cost, margin averages
dat[['price', 'cost', 'margin']] = (
    dat.
    groupby('article_number', group_keys=False)[['price', 'cost', 'margin']].
    transform(lambda x: np.nanmean(x).round(2))
             )


In [None]:
# Remove articles where 'price = NA' in all transactions
dat = dat[~dat.price.isna()]


In [None]:
dat = dat.sort_values(['article_number', 'country', 'year', 'week'])

# ---- Replace NAs and zeros (w/ no impact replacements) ----
dat['buy_availability'] = dat.buy_availability.fillna(1) # assume full availability
dat['buy_availability'] = np.where(dat.buy_availability == 0, 1, dat.buy_availability) # replace 0 

In [None]:
# ---- Smooth buy_availability ----
def roll(df):
    return df.rolling(window = 5, min_periods = 1, center = True).mean()

# dat['buy_availability3'] = dat.groupby(['article_number', 'country'])['buy_availability3'].apply(roll)
dat['buy_availability'] = dat.groupby(['article_number', 'country'])['buy_availability'].apply(roll)

# ---- Corrected net_qty ----
dat['corr_net_qty'] = (dat.net_qty / dat.buy_availability).round()

In [None]:
# corr_season_net_qty = pd.DataFrame(dat.groupby(['article_number', 'season'], group_keys = False)['corr_net_qty'].apply(sum)).reset_index()
# dat['avg_buy_availability'] = dat.groupby(['article_number', 'season'])['buy_availability3'].transform(lambda x: np.nanmean(x).round(3))

dat['corr_season_net_qty'] = dat.groupby(['article_number', 'season'])['corr_net_qty'].transform(lambda x: np.sum(x))
dat['corr_season_net_qty'] = np.where(dat.corr_season_net_qty > dat.season_net_qty, dat.corr_season_net_qty, dat.season_net_qty) # only if >

dat['avg_buy_availability'] = dat.groupby(['article_number', 'season'])['buy_availability'].transform(lambda x: np.nanmean(x).round(3))



In [None]:

dat = (
    dat.
    sort_values(['article_number', 'season']).
    drop_duplicates(subset = 'article_number')
)

In [None]:
dat = dat[['article_number', 'brand', 'season', 'season_net_qty', 'corr_season_net_qty', 'avg_buy_availability', 
           'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 'franchise', 
           'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc', 
           'price', 'cost', 'margin']]




In [None]:
dat = dat.set_index('article_number')


In [None]:
dat.shape
dat = dat[dat.corr_season_net_qty > 100]
dat.shape

In [None]:
# art = np.random.choice(dat.index, size = 500, replace = False)

# Just articles new in SS19, with season_net_qty > 200
art = set(SS19.article_number).intersection(set(dat.index))


### fancy-ing

In [75]:
len(dat.index.unique())

15842

In [90]:
# mini = np.random.choice(dat.index, size = 5000, replace = False)
# dat_mini = dat[dat.index.isin(mini)].copy()

dat_tree = dat_art.copy()

In [91]:
# b = np.array([0, 500, 1000, 1500, 2000, 3000, 5000, 10000, 50000])
# dat_mini.loc[:,'net_qty_bin'] = pd.cut(dat_mini.season_net_qty, bins = b)

# p = np.array([0, 50, 100, 150, 200, 300, 1000])
# dat_mini.loc[:,'price_bin'] = pd.cut(dat_mini.price, bins = p)

dat_tree = dat[['corr_season_net_qty', 'sports_cat_desc',
       'rmh_cat_desc', 'franchise', 'gender_desc', 'age_group_desc',
       'prod_grp_desc', 'prod_type_desc', 'price']].copy()

In [92]:
dat_tree_dummies = pd.get_dummies(dat_tree)

X = dat_tree_dummies.loc[:, 'price':]
y = dat_tree_dummies.corr_season_net_qty > 500

In [87]:
### Logistic Regression

In [93]:
# Logistic regression on corr_net_qty > 500

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X.values, y, random_state=42)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

logreg.score(X_train, y_train) 
logreg.score(X_test, y_test) # 0.74

# Could use this in combination with HS method to identify < 500 articles, then use lower percentile



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

0.738910866088713

0.7376925018934612

In [105]:
dat_tree_dummies.head()

Unnamed: 0_level_0,corr_season_net_qty,price,sports_cat_desc_AMERICAN FOOTBALL,sports_cat_desc_BASEBALL,sports_cat_desc_BASKETBALL,sports_cat_desc_BOXING,sports_cat_desc_CASUAL,sports_cat_desc_CYCLING,sports_cat_desc_DANCE,sports_cat_desc_FIELD HOCKEY,sports_cat_desc_FOOTBALL/SOCCER,sports_cat_desc_GOLF,sports_cat_desc_HANDBALL,sports_cat_desc_MIXED MARTIAL ARTS,sports_cat_desc_MOTORSPORT,sports_cat_desc_NOT SPORTS SPECIFIC,sports_cat_desc_OLYMPIC SPORTS,sports_cat_desc_ORIGINALS,sports_cat_desc_OUTDOOR,sports_cat_desc_RUGBY,sports_cat_desc_RUNNING,sports_cat_desc_SKATEBOARDING,sports_cat_desc_SNOWBOARDING,sports_cat_desc_STUDIO,sports_cat_desc_SWIM,sports_cat_desc_TENNIS,sports_cat_desc_TRACK AND FIELD,sports_cat_desc_TRAINING,sports_cat_desc_VOLLEYBALL,sports_cat_desc_WALKING,sports_cat_desc_WEIGHTLIFTING,sports_cat_desc_X-COUNTRY SKIING,sports_cat_desc_YOGA,rmh_cat_desc_ACTION SPORTS,rmh_cat_desc_AMERICAN FOOTBALL,rmh_cat_desc_ATHLETICS,rmh_cat_desc_BASEBALL/SOFTBALL,rmh_cat_desc_BASKETBALL GENERIC,rmh_cat_desc_BASKETBALL LICENSED,rmh_cat_desc_CLASSICS,rmh_cat_desc_CORE CLASSICS,rmh_cat_desc_CORE RUNNING,rmh_cat_desc_CORE SPORTS,rmh_cat_desc_CORE SPORTS INSPIRED,rmh_cat_desc_CORE TRAINING,rmh_cat_desc_DUMMY,rmh_cat_desc_FIELD SPORTS,rmh_cat_desc_FOOTBALL GENERIC,rmh_cat_desc_FOOTBALL LICENSED,rmh_cat_desc_GOLF,rmh_cat_desc_INDOOR SPORTS,rmh_cat_desc_ORIGINALS,rmh_cat_desc_OTHERS,rmh_cat_desc_OUTDOOR,rmh_cat_desc_PODIUM SPORTS,rmh_cat_desc_PORSCHE,rmh_cat_desc_RUNNING,rmh_cat_desc_SLIDE,rmh_cat_desc_STATEMENT,rmh_cat_desc_STELLA,rmh_cat_desc_SWIM,rmh_cat_desc_TENNIS,rmh_cat_desc_TRAINING,rmh_cat_desc_WALKING,rmh_cat_desc_YEEZY,franchise_25/7 TEE,franchise_4ATHLTS,franchise_4CMTE,franchise_ADIZERO,franchise_ADVANTAGE,franchise_ALL ME BRA,franchise_ALPHABOUNCE,franchise_ALPHASKIN TIGHT,franchise_ALTA,franchise_AZTREK,franchise_BARRICADE,franchise_BECKENBAUER TRACKTOP,franchise_BECKENBAUERTRACKPANT,franchise_BELIEVE THIS TIGHT,franchise_CAMPUS,franchise_CLASSIC BACKPACK,franchise_CLASSIC LEATHER,franchise_CLASSIC NYLON,franchise_CLUB C,franchise_CONTINENTAL 80,franchise_CONTROL HOODIE,franchise_COPA,franchise_CRAZY BYW,franchise_CRAZY EXPLOSIVE,franchise_CRAZY LIGHT,franchise_DEERUPT,franchise_DONT REST BRA,franchise_DURAMO,franchise_ENERGY BOOST,franchise_EPIC SHORT M,franchise_EPIC SHORT W,franchise_EQT ADV,franchise_FAST FLEXWEAVE,franchise_FIREBIRD TRACKPANTS,franchise_FIREBIRD TRACKTOP,franchise_FLOATRIDE RUN,franchise_FREELIFT TEE,franchise_FREESTYLE,franchise_GAZELLE,franchise_GRAND COURT,franchise_H90 CAP,franchise_HARDEN,franchise_HEARTRACER JACKET,franchise_HOW WE DO TIGHT,franchise_INSTAPUMP FURY,franchise_KRAFT SHORT,franchise_LILLARD,franchise_LUX TIGHT W,franchise_MARATHON 20 SHORT,franchise_MOVE TEE M,franchise_NANO,franchise_NEMEZIZ,franchise_NITE JOGGER,franchise_NIZZA,franchise_NMD,franchise_NOT APPLICABLE,franchise_OWN THE RUN SHORT,franchise_OWN THE RUN TEE,franchise_OWN THE RUN TIGHT,franchise_OWN THE RUN WIND JKT,franchise_OZWEEGO,franchise_P.O.D.SYSTEM,franchise_PARK,franchise_PHX JACKET,franchise_POWER BACKPACK,franchise_PREDATOR,franchise_PURE BOOST,franchise_PUREMOVE BRA,franchise_RAPIDA,franchise_RIVALRY,franchise_ROSE,franchise_RUNFALCON,franchise_SAMBA,franchise_SATURDAY SHORT,franchise_SATURDAY TIGHT,franchise_SOLAR BOOST,franchise_SOLE FURY,franchise_STABIL,franchise_STAN SMITH,franchise_STRIKER PANT,franchise_STRONGER FOR IT BRA,franchise_SUPERNOVA JACKET,franchise_SUPERSTAR,franchise_SUPERSTAR TRACKPANTS,franchise_SUPERSTAR TRACKTOP,franchise_TIRO PANT,franchise_TRAINING OPS BACKPCK,franchise_TRAINING TEAMBAG,franchise_TREFOIL HOODIE,franchise_TREFOIL TEE,franchise_ULTRA BOOST,franchise_VFA TEAMBAG,franchise_VL COURT,franchise_WND JACKET,franchise_WORKOUT,franchise_X,franchise_Z.N.E HOODIE,franchise_ZX FLUX,franchise_ZX TORSION,gender_desc_MEN,gender_desc_UNISEX,gender_desc_WOMEN,age_group_desc_ADULT,age_group_desc_INFANT,age_group_desc_JUNIOR,age_group_desc_KIDS,prod_grp_desc_APPAREL ACCESSORIES,prod_grp_desc_BAGS,prod_grp_desc_BALLS,prod_grp_desc_FOOTWEAR ACCESSORIES,prod_grp_desc_HARDWARE ACCESSORIES,prod_grp_desc_HEADWEAR,prod_grp_desc_JACKETS,prod_grp_desc_JERSEYS,prod_grp_desc_OTHER APPAREL,prod_grp_desc_OTHER SHIRTS,prod_grp_desc_PANTS,prod_grp_desc_POLO SHIRTS,prod_grp_desc_PROTECTION GEAR,prod_grp_desc_SANDALS/SLIPPERS,prod_grp_desc_SHOES,prod_grp_desc_SHORTS,prod_grp_desc_SKIRTS / DRESSES,prod_grp_desc_SOCKS,prod_grp_desc_SUITS,prod_grp_desc_SWEATERS,prod_grp_desc_SWEATSHIRTS,prod_grp_desc_SWIMWEAR,prod_grp_desc_T-SHIRTS,prod_grp_desc_TIGHTS,prod_grp_desc_TOPS,prod_grp_desc_TRACK TOPS,prod_grp_desc_UNDERWEAR,prod_type_desc_ANKLE SOCKS,prod_type_desc_APPAREL OTHERS,prod_type_desc_ARM SLEEVES,prod_type_desc_BACKPACK,prod_type_desc_BALL (FOAM CORE),prod_type_desc_BALL (HAND-STITCHED),prod_type_desc_BALL (LAMINATED),prod_type_desc_BALL (MACHINE-STITCHED),prod_type_desc_BALL (THERMAL-BONDING),prod_type_desc_BALL (VULCANISED),prod_type_desc_BANDANA,prod_type_desc_BEANIE,prod_type_desc_BELT,prod_type_desc_BIB SHORT,prod_type_desc_BIKINI BOTTOM,prod_type_desc_BIKINI SET,prod_type_desc_BIKINI TOP,prod_type_desc_BODYWEAR-SUIT,prod_type_desc_BOTTLE,prod_type_desc_BOXER SWIMWEAR,prod_type_desc_BRIEFS,prod_type_desc_BUCKET,prod_type_desc_CAP,prod_type_desc_CASES,prod_type_desc_CREW SOCKS,prod_type_desc_CRIB SHOES,prod_type_desc_CROP,prod_type_desc_CROSS BODY BAG,prod_type_desc_DRESS,prod_type_desc_DUFFEL,prod_type_desc_DUFFEL WITH WHEELS,prod_type_desc_DUFFLE/GRIP,prod_type_desc_FITNESS EQUIPMENT,prod_type_desc_FOOTBALL SHOES (ARTIFICIAL GRASS),prod_type_desc_FOOTBALL SHOES (FIRM GROUND),prod_type_desc_FOOTBALL SHOES (INDOOR),prod_type_desc_FOOTBALL SHOES (SOFT GROUND),prod_type_desc_FOOTBALL SHOES (STREET),prod_type_desc_FOOTBALL SHOES (TURF),prod_type_desc_GLOVE (BALL GLOVE),prod_type_desc_GLOVES,prod_type_desc_GOALKEEPER GLOVES (FINGERSAVE),prod_type_desc_GOALKEEPER GLOVES (W/O FINGERSAVE),prod_type_desc_GOGGLES,prod_type_desc_GRAPHIC SWEATSHIRT (LONG SLEEVE),prod_type_desc_GRAPHIC TANK,prod_type_desc_GRAPHIC TEE (LONG SLEEVE),prod_type_desc_GRAPHIC TEE (SHORT SLEEVE),prod_type_desc_GRAPHIC TEE (SLEEVELESS),prod_type_desc_GYM SACK,prod_type_desc_HAT,prod_type_desc_HEADBAND,prod_type_desc_HIGH HEELS - MID,prod_type_desc_HOODED SWEAT,prod_type_desc_HOODED TRACK TOP,prod_type_desc_HOODED TRACKSUIT JACKET,prod_type_desc_JACKET,prod_type_desc_JACKET (DOWN),prod_type_desc_JACKET (FILLED HEAVYWEIGHT),prod_type_desc_JACKET (FILLED THIN),prod_type_desc_JACKET (MIDWEIGHT),prod_type_desc_JACKET (TECHNICAL FILLED),prod_type_desc_JACKET (TECHNICAL),prod_type_desc_JAMMER,prod_type_desc_JERSEY (LONG SLEEVE),prod_type_desc_JERSEY (SHORT SLEEVE),prod_type_desc_JERSEY (SLEEVELESS),prod_type_desc_KNEE SOCKS,prod_type_desc_LEOTARD,prod_type_desc_LOAFERS,prod_type_desc_MESSENGER BAG,prod_type_desc_MINIKIT (APP),prod_type_desc_NO SHOW SOCKS,prod_type_desc_ORGANIZER,prod_type_desc_OTHER ACCESSORIES (APP),prod_type_desc_OTHER ACCESSORIES (FW),prod_type_desc_OTHER ACCESSORIES (HW),prod_type_desc_OTHER BAG,prod_type_desc_OTHER HARDWARE,prod_type_desc_OTHER SUIT,prod_type_desc_OTHER TOP,prod_type_desc_OVERALL,prod_type_desc_PANTS (1/1),prod_type_desc_PANTS (3/4),prod_type_desc_PANTS (TECHNICAL),prod_type_desc_POLO SHIRT (SHORT SLEEVE),prod_type_desc_PULLOVER (LONG SLEEVE),prod_type_desc_PUMP,prod_type_desc_RASHGUARD,prod_type_desc_ROBE,prod_type_desc_SANDALS,prod_type_desc_SCARF,prod_type_desc_SEASACK,prod_type_desc_SHIN GUARD,prod_type_desc_SHIRT (LONG SLEEVE),prod_type_desc_SHIRT (SHORT SLEEVE),prod_type_desc_SHIRT (SLEEVELESS),prod_type_desc_SHOE BAG,prod_type_desc_SHOES - HIGH (NON-FOOTBALL),prod_type_desc_SHOES - LOW (NON FOOTBALL),prod_type_desc_SHOES - MID (NON-FOOTBALL),prod_type_desc_SHOPPER,prod_type_desc_SHORTS (1/2),prod_type_desc_SHORTS (1/4),prod_type_desc_SHOULDER BAG,prod_type_desc_SINGLET (SHIRT),prod_type_desc_SINGLET (SUIT),prod_type_desc_SKIRT,prod_type_desc_SLIDES,prod_type_desc_SOCK PREPACK,prod_type_desc_STICKS PLAYER,prod_type_desc_STOCKINGS,prod_type_desc_STUDS,prod_type_desc_SWEATSHIRT (LONG SLEEVE),prod_type_desc_SWEATSHIRT (SHORT SLEEVE),prod_type_desc_SWEATSHIRT (SLEEVELESS),prod_type_desc_SWIM SHORTS,prod_type_desc_SWIMMING CAP (APP),prod_type_desc_SWIMMING CAP (HW),prod_type_desc_SWIMSUIT,prod_type_desc_T-SHIRT (LONG SLEEVE),prod_type_desc_T-SHIRT (SHORT SLEEVE),prod_type_desc_T-SHIRT (SLEEVELESS),prod_type_desc_TANK,prod_type_desc_TIGHTS (1/1),prod_type_desc_TIGHTS (1/2),prod_type_desc_TIGHTS (1/4),prod_type_desc_TIGHTS (3/4),prod_type_desc_TOWEL,prod_type_desc_TRACK TOP,prod_type_desc_TRACKSUIT,prod_type_desc_TRACKSUIT JACKET,prod_type_desc_TRACKSUIT PANTS,prod_type_desc_TRAVEL BAG,prod_type_desc_TRAVEL BAG (WITH WHEELS),prod_type_desc_TRUNK,prod_type_desc_U-BOTTOM (1/1),prod_type_desc_U-BOTTOM (1/2),prod_type_desc_U-BRA,prod_type_desc_VEST,prod_type_desc_VISOR,prod_type_desc_VULCANIZED SHOES HIGH,prod_type_desc_VULCANIZED SHOES LOW,prod_type_desc_VULCANIZED SHOES MID,prod_type_desc_WAISTBAG,prod_type_desc_WALLET,prod_type_desc_WASH KIT,prod_type_desc_WINDBREAKER,prod_type_desc_WOOLIE,prod_type_desc_WORKOUT BRA - HIGH SUPPORT,prod_type_desc_WORKOUT BRA - LIGHT SUPPORT,prod_type_desc_WORKOUT BRA - MEDIUM SUPPORT,prod_type_desc_WRISTBAND,prod_type_desc_YOUTH/BABY JOGGER
article_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1,Unnamed: 282_level_1,Unnamed: 283_level_1,Unnamed: 284_level_1,Unnamed: 285_level_1,Unnamed: 286_level_1,Unnamed: 287_level_1,Unnamed: 288_level_1,Unnamed: 289_level_1,Unnamed: 290_level_1,Unnamed: 291_level_1,Unnamed: 292_level_1,Unnamed: 293_level_1,Unnamed: 294_level_1,Unnamed: 295_level_1,Unnamed: 296_level_1,Unnamed: 297_level_1,Unnamed: 298_level_1,Unnamed: 299_level_1,Unnamed: 300_level_1,Unnamed: 301_level_1,Unnamed: 302_level_1,Unnamed: 303_level_1,Unnamed: 304_level_1,Unnamed: 305_level_1,Unnamed: 306_level_1,Unnamed: 307_level_1,Unnamed: 308_level_1,Unnamed: 309_level_1,Unnamed: 310_level_1,Unnamed: 311_level_1,Unnamed: 312_level_1,Unnamed: 313_level_1,Unnamed: 314_level_1,Unnamed: 315_level_1,Unnamed: 316_level_1,Unnamed: 317_level_1,Unnamed: 318_level_1,Unnamed: 319_level_1,Unnamed: 320_level_1,Unnamed: 321_level_1,Unnamed: 322_level_1,Unnamed: 323_level_1,Unnamed: 324_level_1,Unnamed: 325_level_1,Unnamed: 326_level_1,Unnamed: 327_level_1,Unnamed: 328_level_1,Unnamed: 329_level_1,Unnamed: 330_level_1,Unnamed: 331_level_1,Unnamed: 332_level_1,Unnamed: 333_level_1,Unnamed: 334_level_1,Unnamed: 335_level_1,Unnamed: 336_level_1,Unnamed: 337_level_1,Unnamed: 338_level_1,Unnamed: 339_level_1,Unnamed: 340_level_1,Unnamed: 341_level_1,Unnamed: 342_level_1,Unnamed: 343_level_1,Unnamed: 344_level_1,Unnamed: 345_level_1,Unnamed: 346_level_1,Unnamed: 347_level_1,Unnamed: 348_level_1,Unnamed: 349_level_1,Unnamed: 350_level_1,Unnamed: 351_level_1,Unnamed: 352_level_1
15110,515.0,149.12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19000,1923.0,70.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19228,413.0,130.27,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19310,165.0,130.08,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19332,217.0,79.64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [112]:
log_reg = pd.DataFrame(data = logreg.predict(dat_tree_dummies.loc[:, 'price':]), index = dat_tree.index, columns = ['>500'])

# Optimal Buy

In [None]:
# ---- Plot -----
# a = np.random.choice(preds.article_number.unique(), size = 1, replace = False)[0]
aoi = 'G26535'
a = aoi

dat_a = preds[preds.article_number == a][['week', 'net_qty', 'corrected', 'y_hat']]
        

plt.rcParams["figure.figsize"] = [10,7]
dat_a.sort_values('week').set_index('week').plot(linewidth = 3)
dat_a.sort_values('week').set_index('week').round()

dat_a[['net_qty', 'corrected', 'y_hat']].apply(np.sum).round()

preds_season.reset_index()[preds_season.index == a]

dat_aoi = dat0[dat0.article_number == aoi].copy()

dat_aoi = pd.merge(
    pd.DataFrame(dat_aoi.groupby(['year', 'week'])['net_qty'].sum()).reset_index(),
    dat_aoi[['year', 'week']].drop_duplicates()
)

dat_aoi.year = [str(x) for x in dat_aoi.year]
dat_aoi.week = [str(x) for x in dat_aoi.week]
dat_aoi['date'] = [dt.datetime.strptime(x[0] + '-' + x[1] + '-1', "%Y-%W-%w") for x in zip(dat_aoi.year, dat_aoi.week)]

plt.rcParams["figure.figsize"] = [10,7]
dat_aoi[['date', 'net_qty']].set_index('date').plot(linewidth = 4)

# dat_aoi

In [None]:
dat[dat.index == art[0]][['season', 'season_net_qty', 'sports_cat_desc', 'rmh_cat_desc', 'franchise', 'price', 'margin', 'cost']]

minimize_EL(net_qtys, dat.loc[a, 'margin'], dat.loc[a, 'cost'])
net_qtys.describe()


In [None]:
from functools import partial
from scipy import optimize
from scipy import integrate
import scipy.stats as stats

In [None]:
# Loss --- demand, buy, margin, cost
def L(d, b, margin, cost):
    if d > b:    # CANNOT satisfy demand
        return (d - b)*margin
    
    elif d <= b: # CAN satisfy demand
        return (b - d)*cost
    
    else:
        print('Error')

# E[L | buy, article_mean, article_sd, article_margin, article_cost]
def EL(net_qtys, margin, cost, b):
    return sum([L(x, b, margin, cost) for x in net_qtys])/len(net_qtys)

def minimize_EL(net_qtys, margin, cost):
    p = partial(EL, net_qtys, margin, cost) # Make EL function of only one var
    mu = np.mean(net_qtys)
    buy_opt = optimize.minimize_scalar(p, bounds = (mu, mu + 2*np.std(net_qtys)))
    return int(buy_opt['x']) # optimal buy quantity

In [None]:
# Profit
def P(d, margin, cost, b):
    if d > b:    # CANNOT satisfy demand
        return b*margin
    
    elif d <= b: # CAN satisfy demand
        return d*margin - (b - d)*cost
    
    else:
        print('Error')

def EP(net_qtys, margin, cost, b):
    return -sum([P(x, margin, cost, b) for x in net_qtys])Yeah /len(net_qtys) # integral--by rectangle method, each w/ 1/n height

def maximize_EP(net_qtys, margin, cost):
    p = partial(EP, net_qtys, margin, cost) # Make EL function of only one var
    mu = np.mean(net_qtys)
    buy_opt = optimize.minimize_scalar(p, bounds = (mu, mu + 2*np.std(net_qtys)))
    return int(buy_opt['x']) # optimal buy quantity

In [None]:
# one offs

# # Minimize Loss
# l = partial(EL, net_qtys, 63, 30) # Make EL function of only one var
# [print(x, ':', round(l(x))) for x in range(0, 600, 50)]

# # Maximize Profit
# p = partial(EP, net_qtys, 63, 30)
# [print(x, ':', round(p(x))) for x in range(0, 600, 50)]


minimize_EL(net_qtys, 6, 2)
                          
maximize_EP(net_qtys, 6, 2)

# HUZZAH!!!!!


In [None]:


# --- Overbuy statistics ---
# preds_season['pct_overbuy'] = (preds_season.Opt_Ovb - preds_season.y_hat)/preds_season.y_hat*100
# b = np.array([0, 1000, 2000, 5000, 10000, 50000])
# preds_season['bins'] = pd.cut(preds_season.y_hat, bins = b)
# preds_season.groupby('bins')['pct_overbuy'].describe().round()


# Appendix

In [None]:
dat_art5[dat_art5.pred_90 < dat_art5.season_net_qty].sum()

# season_net_qty         227889.0
# pred_80                129573.0
# Ecom_FC_RMA            252412.0
# DAA_profit_pred_80    4725638.0
# eCom_profit           6433454.0
# diff                  2266284.0
# dtype: float64

# Comments: 
    # when we under-forecast... we DRAMATICALLY under-forecast
    # HUGE difference in profit

dat_art5[dat_art5.pred_90 >= dat_art5.season_net_qty].sum()

# season_net_qty         179877.0
# pred_80                354269.0
# Ecom_FC_RMA            310015.0
# DAA_profit_pred_80    4232213.0
# eCom_profit           4550809.0
# diff                  2017816.0
# dtype: float64

# Comments: 
    # in sum, we just slightly over-forecast
    # Tiny difference in profit
    

In [None]:
## buy_availability correction

dat_roll = dat0.copy()

dat_roll = dat_roll[(dat_roll.net_qty > 0) & (dat_roll.season.isin(['SS17', 'SS18', 'SS19']))] # No buy_availability before '17


dat_roll = dat_roll[['article_number', 'country', 'season', 'year', 'week', 'net_qty', 'season_net_qty', 'art_desc', 'sports_cat_desc',
           'rmh_cat_desc', 'franchise', 'gender_desc', 'age_group_desc',
           'prod_grp_desc', 'prod_type_desc', 'price', 'margin', 'cost', 'buy_availability']]

dat_roll = dat_roll[dat_roll.season_net_qty > 200]

# dat_roll[['price', 'margin', 'cost']] = dat_roll[['price', 'margin', 'cost']] # .fillna(0).astype('int')



dat_roll = (
    dat_roll[['article_number', 'country', 'season', 'year', 'week', 'net_qty', 'buy_availability']].
    sort_values(['article_number', 'country', 'year', 'week'])
) 


dat_roll['buy_availability2'] = dat_roll.buy_availability.fillna(1) # assume full availability
dat_roll['buy_availability2'] = np.where(dat_roll.buy_availability2 == 0, 1, dat_roll.buy_availability2) # replace 0 

pd.crosstab(index = [dat_roll.season, dat_roll.buy_availability2], columns = 'counts')

rollers = np.random.choice(dat_roll.article_number, size = 5, replace = False)

dat_roll2 = dat_roll[dat_roll.article_number.isin(rollers)].copy()
dat_roll2 = dat_roll2.sort_values(['article_number', 'country', 'season', 'year', 'week'])

dat_roll2

# Smooth buy_availability

# Function
def roll(df):
    return df.rolling(window = 5, min_periods = 1, center = True).mean()

dat_roll2['buy_availability3'] = dat_roll2.groupby(['article_number', 'country'])['buy_availability2'].apply(roll)
# dat_roll2['buy_availability3'] = np.where(dat_roll2.buy_availability3 > 0.15, dat_roll2.buy_availability3, 0.15)


# Corrected net_qty
dat_roll2['corr_net_qty'] = (dat_roll2.net_qty / dat_roll2.buy_availability3).round()


In [None]:
# for pct in ['pred', 'pred_50', 'pred_60', 'pred_70', 'pred_75', 'pred_80', 'pred_90', 'pred_98']:
#     col = 'DAA_profit' + '_' + pct
#     dat_art4[col]  = dat_art4.apply(lambda row: P(row['season_net_qty'], row['margin'], row['cost'], row[pct]), axis=1)
    
                          