# 0.0 Modules, etc.

In [None]:
import numpy as np
import pandas as pd
import multiprocessing

import scipy
from scipy import spatial
from scipy.spatial import cKDTree

import sklearn as sk
from sklearn import svm
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import clone
from sklearn.externals.six.moves import xrange

import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

# init_notebook_mode()

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random

import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

import dask.dataframe as dd
#from chest import Chest

# Display the value of every bare expression in a cell, not only the last one.
# Several later cells list multiple expressions and rely on this setting.
InteractiveShell.ast_node_interactivity = "all"
#InteractiveShell.ast_node_interactivity = "last"

# Magic function to make matplotlib inline
%matplotlib inline

# Request 'png' and 'retina' as the inline figure formats
%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

import dask.dataframe as ddf
import dask.array as da

In [None]:
# Show up to 500 columns / rows when a DataFrame is displayed.
# The fully qualified 'display.*' keys are used on purpose: the short patterns
# 'max_columns' / 'max_rows' also match the 'styler.render.*' options in newer
# pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [None]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
# SECURITY: the credential values in the commented call below have been
# redacted. Rotate any key that was committed here and load credentials from
# the environment or an untracked local config file instead.
# plotly.tools.set_credentials_file(username='<username>', api_key='<redacted>')
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

# Put cufflinks into offline mode
cf.go_offline()

# 1 Preprocessing

## 1.1 Wrangling

In [None]:
%%time
dat0 = pd.read_csv('seasonal_sales_indicators.csv',
                 delimiter = '~')

# len(dat0['article_number'].unique()) # 46573

In [None]:
dat = dat0.copy()

In [None]:
dat.columns = [x.replace('t_eu_ecom_dit_dsf_transaction_t.', '') for x in dat.columns] # tidy column names

In [None]:
dat = dat[dat['gross_demand_quantity'] != 0] 

In [None]:
# Earlier, wider selection (kept for reference):
# dat = dat[['article_number', 'gross_demand_quantity', 'sold_qty',
#            'net_qty', 'gross_sales_gross_disc_net_ret',
#            'gross_sales_net_disc_gross_ret', 'net_sales', 'total_markdown',
#            'temporary_markdown', 'permanent_markdown', 'employee_markdown',
#            'fraction_of_full_price', 'markdown', 'sale']]

# Columns carried forward; trailing notes list the statistics planned for each
kept_columns = [
    'article_number',
    'gross_demand_quantity',            # mean, sum, std
    'fraction_of_full_price',           # mean, std
    'markdown',                         # mean (INDICATOR)
    'sale',                             # mean (INDICATOR no/sale)
    'gross_sales_net_disc_gross_ret',
]
dat = dat[kept_columns]

In [None]:
# Per-row unit price: sales value divided by gross demand quantity.
# assign()/drop() return a new frame. `dat` is a filtered selection at this
# point, and writing a column into it in place triggers pandas'
# SettingWithCopyWarning.
unit_price = dat['gross_sales_net_disc_gross_ret'].divide(dat['gross_demand_quantity'])
dat = dat.assign(sales_price=unit_price).drop(columns='gross_sales_net_disc_gross_ret')

In [None]:
%%time

dat_grouped = dat.groupby(by = 'article_number')

In [None]:
dat = dat_grouped.agg(['mean', 'std']) # Need count

dat.drop([('markdown', 'std'),('sale', 'std')], axis = 1, inplace=True)

dat[('gross_demand_quantity','sum')] = dat_grouped['gross_demand_quantity'].sum()

dat.fillna(value=0, inplace = True) # Define single transaction article std to be 0
dat = dat.replace([np.inf, -np.inf, np.nan], 0)

## 1.2 Save/Load curated data

In [None]:
# Save/load tidied version

# dat.to_csv('dat.csv')

dat = pd.read_csv('dat.csv', low_memory=False, index_col = 0, header = [0,1]) # gotta encode multi-index

In [None]:
dat.head()

Datasets:

    (0) dat

## 1.3 Principal Component Analysis (for 2-D visualizations)

In [None]:
%%time

# SCALING: zero mean and unit variance 
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaler.fit(dat)
# dat_scaled = scaler.transform(dat)

# PRINCIPAL COMPONENT ANALYSIS
from sklearn.decomposition import PCA

pca = PCA(n_components=2) # keep the first two principal components of the data
pca.fit(dat)

# transform data onto the first two principal components
dat_pca = pca.transform(dat)

# EDA 
np.isnan(np.log(dat_pca[:,0])).sum()
(dat_pca[:,1] < 0).sum()
(pd.DataFrame(dat_pca)).describe()

# 2 Clustering (aka classification, segmentation)

## 2.0 Feature-based clustering

### 2.0.1 Sort

In [None]:
feature = 'sales_price'
stat = 'mean'

dat_feature_sorted = dat.sort_values([(feature, stat)], ascending=False) # Sort
sorted_values = dat_feature_sorted[(feature, stat)]

# Add cumulative sum of feature (computed once and reused below)
cumulative = sorted_values.cumsum()
dat_feature_sorted['cumulative_feature'] = cumulative

# Add cumulative percent of feature
total = sorted_values.sum()
dat_feature_sorted['cumulative_pct_feature'] = cumulative / total

# Add cumulative percent of articles: 1/n, 2/n, ..., 1.
# n is taken from the frame itself instead of a hard-coded article count, so
# the assignment keeps matching the frame length when the data changes.
n_articles = len(dat_feature_sorted)
t = pd.Series(range(1, n_articles + 1)) / n_articles
dat_feature_sorted['cumulative_pct_articles'] = t.values

In [None]:
# Add feature based cluster; i.e. top X are labelled non-basic
n_non_basic = 8483  # size of the top group; match ____ counts below

# The frame is sorted by the feature in descending order, so a positional cut
# selects the top group. The number of zeros follows from the frame length
# (it used to be hard-coded), which keeps the label vector the right size when
# the article count changes.
row_position = np.arange(len(dat_feature_sorted))
dat_feature_sorted['feat_based'] = (row_position < n_non_basic).astype(float)

dat_feature_sorted['feat_based'].value_counts() 

f = pd.DataFrame(dat_feature_sorted['feat_based'])

Datasets:

    (0) dat
    (1) dat_feature_sorted


### 2.0.2 Plot

In [None]:
# figure.figsize is read when a figure is created, so it must be set before
# the first plotting call; set afterwards it only affects later figures.
plt.rcParams["figure.figsize"] = [6,6]

# Cumulative share of the feature against cumulative share of articles
plt.plot(dat_feature_sorted['cumulative_pct_articles'], 
         dat_feature_sorted['cumulative_pct_feature'],
        linewidth = 3)

# plt.xlabel('Percent of Articles')
# plt.ylabel('Percent of Feature of Interest')

plt.title('Majority, from a Minority of Articles')

In [None]:
import plotly.graph_objs as go

# The data curve: cumulative share of the feature vs. cumulative share of
# articles. It was previously labelled '45 degree line', which it is not.
trace1 = go.Scatter(
    x = dat_feature_sorted['cumulative_pct_articles'],
    y = dat_feature_sorted['cumulative_pct_feature'],
    mode = 'lines',
    name = 'Cumulative share of feature'
)

# Actual 45 degree reference: the curve would follow it if every article
# contributed equally to the feature total.
trace2 = go.Scatter(
    x = [0, 1],
    y = [0, 1],
    mode = 'lines',
    name = '45 degree line'
)


data = [trace1, trace2]
iplot(data, filename='scatter-mode')

## 2.01 Scale

In [None]:
# SCALING: zero mean and unit variance 
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(dat)
scaled_values = scaler.transform(dat)

# Wrap the scaled array back into a frame with dat's row and column labels
dat_scaled = pd.DataFrame(scaled_values, index=dat.index, columns=dat.columns)

Datasets:

    (0) dat
    (1) dat_feature_sorted
    (2) dat_scaled

## 2.1 k-means

### 2.1.1 Model

In [None]:
%%time

# K-means clustering  --------------------  --------------------
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 4)
kmeans.fit(dat_scaled)

Kmeans = pd.Series(kmeans.predict(dat_scaled)) # Cluster assignments

# Cluster counts
print(Kmeans.value_counts())
print()

In [None]:
Kmeans = (Kmeans != 0)*1 # lone vector
Kmeans.value_counts()

In [None]:
k = pd.DataFrame({'kmeans': Kmeans.values}, index = dat_scaled.index)


### 2.1.2 Plots

In [None]:
import matplotlib as mpl

In [None]:
# Overwrite the current rcParams with matplotlib's default values
mpl.rcParams.update(mpl.rcParamsDefault)

In [None]:
# Pairwise scatter plots of four statistics, coloured by the k-means class
matrix_columns = [
    ('gross_demand_quantity', 'mean'),
    ('gross_demand_quantity', 'std'),
    ('sales_price', 'mean'),
    ('gross_demand_quantity', 'sum'),
]
pd.plotting.scatter_matrix(
    dat[matrix_columns],
    alpha=0.5,
    figsize=(10, 10),
    diagonal='kde',
    c=Kmeans,
)

In [None]:
# Series shared by both panels
mean_price = dat['sales_price']['mean']
demand_stats = dat['gross_demand_quantity']

# Upper panel: mean price against mean demand quantity
plt.subplot(2, 1, 1)
plt.scatter(demand_stats['mean'], mean_price, c=Kmeans, alpha=0.5)
plt.title('Sale Price vs. Gross Demand Quantity')
plt.ylabel('Sale Price')

# Lower panel: mean price against total demand quantity
plt.subplot(2, 1, 2)
plt.scatter(demand_stats['sum'], mean_price, c=Kmeans, alpha=0.5)
# plt.title('Sale Price vs. Gross Demand Qty (mean/sum)')
plt.ylabel('Sale Price')
plt.xlabel('GDQ Sum')

# plt.xlabel('Sales Price')
# plt.ylabel('log(sold_qty)')

# plt.colorbar()
# plt.rcParams["figure.figsize"] = [5,5]
# plt.title('K-means Classification')


### 2.1.3 Histograms

#### (a) gross_demand_quantity - basic

In [None]:
# Histogram: total gross demand quantity, basic items

# Set the size before the figure is created so it applies to this plot
plt.rcParams["figure.figsize"] = [12,12]

# Class labels come from `k` (indexed like dat); `dat` has no 'Kmeans' column.
# A single boolean selection replaces the chained dat[...][...][...] lookups.
total_demand = dat[('gross_demand_quantity', 'sum')]
is_basic = k['kmeans'] == 0
total_demand[is_basic & (total_demand < 10000)].hist(
    bins = 100, 
)

plt.xlabel('Total Gross Demand Quantity')
plt.ylabel('Quantity')

plt.title('Basic Articles')

#### (b) gross_demand_quantity - non-basic

In [None]:
# Histogram: total gross_demand_quantity, non-basic items

# Set the size before the figure is created so it applies to this plot
plt.rcParams["figure.figsize"] = [12,12]

# Class labels come from `k` (indexed like dat); `dat` has no 'Kmeans' column.
total_demand = dat[('gross_demand_quantity', 'sum')]
is_non_basic = k['kmeans'] != 0
total_demand[is_non_basic & (total_demand < 5000)].hist(bins = 50)

plt.xlabel('Total Gross Demand Quantity')
plt.ylabel('Quantity')

plt.title('Non-basic Articles')


#### (c) sales_price - basic

In [None]:
# Histogram: sales_price, basic items

# Set the size before the figure is created so it applies to this plot
plt.rcParams["figure.figsize"] = [12,12]

# Class labels come from `k` (indexed like dat); `dat` has no 'Kmeans' column.
mean_price = dat[('sales_price', 'mean')]
is_basic = k['kmeans'] == 0
mean_price[is_basic & (mean_price < 200) & (mean_price > 0)].hist(bins = 100)

plt.xlabel('Sale Price')
plt.ylabel('Quantity')

plt.title('Basic Articles')


#### (d) sales_price - non-basic

In [None]:
# Histogram: sales_price, non-basic items

# Set the size before the figure is created so it applies to this plot
plt.rcParams["figure.figsize"] = [12,12]

# Class labels come from `k` (indexed like dat); `dat` has no 'Kmeans' column.
mean_price = dat[('sales_price', 'mean')]
is_non_basic = k['kmeans'] != 0
mean_price[is_non_basic & (mean_price < 200)].hist(bins = 100)

plt.xlabel('Sale Price')
plt.ylabel('Quantity')

plt.title('Non-basic Articles')

## 2.2 Agglomerative Clustering

In [None]:
dat_scaled.head()

In [None]:
%%time

# Agglomerative Clustering -------------
from sklearn.cluster import AgglomerativeClustering

agg = AgglomerativeClustering(n_clusters = 6)
agglom = pd.Series(agg.fit_predict(dat_scaled))

In [None]:
# # Reclassify: majority/non as 0/1 

agglom.value_counts()

In [None]:
agglom = (agglom != 0)*1 # lone vector
agglom.value_counts()

In [None]:
a = pd.DataFrame({'agglom': agglom.values}, index = dat_scaled.index)

In [None]:
# Scaled series shared by both panels
scaled_price = dat_scaled['sales_price']['mean']
scaled_demand = dat_scaled['gross_demand_quantity']

# Upper panel: scaled mean price against scaled mean demand quantity
plt.subplot(2, 1, 1)
plt.scatter(scaled_demand['mean'], scaled_price, c=agglom, alpha=0.25)
plt.title('Agglomerative Classes: Sale Price vs. Gross Demand Qty (mean/sum)')
plt.ylabel('Sale Price')

# Lower panel: scaled mean price against scaled total demand quantity
plt.subplot(2, 1, 2)
plt.scatter(scaled_demand['sum'], scaled_price, c=agglom, alpha=0.25)
plt.ylabel('Sale Price')
plt.xlabel('GDQ Sum')


## 2.3 DBSCAN 

'density based spatial clustering of applications with noise'

In [None]:
%%time

from sklearn.cluster import DBSCAN

dbscan = DBSCAN() # 3min 36s
dbs = pd.Series(dbscan.fit_predict(dat_scaled))

In [None]:
dbs.value_counts()

In [None]:
# # Reclassify: majority/non as 0/1 
dbs01 = (dbs != -1)*1 # lone vector
dbs01.value_counts()

In [None]:
dbs = pd.DataFrame({'dbs': dbs01.values}, index = dat_scaled.index)

In [None]:
# Set the size before the figure is created so it applies to this plot
plt.rcParams["figure.figsize"] = [16,16]

# NOTE(review): 'net_sales' and 'sold_qty' are not among the columns kept by
# the wrangling cells earlier in this notebook. Confirm the loaded dat.csv
# carries them, otherwise these lookups raise KeyError.
plt.scatter(np.log(dat['net_sales']['sum']), 
            np.log(dat['sold_qty']['sum']), 
            c = dbs01,
            alpha = 0.15)
plt.xlabel('log(net_sales)')
plt.ylabel('log(sold_qty)')
plt.colorbar()
plt.title('DBSCAN Classification')

## 2.4 Evaluation

### 2.4.1 Cross methods comparison

In [None]:
# Line the three labelings up on the article index
k_a = pd.merge(k, a, left_index = True, right_index = True)

f_k_a = pd.merge(f, k_a, left_index=True, right_index=True)

# fka_dbs = pd.merge(f_k_a, dbs, left_index=True, right_index=True)

f_k_a.head()

f_k_a.groupby(['feat_based', 'kmeans', 'agglom']).size()

# Fraction of articles on which all three labelings agree. Computed from the
# merged table instead of typed-in counts, so it follows the current run.
all_agree = (f_k_a['feat_based'] == f_k_a['kmeans']) & (f_k_a['kmeans'] == f_k_a['agglom'])
all_agree.mean()

In [None]:
# One row per article with its agglomerative, k-means and DBSCAN class
table = pd.DataFrame(data = {'Ag': agglom, 'Km': Kmeans, 'DB': dbs01})
table.groupby(['Ag', 'Km', 'DB']).size()

# kmeans-agglomerative agreement: fraction of rows with equal labels, computed
# from the table rather than from hard-coded counts of a previous run
(table['Ag'] == table['Km']).mean()

# table[(table['Ag'] == 0) & (table['Km'] == 1)]

### 2.4.2 Clustering Metrics

See https://scikit-learn.org/stable/modules/clustering.html (section "Clustering performance evaluation") for details on the metrics used below.

In [None]:
# Scale only the original feature columns. dat_feature_sorted also carries the
# cumulative helper columns and the 'feat_based' label itself; leaving them in
# lets the label leak into the data its clustering quality is scored on.
feature_cols = list(dat.columns)
sorted_features = dat_feature_sorted[feature_cols]
dat_feature_sorted_scaled = pd.DataFrame(StandardScaler().fit_transform(sorted_features),
                                         columns = sorted_features.columns,
                                         index = sorted_features.index)


#### Silhouette Coefficient

In [None]:
from sklearn import metrics

# higher better

# The feature-based split is scored on the original feature columns only. The
# sorted frames may also hold the 'feat_based' label and the cumulative helper
# columns, which would inflate the score. Values noted in earlier runs for the
# two feature-based rows included those columns and are not comparable.
feature_cols = list(dat.columns)

metrics.silhouette_score(dat_feature_sorted_scaled[feature_cols], f['feat_based']) # feature based, scaled

metrics.silhouette_score(dat_feature_sorted[feature_cols], f['feat_based']) # feature based, unscaled
metrics.silhouette_score(dat_scaled, k['kmeans']) # k-means: 0.44598
metrics.silhouette_score(dat_scaled, a['agglom']) # agglomerative: 0.33

#### Calinski-Harabasz Index

In [None]:
# higher better

# scikit-learn's function is spelled calinski_harabasz_score; the old
# 'harabaz' spelling was deprecated and later removed.
# The feature-based rows use the original feature columns only, so that the
# label and cumulative helper columns do not enter the score.
feature_cols = list(dat.columns)

metrics.calinski_harabasz_score(dat_feature_sorted_scaled[feature_cols], f['feat_based']) # feature based, scaled

metrics.calinski_harabasz_score(dat_feature_sorted[feature_cols], f['feat_based']) # feature based, unscaled
metrics.calinski_harabasz_score(dat_scaled, k['kmeans']) # 7474
metrics.calinski_harabasz_score(dat_scaled, a['agglom']) # 5147


#### Davies-Bouldin Index

In [None]:
# lower is better

# The feature-based rows use the original feature columns only, so that the
# 'feat_based' label and the cumulative helper columns do not enter the score.
feature_cols = list(dat.columns)

metrics.davies_bouldin_score(dat_feature_sorted_scaled[feature_cols], f['feat_based']) # feature based, scaled

metrics.davies_bouldin_score(dat_feature_sorted[feature_cols], f['feat_based']) # feature based, unscaled
metrics.davies_bouldin_score(dat_scaled, k['kmeans']) # 1.186
metrics.davies_bouldin_score(dat_scaled, a['agglom']) # 1.47


# 3 Article Reference Data

In [None]:
# Columns read from the article reference file
article_columns = [
    'group_article', 'brand', 'sub_brand', 'season_create', 'season_active',
    'graphic', 'gender', 'age_group', 'retail_intro_date_global',
    'retail_exit_date_global', 'material_technology', 'pictogram_composition',
    'price_band', 'gender_age', 'construction_type', 'length_mes_uom_dim',
    'uom_dim', 'height_mes_uom_dim', 'width_mes_uom_dim', 'article_descr',
    'drop_season', 'uom_vol', 'uom_wgt', 'product_fit', 'material_way_type',
    'outer_sole_main_material', 'inner_sole_main_material', 'main_material_lining',
    'main_material_upper', 'dimension_uov', 'dimension_uom', 'carried_over_from',
    'drop_date', 'retail_exit_tgt_season', 'product_franchise', 'age_group_descr',
    'brand_descr', 'sub_brand_descr', 'lifecylce_status_prod_descr', 'brand_asset_descr',
    'rmh_retail_class_descr', 'rmh_retail_department_descr', 'rmh_retail_sub_class_descr',
    'rmh_retail_sub_dept_descr', 'rmh_category_descr', 'rmh_gender_descr',
    'rmh_retail_section_descr', 'rmh_product_division_descr', 'rmh_product_type_descr',
    'spm_color_first_descr', 'spm_color_second_descr', 'spm_color_third_descr',
    'spm_color_fourth_descr', 'product_franchise_descr',
]

# '~'-separated file; only the listed columns are loaded
dat2 = pd.read_csv(
    'article_reference_data_y2016_18.csv',
    sep='~',
    usecols=article_columns,
    low_memory=False,
)

In [None]:
# dat2['<feature>'].unique()
# len(dat2['<feature'].unique())

# Yes (11)
# ['rmh_product_type_descr', 'rmh_product_division_descr', 'rmh_retail_section_descr',
# 'rmh_gender_descr', 'rmh_category_descr', 'rmh_retail_department_descr',
# 'sub_brand_descr', 'brand_descr', 'age_group_descr', 'age_group', 'gender']

# No (13)
# 'graphic', 'pictogram_composition', 'article_descr', 'lifecylce_status_prod_descr', 'brand_asset_descr', 
# 'rmh_retail_class_descr', 'rmh_retail_sub_class_descr', 'rmh_retail_sub_dept_descr', 'spm_color_first_descr', 
# 'spm_color_second_descr', 'spm_color_third_descr','spm_color_fourth_descr', 'product_franchise_descr' 

# Columns removed from the reference data (the last 13 are the "No" list above)
dropped_columns = [
    'uom_dim', 'drop_season', 'uom_vol', 'uom_wgt', 'material_way_type', 'inner_sole_main_material',
    'main_material_upper', 'outer_sole_main_material', 'main_material_lining', 'carried_over_from', 'drop_date',
    'graphic', 'pictogram_composition', 'article_descr', 'lifecylce_status_prod_descr', 'brand_asset_descr',
    'rmh_retail_class_descr', 'rmh_retail_sub_class_descr', 'rmh_retail_sub_dept_descr', 'spm_color_first_descr',
    'spm_color_second_descr', 'spm_color_third_descr', 'spm_color_fourth_descr', 'product_franchise_descr',
]
dat2 = dat2.drop(columns=dropped_columns)


In [None]:
# Save/read data

# dat2.to_csv('dat2.csv')

# The first CSV column is used as the row index
dat2 = pd.read_csv('dat2.csv', index_col=0, low_memory=False)

In [None]:
pd.to_datetime(dat2['season_create'])

dat2.shape
len(dat2['group_article'].unique())
dat2.dtypes
dat2.season_create.unique()

# Number of distinct values per column (NaN counted as a value, as with
# len(unique())). A plain loop replaces the set comprehension that was used
# only for its print side effect and left a stray {None} as the cell result.
for col in dat2.columns:
    print(col, '-->', dat2[col].nunique(dropna=False))

# 4 Cluster Prediction

    *Pause this for now, determine if clustering helps

### 4.0.2 Scaling

In [None]:
# Standardize numeric features (int64/float64 columns only)

# dt = pd.DataFrame(data = dat2.dtypes, columns = ['type'])
# num = dt[(dt['type'] == 'int64') | (dt['type'] == 'float64')].index
# obj = dt[(dt['type'] != 'int64') & (dt['type'] != 'float64')].index

# dat2_num = dat2[num]

# scaler = StandardScaler()
# scaler.fit(dat2_num)

# dat2_num_scaled = pd.DataFrame(scaler.transform(dat2_num), columns = dat2_num.columns, index = dat2_num.index)
# dat2_num_scaled.head()

### 4.0.3 One-hot Encoding

In [None]:
%%time

# One-hot encode the article reference data with pandas' get_dummies
dat2_dummy = pd.get_dummies(dat2)

In [None]:
# Row/column count after encoding
dat2_dummy.shape

## 4.1 Classification Models

### 4.1.1 Train-Test Split

In [None]:
# train_test_split is not imported by the cells above, so import it here
from sklearn.model_selection import train_test_split

# NOTE(review): dat2_dummy is built from dat2 alone and no cell above adds a
# 'Kmeans' column to it. Confirm the cluster labels are joined onto the
# reference data before this cell runs.
X = dat2_dummy.drop('Kmeans', axis = 1)
y = dat2_dummy['Kmeans']

# Seeded so the split, and the scores below, are repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

### 4.1.2 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit on the training split only. Fitting on all of X lets the model see the
# test rows, so the test score would not measure generalisation.
# NOTE: the name `tree` also rebinds the `sklearn.tree` module imported at the
# top of the notebook.
tree = DecisionTreeClassifier().fit(X_train, y_train)

tree.score(X_train, y_train)
tree.score(X_test, y_test)

### 4.1.3 Neural Network (vanilla MLP)

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:

# Default-parameter multilayer perceptron, trained on the training split
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
    # --Arguments--
    # solver = <something>
    # hidden_layer_sizes = [<nodes_layer_1>, <nodes_layer_2>, ...]
    # activation = '<function>'
    # alpha = <regularization/complexity_param> 
    
# Accuracy on the training and the test split
mlp.score(X_train, y_train)
mlp.score(X_test, y_test)

# 5 Baseline eCom Demand Forecaster

In the absence of prior-season data for an article, predict its demand as the prior-season average of the ten most similarly **PRICED** items.

### Approach
    
    (0) Group by item, season
    (1) Select item
    (2) Identify ten most similarly priced items in the same season
    (3) Average the sales of those 10 for the last (same) season 

In [None]:
%%time

dat00 = pd.read_csv('seasonal_sales_indicators.csv',
                 delimiter = '~')

In [None]:
# Work on a copy so the raw load in dat00 stays untouched
dat0 = dat00.copy(deep=True)

# tidy column names: strip the source-table prefix from every column
table_prefix = 't_eu_ecom_dit_dsf_transaction_t.'
dat0.columns = [col.replace(table_prefix, '') for col in dat0.columns]

# Columns removed before the per-season aggregation
unused_columns = [
    'sales_organization', 'sku', 'total_markdown', 'temporary_markdown',
    'permanent_markdown', 'employee_markdown', 'fraction_of_full_price',
    'markdown', 'sale',
]
dat0 = dat0.drop(columns=unused_columns)

# {print(x, '-->', len(dat0[x].unique())) for x in dat0.columns}
# dat0.dtypes

In [None]:
# One row per (article, season) holding the column sums
dat0 = dat0.groupby(['article_number', 'season']).sum()

# Season-level price: summed net sales divided by summed net quantity
dat0['price'] = dat0['net_sales'].div(dat0['net_qty'])

In [None]:
# Chronological rank of each season code
order = {'SS17': 0, 'FW17': 1, 'SS18': 2, 'FW18': 3, 'SS19': 4}

# Turn the (article_number, season) index back into ordinary columns
dat0 = dat0.reset_index()

# Keep failing loudly with KeyError on a season code missing from `order`,
# as the per-row dict lookup did; Series.map alone would silently yield NaN.
unknown_seasons = set(dat0['season']) - set(order)
if unknown_seasons:
    raise KeyError('season codes missing from order: %s' % sorted(unknown_seasons))

# Vectorised rank lookup, used only as a temporary sort key
dat0['order_id'] = dat0['season'].map(order)
dat0 = dat0.sort_values(by = ['article_number', 'order_id'])
dat0 = dat0.drop(columns = 'order_id').reset_index(drop = True)

In [None]:
# Keep the two seasons under study; isin() replaces the per-row Python loop.
# NOTE(review): the approach text speaks of the last (same) season; in `order`
# the spring/summer season before SS19 is SS18, not SS17 - confirm the pair.
dat0 = dat0[dat0['season'].isin(['SS17', 'SS19'])]

# Replace infinite and missing values with 0, then keep only rows whose price
# is positive
dat0 = dat0.replace([np.inf, -np.inf, np.nan], 0)
dat0 = dat0[dat0['price'] > 0]


In [None]:
# One frame per season, each ordered by ascending price
is_ss17 = dat0['season'] == 'SS17'
is_ss19 = dat0['season'] == 'SS19'
dat17 = dat0[is_ss17].sort_values(by='price')
dat19 = dat0[is_ss19].sort_values(by='price')

In [None]:
dat17.price.rolling(11, center=True).mean()