# 0.0 Load Modules, submodules, classes, functions

In [None]:
import numpy as np
import pandas as pd
import multiprocessing
import scipy
import sklearn as sk
from sklearn import svm

from sklearn import preprocessing
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import clone
from sklearn.externals.six.moves import xrange
from scipy import spatial
from scipy.spatial import cKDTree
#from pyramid.arima import auto_arima

import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
# from bokeh import charts

# init_notebook_mode()

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random


import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

import dask.dataframe as dd
#from chest import Chest

# Notebook display configuration.
# "all" makes IPython echo the value of every top-level expression in a cell,
# not only the last one; several cells below rely on this to show more than
# one result (e.g. two value_counts() calls in a single cell).
InteractiveShell.ast_node_interactivity = "all"
# Alternative (IPython's usual behaviour: echo only the last expression):
#InteractiveShell.ast_node_interactivity = "last"

# Render matplotlib figures inline in the notebook; any other matplotlib
# style settings must come AFTER this magic.
%matplotlib inline

# Optional: inline SVG output. Left disabled because of a rendering bug;
# uncomment only if SVG renders correctly in your environment.
# %config InlineBackend.figure_formats = {'svg',}

# High-resolution PNG output. SVG would be preferred, but it has problems
# rendering vertical and horizontal lines.
%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

import dask.dataframe as ddf
import dask.array as da
#from dask import distributed

# 1 Preprocessing

## 1.1 Wrangling

In [None]:
%%time
# Load the raw sales-indicator extract; fields in this file are '~'-separated.
dat0 = pd.read_csv(
    'seasonal_sales_indicators.csv',
    sep='~',
)

# len(dat0['article_number'].unique()) # 46573

In [None]:
# Work on an independent (deep) copy so the raw load in `dat0` stays untouched.
dat = dat0.copy(deep=True)

In [None]:
# Tidy column names: remove the 't_eu_ecom_dit_dsf_transaction_t.' qualifier
# wherever it occurs, leaving only the bare field name.
_TABLE_PREFIX = 't_eu_ecom_dit_dsf_transaction_t.'
dat.columns = [name.replace(_TABLE_PREFIX, '') for name in dat.columns]

In [None]:
# Keep only rows whose gross demand quantity is non-zero (it is used as a
# divisor when the per-unit sales price is derived later).
has_demand = dat['gross_demand_quantity'] != 0
dat = dat.loc[has_demand]

In [None]:
# Restrict the frame to the article key plus the numeric sales/markdown fields
# that are aggregated per article further down.
keep_columns = [
    'article_number',
    'gross_demand_quantity',
    'sold_qty',
    'net_qty',
    'gross_sales_gross_disc_net_ret',
    'gross_sales_net_disc_gross_ret',
    'net_sales',
    'total_markdown',
    'temporary_markdown',
    'permanent_markdown',
    'employee_markdown',
    'fraction_of_full_price',
    'markdown',
    'sale',
]
dat = dat.loc[:, keep_columns]

In [None]:
# Per-row sales price: net sales divided by gross demand quantity.
# `assign` returns a new frame instead of writing a column into `dat`, which
# at this point is the result of earlier row/column selections; assigning into
# such a selection in place is what triggers pandas' SettingWithCopyWarning.
dat = dat.assign(
    sales_price=dat['net_sales'].divide(dat['gross_demand_quantity'])
)

In [None]:
%%time

# Summary statistics computed for every remaining column, per article.
functions = ['min', 'median', 'mean', 'max', 'std', 'sum', 'count']

# One output row per article_number; the resulting columns form a two-level
# index of (field, statistic).
dat_grouped = dat.groupby('article_number')
dat = dat_grouped.aggregate(functions)

In [None]:
# Keep a single 'Count' column (taken from sold_qty's count) ...
dat['Count'] = dat[('sold_qty', 'count')]
# ... and drop the per-field 'count' statistic from every field, as it is
# redundant once 'Count' exists.
dat = dat.drop('count', axis=1, level=1)

# Missing values become 0 (this defines the std of a single-transaction
# article as 0); any infinities are zeroed as well.
dat = dat.fillna(0).replace([np.inf, -np.inf, np.nan], 0)

## 1.2 Save/Load curated data

In [None]:
# Persist / restore the tidied per-article table.

# Uncomment to write the current table to disk:
# dat.to_csv('dat.csv')

# Reload it: the first two header rows rebuild the two-level column index and
# the first column is used as the row index.
dat = pd.read_csv('dat.csv', header=[0, 1], index_col=0, low_memory=False)

## 1.3 Principal Component Analysis (for 2-D visualizations)

In [None]:
%%time

# SCALING: zero mean and unit variance 
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaler.fit(dat)
# dat_scaled = scaler.transform(dat)

# PRINCIPAL COMPONENT ANALYSIS
from sklearn.decomposition import PCA

# Fit a two-component PCA on the table as-is (the scaling step above is
# commented out, so the input is unscaled).
pca = PCA(n_components=2).fit(dat)

# Project every row onto the two fitted components.
dat_pca = pca.transform(dat)

In [None]:
# EDA on the two principal-component scores
pc1, pc2 = dat_pca[:, 0], dat_pca[:, 1]

# How many PC1 scores have a NaN logarithm
np.isnan(np.log(pc1)).sum()
# How many PC2 scores are negative
(pc2 < 0).sum()
# Summary statistics of both components
pd.DataFrame(dat_pca).describe()

# 2 Classification

## 2.1 k-means

### 2.1.1 Model

In [None]:
%%time

# K-means clustering  --------------------  --------------------
from sklearn.cluster import KMeans

# Cluster on the sales features only. A later cell stores the resulting label
# back into `dat` as a 'Kmeans' column; excluding it here keeps a re-run of
# this cell from clustering on its own previous output.
kmeans_features = dat.loc[:, dat.columns.get_level_values(0) != 'Kmeans']

# Fix the seed: K-means initialisation is random, so without it both the
# partition and the numbering of the clusters can change from run to run.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(kmeans_features)

Kmeans = pd.Series(kmeans.predict(kmeans_features))  # Cluster assignments

In [None]:
# Inspect how many rows carry each label in `Kmeans`; the following cell
# collapses the labels to 0/1 (majority cluster vs. everything else).

Kmeans.value_counts()

In [None]:
# Reclassify: majority cluster -> 0, every other cluster -> 1.
# K-means label numbers are arbitrary, so the majority cluster is looked up
# instead of being assumed to be label 0. Reading the raw labels from the
# fitted model (rather than from `Kmeans` itself) keeps this cell safe to
# re-run after `Kmeans` has already been collapsed to 0/1.
raw_kmeans_labels = pd.Series(kmeans.labels_)
majority_label = raw_kmeans_labels.value_counts().idxmax()

Kmeans = (raw_kmeans_labels != majority_label) * 1  # lone vector
Kmeans.value_counts()

In [None]:
# Attach the 0/1 label to the article table. The bare array is assigned
# (positional match) because `Kmeans` does not share `dat`'s index.
dat['Kmeans'] = np.asarray(Kmeans)
dat.head()

### 2.1.2 Plots

In [None]:
import matplotlib as mpl

In [None]:
# Overwrite the active matplotlib rc settings with the library defaults.
default_rc = mpl.rcParamsDefault
mpl.rcParams.update(default_rc)

In [None]:
# plt.scatter(dat_pca[:,0], 
#             dat_pca[:,1], 
#             c = Kmeans,
#             alpha = 0.05)

# plt.xlabel('PC1')
# plt.ylabel('PC2')

# plt.xscale('symlog')
# plt.yscale('symlog')

# Set the default figure size BEFORE anything is drawn: the setting is read
# when a figure is created, so assigning it after plt.scatter() (as before)
# left this plot at the previous size and only affected later figures.
plt.rcParams["figure.figsize"] = [12, 12]

# Mean sales price vs. log of total sold quantity per article, coloured by
# the binary K-means label.
plt.scatter(dat['sales_price']['mean'],
            np.log(dat['sold_qty']['sum']),
            c=Kmeans,
            alpha=0.05)

plt.xlabel('Sales Price')
plt.ylabel('log(sold_qty)')

plt.colorbar()
plt.title('K-means Classification')



In [None]:
# Histogram: total gross demand quantity for basic items (Kmeans == 0),
# limited to articles with a total below 1000.
total_demand = dat['gross_demand_quantity']['sum']
basic_low_volume = (dat['Kmeans'] == 0) & (total_demand < 1000)
total_demand[basic_low_volume].hist(bins=100)


In [None]:
# Histogram: total gross demand quantity for non-basic items (Kmeans != 0),
# limited to articles with a total below 10000.
total_demand = dat['gross_demand_quantity']['sum']
nonbasic_capped = (dat['Kmeans'] != 0) & (total_demand < 10000)
total_demand[nonbasic_capped].hist(bins=50)


In [None]:
# Histogram: mean sales price of basic items (Kmeans == 0)
is_basic = dat['Kmeans'] == 0
dat.loc[is_basic, ('sales_price', 'mean')].hist(bins=100)


In [None]:
# Histogram: mean sales price of non-basic items (Kmeans != 0), leaving out
# articles whose mean sales price is exactly 0.
mean_price = dat['sales_price']['mean']
nonbasic_priced = (dat['Kmeans'] != 0) & (mean_price != 0)
mean_price[nonbasic_priced].hist(bins=100)


## 2.2 Agglomerative Clustering

### 2.2.1 Model

In [None]:
%%time

# Agglomerative Clustering -------------
from sklearn.cluster import AgglomerativeClustering

# Cluster on the sales features only. By this point `dat` also holds the
# derived 'Kmeans' label column; feeding it in would leak the K-means result
# into this model and bias the cross-method comparison made later.
agglom_features = dat.loc[:, dat.columns.get_level_values(0) != 'Kmeans']

agg = AgglomerativeClustering(n_clusters=10)
agglom = pd.Series(agg.fit_predict(agglom_features))

In [None]:
# Reclassify: majority cluster -> 0, every other cluster -> 1.
# Cluster numbers are arbitrary, so the majority cluster is looked up instead
# of being assumed to be label 1. The raw labels are read from the fitted
# model, which keeps this cell safe to re-run after `agglom` has already been
# collapsed to 0/1.
raw_agglom_labels = pd.Series(agg.labels_)
raw_agglom_labels.value_counts()

majority_label = raw_agglom_labels.value_counts().idxmax()
agglom = (raw_agglom_labels != majority_label) * 1  # lone vector
agglom.value_counts()

### 2.2.2 Plot

In [None]:
# Log of total net sales vs. log of total sold quantity per article, coloured
# by the binary agglomerative label.
log_net_sales = np.log(dat['net_sales']['sum'])
log_sold_qty = np.log(dat['sold_qty']['sum'])

plt.scatter(log_net_sales, log_sold_qty, c=agglom, alpha=0.15)
plt.xlabel('log(net_sales)')
plt.ylabel('log(sold_qty)')
plt.colorbar()
plt.title('Agglomerative Classification')

## 2.3 DBSCAN - 'density based spatial clustering of applications with noise'

In [None]:
%%time

from sklearn.preprocessing import StandardScaler

# Standardise the sales features only. `dat` also holds the derived 'Kmeans'
# label column by now; scaling it and passing it on to DBSCAN would make the
# K-means result part of DBSCAN's input.
dbscan_features = dat.loc[:, dat.columns.get_level_values(0) != 'Kmeans']

scaler = StandardScaler()
scaler.fit(dbscan_features)
dat_scaled = scaler.transform(dbscan_features)

In [None]:
%%time

from sklearn.cluster import DBSCAN

# DBSCAN with library-default parameters, run on the standardised table.
dbscan = DBSCAN()
dbscan_labels = dbscan.fit_predict(dat_scaled)
dbs = pd.Series(dbscan_labels)

In [None]:
# Reclassify to 0/1: label -1 (DBSCAN's marker for noise points) -> 0,
# any actual cluster label -> 1.
dbs01 = (dbs != -1).astype(int)  # lone vector
dbs01.value_counts()

In [None]:
# Set the default figure size BEFORE anything is drawn: the setting is read
# when a figure is created, so assigning it after plotting (as before) left
# this plot at the previous size and only affected later figures.
plt.rcParams["figure.figsize"] = [16, 16]

# Log of total net sales vs. log of total sold quantity per article, coloured
# by the binary DBSCAN label.
plt.scatter(np.log(dat['net_sales']['sum']),
            np.log(dat['sold_qty']['sum']),
            c=dbs01,
            alpha=0.15)
plt.xlabel('log(net_sales)')
plt.ylabel('log(sold_qty)')
plt.colorbar()
plt.title('DBSCAN Classification')

## 2.4 Cross-method comparisons

   ### 2.4.1 K-means x Agglomerative

In [None]:
# Put the two binary labellings side by side and count the rows in each
# (Ag, Km) combination; the off-diagonal combinations are the disagreements.
table = pd.concat([agglom.rename('Ag'), Kmeans.rename('Km')], axis=1)
table.groupby(['Ag', 'Km']).size()
# 3924/46573 # disagreement

# table[(table['Ag'] == 0) & (table['Km'] == 1)]

# 3 Meta-Data

## 3.1 Import, Process Article Reference Data

In [None]:
# Key 1 = article number <----> group article

In [None]:
# 'article_descr'

In [None]:
# dat = pd.read_csv('dat.csv', low_memory=False, index_col = 0, header = [0,1]) # gotta encode multi-index

In [None]:
# Columns to read from the article reference file.
reference_columns = [
    'group_article', 'brand', 'sub_brand', 'season_create', 'season_active',
    'graphic', 'gender', 'age_group', 'retail_intro_date_global',
    'retail_exit_date_global', 'material_technology', 'pictogram_composition',
    'price_band', 'gender_age', 'construction_type', 'length_mes_uom_dim',
    'uom_dim', 'height_mes_uom_dim', 'width_mes_uom_dim', 'article_descr',
    'drop_season', 'uom_vol', 'uom_wgt', 'product_fit', 'material_way_type',
    'outer_sole_main_material', 'inner_sole_main_material', 'main_material_lining',
    'main_material_upper', 'dimension_uov', 'dimension_uom', 'carried_over_from',
    'drop_date', 'retail_exit_tgt_season', 'product_franchise', 'age_group_descr',
    'brand_descr', 'sub_brand_descr', 'lifecylce_status_prod_descr', 'brand_asset_descr',
    'rmh_retail_class_descr', 'rmh_retail_department_descr', 'rmh_retail_sub_class_descr',
    'rmh_retail_sub_dept_descr', 'rmh_category_descr', 'rmh_gender_descr',
    'rmh_retail_section_descr', 'rmh_product_division_descr', 'rmh_product_type_descr',
    'spm_color_first_descr', 'spm_color_second_descr', 'spm_color_third_descr',
    'spm_color_fourth_descr', 'product_franchise_descr',
]

# Load the '~'-separated article reference data, restricted to those columns.
dat2 = pd.read_csv(
    'article_reference_data_y2016_18.csv',
    sep='~',
    low_memory=False,
    usecols=reference_columns,
)

In [None]:
# Only the join key and the article description are needed from here on.
dat2 = dat2.loc[:, ['group_article', 'article_descr']]
dat2.head()

In [None]:
# Reduce `dat` to the K-means label plus the article key (taken from the row
# index), as a NEW frame with flat, single-level column names.
# The previous `dat[['Kmeans']]` kept the two-level column index, so the merge
# against the flat-columned reference table mixed column levels (a warning in
# older pandas, a MergeError in newer versions), and the follow-up assignment
# wrote into a slice of `dat`.
# NOTE: this overwrites `dat`; reload 'dat.csv' to get the feature table back.
kmeans_labels = np.ravel(dat['Kmeans'])
dat = pd.DataFrame(
    {'Kmeans': kmeans_labels, 'article': dat.index},
    index=dat.index,
)

In [None]:
# Inner-join the labels to the reference data: the article key on the left is
# matched against 'group_article' on the right.
dat3 = dat.merge(dat2, left_on='article', right_on='group_article')
# Short names for the four resulting columns, in order.
dat3.columns = ['Kmeans', 'article', 'article2', 'descr']

In [None]:
# Frequency of each article description among rows labelled Kmeans == 0.
is_basic = dat3['Kmeans'] == 0
dat3.loc[is_basic, 'descr'].value_counts()