# 0.0 Modules, etc.

In [None]:
import numpy as np
import pandas as pd
import multiprocessing

import scipy
from scipy import spatial
from scipy.spatial import cKDTree

import sklearn as sk
from sklearn import svm
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import clone
from sklearn.externals.six.moves import xrange


import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

# init_notebook_mode()

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random

import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

import dask.dataframe as dd
#from chest import Chest

InteractiveShell.ast_node_interactivity = "all"
#InteractiveShell.ast_node_interactivity = "last"

# Magic function to make matplotlib inline
%matplotlib inline

%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

import dask.dataframe as ddf
import dask.array as da

In [None]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
# plotly.tools.set_credentials_file(username='<username>', api_key='<api-key>')  # credentials redacted; load real keys from the environment, never commit them
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

cf.go_offline()

In [None]:
# Show up to 500 columns when a wide frame is displayed.
# The fully-qualified option name is used because the bare 'max_columns'
# pattern matches more than one option in recent pandas releases
# (e.g. 'styler.render.max_columns') and then raises OptionError.
pd.set_option('display.max_columns', 500)

# 1 Preprocessing

## 1.1 Wrangling

In [None]:
%%time
# Load the raw export; fields in this file are separated by '~'.
dat0 = pd.read_csv('seasonal_sales_indicators.csv', sep='~')

# len(dat0['article_number'].unique()) # 46573

In [None]:
# Work on an independent copy so the raw frame dat0 stays untouched.
dat = dat0.copy(deep=True)

In [None]:
# Tidy column names: strip the source-table prefix from every column.
table_prefix = 't_eu_ecom_dit_dsf_transaction_t.'
dat.columns = [column.replace(table_prefix, '') for column in dat.columns]

In [None]:
# Keep only rows whose gross demand quantity is non-zero (it is used as a
# divisor when sales_price is derived).
has_demand = dat['gross_demand_quantity'].ne(0)
dat = dat[has_demand]

In [None]:
# dat = dat[['article_number', 'gross_demand_quantity', 'sold_qty',
#            'net_qty', 'gross_sales_gross_disc_net_ret',
#            'gross_sales_net_disc_gross_ret', 'net_sales', 'total_markdown',
#            'temporary_markdown', 'permanent_markdown', 'employee_markdown',
#            'fraction_of_full_price', 'markdown', 'sale']]


# Article key plus the columns that feed the per-article aggregation.
kept_columns = [
    'article_number',
    'gross_demand_quantity',  # mean, sum, std
    'fraction_of_full_price',  # mean, std
    'markdown',  # mean (INDICATOR)
    'sale',  # mean (INDICATOR no/sale)
    'gross_sales_net_disc_gross_ret',
]
dat = dat[kept_columns]

In [None]:
# Per-unit price: gross_sales_net_disc_gross_ret divided by the quantity of
# the same row. The source column is not needed afterwards.
dat['sales_price'] = dat['gross_sales_net_disc_gross_ret'] / dat['gross_demand_quantity']
dat.drop(columns='gross_sales_net_disc_gross_ret', inplace=True)

In [None]:
%%time

# One group per article; reused below for the mean/std and the sum.
dat_grouped = dat.groupby('article_number')

In [None]:
# Per-article mean and standard deviation of every remaining column.
dat = dat_grouped.agg(['mean', 'std'])  # Need count

# The spread of the two indicator columns is not kept.
unused_stats = [('markdown', 'std'), ('sale', 'std')]
dat.drop(columns=unused_stats, inplace=True)

# Total demand per article, stored next to its mean/std.
dat[('gross_demand_quantity', 'sum')] = dat_grouped['gross_demand_quantity'].sum()

# Define single transaction article std to be 0 (std is NaN there), and
# zero out any infinite values as well.
dat = dat.fillna(0).replace([np.inf, -np.inf], 0)

## 1.2 Save/Load curated data

In [None]:
# Save/load tidied version

# dat.to_csv('dat.csv')

# Reload the curated table from disk, replacing the in-memory dat.
# The first two header rows rebuild the (feature, statistic) column
# MultiIndex; the first column is the index.
dat = pd.read_csv('dat.csv', index_col=0, header=[0, 1], low_memory=False)

In [None]:
# Preview the first five rows of the curated table.
dat.head(n=5)

## 1.3 Principal Component Analysis (for 2-D visualizations)

In [None]:
%%time

# SCALING: zero mean and unit variance 
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaler.fit(dat)
# dat_scaled = scaler.transform(dat)

# PRINCIPAL COMPONENT ANALYSIS
from sklearn.decomposition import PCA

# Fit a two-component PCA on dat (unscaled; the scaling step above is
# commented out) and project the same rows onto those components.
pca = PCA(n_components=2)
dat_pca = pca.fit(dat).transform(dat)

# EDA
first_pc = dat_pca[:, 0]
second_pc = dat_pca[:, 1]
np.isnan(np.log(first_pc)).sum()  # entries of PC1 whose log is NaN
(second_pc < 0).sum()  # negative entries of PC2
pd.DataFrame(dat_pca).describe()

# 2 Clustering (aka classification, segmentation)

## 2.0 Feature-based clustering

### 2.0.1 Cluster

In [None]:
feature = 'sales_price'
stat = 'mean'

# Rank articles from the highest to the lowest value of the chosen statistic.
dat_feature_sorted = dat.sort_values([(feature, stat)], ascending=False) # Sort

# Running total of the statistic and its share of the overall total.
running_total = dat_feature_sorted[feature][stat].cumsum()
total = dat_feature_sorted[feature][stat].sum() # standardized, so...???
dat_feature_sorted['cumulative_feature'] = running_total
dat_feature_sorted['cumulative_pct_feature'] = running_total/total

# Share of articles covered at each rank: 1/n, 2/n, ..., 1.
# n is taken from the data instead of a hard-coded row count, so the cell
# keeps working when the number of articles changes.
n_articles = len(dat_feature_sorted)
t = pd.Series(range(1, n_articles + 1))/n_articles
dat_feature_sorted['cumulative_pct_articles'] = t.values

In [None]:
# Label the first n_top rows of the sorted table 1 and the remainder 0.
# The number of zeros is derived from the table length, so the vector always
# fits the frame (the original hard-coded 38089 + 8483 rows).
n_top = 38089 # match ____ counts below
n_rows = len(dat_feature_sorted)
n_ones = min(n_top, n_rows)
dat_feature_sorted['feat_based'] = np.concatenate([np.ones(n_ones), np.zeros(n_rows - n_ones)])

dat_feature_sorted['feat_based'].mean()
dat_feature_sorted['feat_based'].value_counts()

### 2.0.2 Plot

In [None]:
# Set the figure size BEFORE plotting: rcParams are read when a figure is
# created, so assigning them after plt.plot() would only affect later figures.
plt.rcParams["figure.figsize"] = [6,6]

# Cumulative share of the feature against cumulative share of articles.
plt.plot(dat_feature_sorted['cumulative_pct_articles'],
         dat_feature_sorted['cumulative_pct_feature'],
         linewidth = 3)

# plt.xlabel('Percent of Articles')
# plt.ylabel('Percent of Feature of Interest')

plt.title('Majority, from a Minority of Articles')

In [None]:
import plotly.graph_objs as go

# Cumulative share of the feature against cumulative share of articles.
# (Previously this curve itself was labelled '45 degree line'.)
trace1 = go.Scatter(
    x = dat_feature_sorted['cumulative_pct_articles'],
    y = dat_feature_sorted['cumulative_pct_feature'],
    mode = 'lines',
    name = 'cumulative feature share'
)

# Reference diagonal: the curve would follow it if every article
# contributed equally.
trace2 = go.Scatter(
    x = [0, 1],
    y = [0, 1],
    mode = 'lines',
    name = '45 degree line'
)

data = [trace1, trace2]
iplot(data, filename='scatter-mode')

## 2.01 Scale

In [None]:
# SCALING: zero mean and unit variance 
from sklearn.preprocessing import StandardScaler

# Standardize every column of dat, keeping the original index and
# column labels on the result.
scaler = StandardScaler().fit(dat)
dat_scaled = pd.DataFrame(scaler.transform(dat), index=dat.index, columns=dat.columns)

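We now have two datasets:

    (1) dat        - per-article aggregates in their original units
    (2) dat_scaled - the same columns standardized with StandardScaler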

## 2.1 k-means

### 2.1.1 Model

In [None]:
%%time

# K-means clustering  --------------------  --------------------
from sklearn.cluster import KMeans

# Cluster on the scaled features only. If this cell is re-run after the
# 'Kmeans' label column has been attached to dat_scaled, that column is left
# out so the model never clusters on its own previous output.
kmeans_features = dat_scaled.drop(columns='Kmeans', errors='ignore')

# Fixed seed: cluster ids are otherwise arbitrary from run to run, and the
# cells below single out cluster id 0.
kmeans = KMeans(n_clusters = 4, random_state = 0)
kmeans.fit(kmeans_features)

Kmeans = pd.Series(kmeans.predict(kmeans_features)) # Cluster assignments

# Cluster counts
print(Kmeans.value_counts())
print()

In [None]:
# Collapse the cluster ids to a binary vector: cluster 0 -> 0, any other
# cluster -> 1.
# NOTE(review): which cluster receives id 0 depends on the k-means run -
# confirm it is the intended group before relying on this split.
Kmeans = Kmeans.ne(0) * 1 # lone vector
Kmeans.value_counts()

In [None]:
# Attach the binary label positionally (as a plain array), because Kmeans
# carries a fresh 0..n-1 index rather than the index of dat_scaled.
dat_scaled['Kmeans'] = np.asarray(Kmeans)


In [None]:
# Display the scaled table, now including the 'Kmeans' label column.
dat_scaled

### 2.1.2 Plots

In [None]:
import matplotlib as mpl

In [None]:
# Overwrite the current matplotlib rcParams with the library defaults,
# undoing the figure-size changes made by earlier cells.
mpl.rcParams.update(mpl.rcParamsDefault)

In [None]:
# Pairwise scatter plots of four aggregates, coloured by the binary k-means
# label, with kernel density estimates on the diagonal.
pairplot_columns = [
    ('gross_demand_quantity', 'mean'),
    ('gross_demand_quantity', 'std'),
    ('sales_price', 'mean'),
    ('gross_demand_quantity', 'sum'),
]
pd.plotting.scatter_matrix(
    dat[pairplot_columns],
    c=Kmeans,
    alpha=0.5,
    diagonal='kde',
    figsize=(10, 10),
)

In [None]:
# Shared inputs for both panels; points are coloured by the binary
# k-means label.
demand_stats = dat['gross_demand_quantity']
mean_price = dat['sales_price']['mean']

# Top panel: mean demand per article against mean sale price.
plt.subplot(2, 1, 1)
plt.scatter(demand_stats['mean'], mean_price, c=Kmeans, alpha=0.5)
plt.title('Sale Price vs. Gross Demand Quantity')
plt.ylabel('Sale Price')

# Bottom panel: total demand per article against mean sale price.
plt.subplot(2, 1, 2)
plt.scatter(demand_stats['sum'], mean_price, c=Kmeans, alpha=0.5)
# plt.title('Sale Price vs. Gross Demand Qty (mean/sum)')
plt.ylabel('Sale Price')
plt.xlabel('GDQ Sum')

# plt.xlabel('Sales Price')
# plt.ylabel('log(sold_qty)')

# plt.colorbar()
# plt.rcParams["figure.figsize"] = [5,5]
# plt.title('K-means Classification')


### 2.1.3 Histograms

#### (a) gross_demand_quantity - basic

In [None]:
# Histogram: total gross demand quantity, basic items

# Set the figure size before plotting; rcParams only apply to figures
# created after the assignment.
plt.rcParams["figure.figsize"] = [12,12]

# The binary cluster label lives on dat_scaled (which shares dat's index),
# not on dat, so the mask is built from dat_scaled.
is_basic = dat_scaled['Kmeans'] == 0
total_demand = dat['gross_demand_quantity']['sum']

# Cap at 10000 to keep the long tail from flattening the histogram.
total_demand[is_basic & (total_demand < 10000)].hist(
    bins = 100,
)

plt.xlabel('Total Gross Demand Quantity')
plt.ylabel('Quantity')

plt.title('Basic Articles')

#### (b) gross_demand_quantity - non-basic

In [None]:
# Histogram: total gross_demand_quantity, non-basic items

# Set the figure size before plotting; rcParams only apply to figures
# created after the assignment.
plt.rcParams["figure.figsize"] = [12,12]

# The binary cluster label lives on dat_scaled (which shares dat's index),
# not on dat, so the mask is built from dat_scaled.
is_non_basic = dat_scaled['Kmeans'] != 0
total_demand = dat['gross_demand_quantity']['sum']

# Cap at 5000 to keep the long tail from flattening the histogram.
total_demand[is_non_basic & (total_demand < 5000)].hist(bins = 50)

plt.xlabel('Total Gross Demand Quantity')
plt.ylabel('Quantity')

plt.title('Non-basic Articles')


#### (c) sales_price - basic

In [None]:
# Histogram: sales_price, basic items

# Set the figure size before plotting; rcParams only apply to figures
# created after the assignment.
plt.rcParams["figure.figsize"] = [12,12]

# The binary cluster label lives on dat_scaled (which shares dat's index),
# not on dat, so the mask is built from dat_scaled.
is_basic = dat_scaled['Kmeans'] == 0
mean_price = dat['sales_price']['mean']

# Keep strictly positive prices below 200.
mean_price[is_basic & (mean_price < 200) & (mean_price > 0)].hist(bins = 100)

plt.xlabel('Sale Price')
plt.ylabel('Quantity')

plt.title('Basic Articles')


#### (d) sales_price - non-basic

In [None]:
# Histogram: sales_price, non-basic items

# Set the figure size before plotting; rcParams only apply to figures
# created after the assignment.
plt.rcParams["figure.figsize"] = [12,12]

# The binary cluster label lives on dat_scaled (which shares dat's index),
# not on dat, so the mask is built from dat_scaled.
is_non_basic = dat_scaled['Kmeans'] != 0
mean_price = dat['sales_price']['mean']

# Keep prices below 200.
mean_price[is_non_basic & (mean_price < 200)].hist(bins = 100)

plt.xlabel('Sale Price')
plt.ylabel('Quantity')

plt.title('Non-basic Articles')

## 2.2 Agglomerative Clustering

### 2.2.1 Model

In [None]:
# Preview the first five rows of the scaled table.
dat_scaled.head(n=5)

In [None]:
%%time

# Agglomerative Clustering -------------
from sklearn.cluster import AgglomerativeClustering

# Cluster on the scaled features only: by this point dat_scaled also carries
# the k-means label column, which must not be used as an input feature.
agglom_features = dat_scaled.drop(columns='Kmeans', errors='ignore')

agg = AgglomerativeClustering(n_clusters = 3)
agglom = pd.Series(agg.fit_predict(agglom_features))

In [None]:
# # Reclassify: majority/non as 0/1 

# Number of articles assigned to each agglomerative cluster id.
agglom.value_counts()

In [None]:
# Collapse the cluster ids to a binary vector: the largest cluster against
# everything else. The previous test `agglom != 4` could never be False with
# n_clusters = 3 (ids 0..2), so every article received the same label.
# NOTE(review): 1 marks the largest cluster here, on the assumption that this
# matches the orientation of the k-means and DBSCAN binary vectors it is
# compared with later - confirm.
majority_cluster = agglom.value_counts().idxmax()
agglom = (agglom == majority_cluster)*1 # lone vector
agglom.value_counts()

### 2.2.2 Plot

In [None]:
# Scatter of log(net_sales sum) against log(sold_qty sum), coloured by the
# binary agglomerative label.
# NOTE(review): the wrangling cells in this notebook keep neither 'net_sales'
# nor 'sold_qty' in dat (and only gross_demand_quantity gets a 'sum'), so
# these lookups will raise KeyError unless dat.csv was written by an older
# version of the pipeline - confirm which columns are meant here.
plt.scatter(np.log(dat['net_sales']['sum']), 
            np.log(dat['sold_qty']['sum']), 
            c = agglom,
            alpha = 0.15)
plt.xlabel('log(net_sales)')
plt.ylabel('log(sold_qty)')
plt.colorbar()
plt.title('Agglomerative Classification')

## 2.3 DBSCAN 

'density based spatial clustering of applications with noise'

In [None]:
%%time

from sklearn.cluster import DBSCAN

# Cluster on the scaled features only: by this point dat_scaled also carries
# the k-means label column, which must not be used as an input feature.
dbscan_features = dat_scaled.drop(columns='Kmeans', errors='ignore')

dbscan = DBSCAN() # 3min 36s
dbs = pd.Series(dbscan.fit_predict(dbscan_features))

In [None]:
# Number of articles per DBSCAN label.
dbs.value_counts()

In [None]:
# Reclassify as 0/1: label -1 becomes 0, every other label becomes 1.
dbs01 = dbs.ne(-1) * 1 # lone vector
dbs01.value_counts()

### Plot

In [None]:
# Scatter of log(net_sales sum) against log(sold_qty sum), coloured by the
# binary DBSCAN label.
# NOTE(review): the wrangling cells in this notebook keep neither 'net_sales'
# nor 'sold_qty' in dat (and only gross_demand_quantity gets a 'sum'), so
# these lookups will raise KeyError unless dat.csv was written by an older
# version of the pipeline - confirm which columns are meant here.
plt.scatter(np.log(dat['net_sales']['sum']), 
            np.log(dat['sold_qty']['sum']), 
            c = dbs01,
            alpha = 0.15)
plt.xlabel('log(net_sales)')
plt.ylabel('log(sold_qty)')
plt.colorbar()
plt.title('DBSCAN Classification')
# NOTE(review): assigned after the figure was drawn, so this size applies to
# figures created later, not to this one.
plt.rcParams["figure.figsize"] = [16,16]

## 2.4 Evaluation

### 2.4.1 Cross methods comparison

In [None]:
# Put the k-means label and the feature-based label side by side, matched on
# the shared index.
k = dat_scaled['Kmeans'].to_frame()
f = dat_feature_sorted['feat_based'].to_frame()

k_f = k.merge(f, left_index=True, right_index=True)

k_f.head()

In [None]:
# Contingency counts of the two labelings.
k_f.groupby(['Kmeans', 'feat_based']).size()

# Fraction of articles on which both labelings agree, computed from the data
# rather than from counts copied out of a previous run
# (a value of 0.943 means 94.3% agreement).
(k_f['Kmeans'] == k_f['feat_based']).mean()

In [None]:
# One row per article with the binary label from each method; the three
# vectors share the same 0..n-1 positional index.
table = pd.DataFrame(data = {'Ag': agglom, 'Km': Kmeans, 'DB': dbs01})
table.groupby(['Ag', 'Km', 'DB']).size()

# kmeans-agglomerative agreement: fraction of articles with equal labels,
# computed from the table rather than from counts copied out of a previous run.
(table['Ag'] == table['Km']).mean()

# table[(table['Ag'] == 0) & (table['Km'] == 1)]

### 2.4.2 Metrics

See https://scikit-learn.org/stable/modules/clustering.html for metric information (2.4.2 - __ )

#### Silhouette Coefficient

In [None]:
from sklearn import metrics

# Score the binary k-means labels against the scaled features (label column
# removed from the feature matrix).
cluster_features = dat_scaled.drop(columns='Kmeans')
cluster_labels = dat_scaled['Kmeans']
metrics.silhouette_score(cluster_features, cluster_labels) # higher better

#### Calinski-Harabasz Index

In [None]:
# Calinski-Harabasz index of the binary k-means labels on the scaled features.
# The correctly spelled function is used: the old `calinski_harabaz_score`
# alias was deprecated and later removed from scikit-learn.
metrics.calinski_harabasz_score(dat_scaled.drop('Kmeans', axis = 1), dat_scaled['Kmeans']) # higher better


#### Davies-Bouldin Index

In [None]:
# Davies-Bouldin index of the binary k-means labels on the scaled features
# (label column removed from the feature matrix).
db_features = dat_scaled.drop(columns='Kmeans')
db_labels = dat_scaled['Kmeans']
metrics.davies_bouldin_score(db_features, db_labels) # lower is better

# 3 Article Reference Data

In [None]:
# Columns to read from the article reference file.
reference_columns = [
    'group_article', 'brand', 'sub_brand', 'season_create', 'season_active',
    'graphic', 'gender', 'age_group', 'retail_intro_date_global',
    'retail_exit_date_global', 'material_technology', 'pictogram_composition',
    'price_band', 'gender_age', 'construction_type', 'length_mes_uom_dim',
    'uom_dim', 'height_mes_uom_dim', 'width_mes_uom_dim', 'article_descr',
    'drop_season', 'uom_vol', 'uom_wgt', 'product_fit', 'material_way_type',
    'outer_sole_main_material', 'inner_sole_main_material', 'main_material_lining',
    'main_material_upper', 'dimension_uov', 'dimension_uom', 'carried_over_from',
    'drop_date', 'retail_exit_tgt_season', 'product_franchise', 'age_group_descr',
    'brand_descr', 'sub_brand_descr', 'lifecylce_status_prod_descr', 'brand_asset_descr',
    'rmh_retail_class_descr', 'rmh_retail_department_descr', 'rmh_retail_sub_class_descr',
    'rmh_retail_sub_dept_descr', 'rmh_category_descr', 'rmh_gender_descr',
    'rmh_retail_section_descr', 'rmh_product_division_descr', 'rmh_product_type_descr',
    'spm_color_first_descr', 'spm_color_second_descr', 'spm_color_third_descr',
    'spm_color_fourth_descr', 'product_franchise_descr',
]

# Load only those columns; fields in this file are separated by '~'.
dat2 = pd.read_csv(
    'article_reference_data_y2016_18.csv',
    sep='~',
    usecols=reference_columns,
    low_memory=False,
)

In [None]:
# dat2 = dat2[['group_article', 'article_descr']]
# dat = dat[['Kmeans']]
# dat['article'] = dat.index

# dat3 = pd.merge(dat, dat2, left_on= 'article', right_on= 'group_article')
# dat3.columns = ('Kmeans', 'article', 'article2', 'descr')

# dat3[dat3['Kmeans'] == 0]['descr'].value_counts()

# 4 Cluster Prediction

## 4.0 Preprocessing

### 4.0.1 Feature Selection

In [None]:
# dat2['<feature>'].unique()
# len(dat2['<feature'].unique())

# Yes (11)
# ['rmh_product_type_descr', 'rmh_product_division_descr', 'rmh_retail_section_descr',
# 'rmh_gender_descr', 'rmh_category_descr', 'rmh_retail_department_descr',
# 'sub_brand_descr', 'brand_descr', 'age_group_descr', 'age_group', 'gender']

# No (13)
# 'graphic', 'pictogram_composition', 'article_descr', 'lifecylce_status_prod_descr', 'brand_asset_descr', 
# 'rmh_retail_class_descr', 'rmh_retail_sub_class_descr', 'rmh_retail_sub_dept_descr', 'spm_color_first_descr', 
# 'spm_color_second_descr', 'spm_color_third_descr','spm_color_fourth_descr', 'product_franchise_descr' 


# Columns removed from the reference table before modelling (includes the
# "No" list from the notes above).
dropped_columns = [
    'uom_dim', 'drop_season', 'uom_vol', 'uom_wgt', 'material_way_type',
    'inner_sole_main_material', 'main_material_upper', 'outer_sole_main_material',
    'main_material_lining', 'carried_over_from', 'drop_date',
    'graphic', 'pictogram_composition', 'article_descr',
    'lifecylce_status_prod_descr', 'brand_asset_descr',
    'rmh_retail_class_descr', 'rmh_retail_sub_class_descr',
    'rmh_retail_sub_dept_descr', 'spm_color_first_descr',
    'spm_color_second_descr', 'spm_color_third_descr', 'spm_color_fourth_descr',
    'product_franchise_descr',
]
dat2.drop(columns=dropped_columns, inplace=True)


In [None]:
# datK = pd.DataFrame(dat_scaled['Kmeans'])
# dat3 = pd.merge(dat2, datK, left_on = 'group_article', right_index=True)

In [None]:
# Quick look at the reduced reference table: dimensions, number of distinct
# article keys (missing values counted as one key), and column dtypes.
dat2.shape
dat2['group_article'].nunique(dropna=False)
dat2.dtypes

In [None]:
### 4.0.2 Scaling

### 4.0.3 One-hot Encoding

In [None]:
# One-hot encode the reference table.
# NOTE(review): dat2 still contains the 'group_article' key at this point; if
# it is a string column it will be expanded into one indicator per article -
# confirm that is intended.
dat_dummy = pd.get_dummies(data=dat2)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression that predicts the binary 'Kmeans' label from the
# remaining columns of dat3.
# NOTE(review): dat3 is only created in commented-out cells of this notebook,
# so this line raises NameError on a fresh run - confirm how dat3 should be
# built (e.g. from dat_dummy merged with the k-means labels).
log_reg = LogisticRegression(max_iter = 500).fit(dat3.drop('Kmeans', axis = 1), dat3['Kmeans'])

# log_reg = LogisticRegression(max_iter = 500).fit(dat_scaled.drop('Kmeans', axis = 1), np.random.randint(2, size = 46572))

In [None]:
# Mean accuracy of log_reg on the scaled features and their k-means labels.
# NOTE(review): log_reg is fit on dat3 above but scored on dat_scaled here;
# unless both frames hold the same feature columns this will fail or be
# meaningless - confirm which frame is intended.
log_reg.score(dat_scaled.drop('Kmeans', axis = 1), dat_scaled['Kmeans'])