In [None]:
#DEEP CLUSTER IMPLEMENTED TO INVESTIGATE ITEMS WITH SIMILAR RELATIONSHIPS TO FUTURE TOTAL SALES~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Evan Brown, Oct 23, 2020

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import keras
import keras.backend as K
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import log
from math import e
import sys
import matplotlib.style as style
from sklearn.metrics import mean_squared_error as mse
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_selection import chi2
from sklearn.metrics import r2_score

In [None]:
# Workbooks to load. Keep exactly one `file_list` assignment active.
#
#~~~~~~~~~~~~~~~~~~~DRIVE~~~~~~~~~~~~~~~~~
# file_list = ['/content/Online Retail.xlsx',
#   ]
#~~~~~~~~~~~~~~~~~~~LOCAL~~~~~~~~~~~~~~~~~
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
file_list = [
    '/home/evan/Desktop/datasets/online sales/Online Retail.xlsx',
    '/home/evan/Desktop/datasets/online sales/Online Retail_II.xlsx',
]

In [None]:
def read_excel_files(file_list,columns, skiprows=None, nrows=1000): #reduce nrows for experimentation
  """Read several Excel workbooks and stack them into one DataFrame.

  Parameters
  ----------
  file_list : iterable of paths, each passed to ``pd.read_excel``.
  columns : column names assigned to every frame after reading (must match the
      number of columns read).
  skiprows, nrows : forwarded unchanged to ``pd.read_excel``.

  Returns
  -------
  The per-file frames concatenated in order; each file's own row index is kept,
  so index labels repeat across files.
  """
  def _load(path):
    frame = pd.read_excel(path, skiprows=skiprows, nrows=nrows)
    frame.columns = columns
    return frame

  return pd.concat([_load(path) for path in file_list])

# NOTE(review): read_excel_files forwards skiprows to pd.read_excel, which skips rows before
# picking the header row (header=0 by default). If each workbook starts with a header row,
# skiprows=1 makes the first data row the header, and it is then overwritten by `columns`,
# so one record per file is dropped. Confirm this is intended.
data = read_excel_files(file_list=file_list,columns=['invoice', 'item_code','item','quantity','date','price','customer','country'], skiprows=1,nrows=50000)

# keep only transactions from outside the United Kingdom
mask = data['country'] != 'United Kingdom'
# reset_index(drop=True) returns a new frame with a clean 0..n-1 index. The concatenated and
# filtered frame otherwise has repeated, gappy labels, and the previous bare `data.reindex()`
# call changed nothing because its result was discarded.
data = data[mask].reset_index(drop=True)
data.head()

In [None]:
#~~~~~~~CONVERT FEATURES TO STRING , AUGMENT AND EXTRACT FEATURES, DELETE IRRELEVANT FEATURES~~~~~

#split item_code into its first run of letters and its first run of digits,
#create total_sales from quantity and price, then drop the source columns.
#assign/drop build a new frame, which avoids chained-assignment warnings on a filtered frame.
item_code_str = data['item_code'].astype(str)
data = data.assign(
    item_code_letter=item_code_str.str.extract('([A-Za-z]+)', expand=False),
    item_code_num=item_code_str.str.extract('([0-9]+)', expand=False),
    total_sales=data['quantity']*data['price'],
).drop(columns=['item_code','quantity','price'])

#the target is the NEXT row's total_sales: feature row i is paired with total_sales of row i+1
target=data['total_sales'].iloc[1:]#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~y  (drops the first target: no earlier feature row exists for it)
data=data.iloc[:-1].copy()#drops the last feature row: no later target exists for it


#tokenize categorical variables (cat.codes are 0-based integers, -1 for missing values)
for col in ['item_code_letter','item_code_num','invoice','customer','date']:
  data[col] = data[col].astype('category').cat.codes
data['country']=data['country'].astype('category').cat.codes+1 #+1 shifts codes so missing values (-1) become 0 and real countries start at 1

#convert item to string
data['item']=data['item'].astype('str')

#************discovered that date is a linear combination of invoice(date=invoice+12). therefore removing invoice variable
#.sum() counts the rows where the relation holds; .count() would only count non-null comparisons,
#i.e. always the number of rows, and so could never disprove the claim
n_matching = int((data['invoice'] == data['date']-12).sum())
print(n_matching, 'of', len(data), 'rows satisfy invoice == date-12')
data = data.drop(columns='invoice')

In [None]:
#QUANTIZE TARGET INTO N_CLASSES

n_classes = 10
#integer bounds strictly outside the observed range, so every target falls inside a bin
min_ = int(np.min(target))-1
max_ = int(np.max(target))+1

#n_classes bins need n_classes+1 edges; with only n_classes edges pd.cut produced n_classes-1 bins.
#The bins are equal-WIDTH over [min_, max_], not percentiles.
#NOTE(review): if equal-population bins were intended, pd.qcut would be the tool - confirm.
bins = np.linspace(min_,max_,n_classes+1)
target_bins = pd.cut(target, bins)

In [None]:
#distinct integer bin codes that actually occur among the targets (pandas uses -1 for a value that fell in no bin)
target_bins.cat.codes.unique()

In [None]:
#number of bin EDGES returned by np.linspace (the number of bins is one less)
bins.shape

In [None]:
#replace the interval categories by their integer bin codes (the class labels).
#Guarded so that re-running this cell after the conversion is a no-op instead of an AttributeError.
if isinstance(target_bins.dtype, pd.CategoricalDtype):
  target_bins=target_bins.cat.codes

In [None]:
#~~~~~~~~~~~~~~~~PREPARE INPUT VECTORS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~ONEHOT on X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#date is kept as a plain code array rather than one-hot: what it mostly carries is the time ordering
date = data['date'].to_numpy()
total_sales = data['total_sales'].to_numpy().astype(np.float32)

#item_code_num and item_code_letter are one-hot encoded together into a single frame
item_code_columns = ['item_code_num','item_code_letter']
item_code_hot = pd.get_dummies(data=data[item_code_columns], columns=item_code_columns)

#customer and country each get their own one-hot frame;
#item stays raw text because it goes through the pretrained text-embedding layer as-is
customer_hot = pd.get_dummies(data['customer'])
country_hot = pd.get_dummies(data['country'])
item = data['item']

In [None]:
#~~~~~~~~~~~~~~~~~~~~INPUT LAYERS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#  0          1           2           3           4
#item, customer_hot, country_hot,item_code_hot,date
#print the array shapes the Input layers below are sized from
for input_shape in [item.shape,customer_hot.shape,country_hot.shape,item_code_hot.shape,date.shape]: print(input_shape)

#per-sample shapes are the array shapes without the leading (row) dimension
input_item = keras.layers.Input(shape=item.shape[1:],dtype=tf.string)#this kerasLayer expects [batch_size, None]
input_customer_hot = keras.layers.Input(shape=customer_hot.shape[1:])
input_country_hot=  keras.layers.Input(shape=country_hot.shape[1:])
input_item_code_hot=keras.layers.Input(shape=item_code_hot.shape[1:])
#`shape` is documented as a shape tuple; (1,) is accepted by every Keras version, a bare int is not
#NOTE(review): input_date is not among the inputs passed to keras.Model in the compile cell - confirm whether date should feed the network
input_date    =    keras.layers.Input(shape=(1,), dtype=tf.float32) #expects [batch, 1]
input_total_sales = keras.layers.Input(shape=(1,), dtype=tf.float32)

In [None]:
#~~~~~~~~~~~~~~~~EMBEDDING LAYERS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#DIMENSION GLOBALS (output width of each embedding branch)
emb_d0 = 50 #pretrained with 50 as output_shape
emb_d1 = 100 #customer branch
emb_d2 =  100 #country branch
emb_d3 =  100 #item-code branch


#emb0: pretrained text embedding loaded from TF Hub, applied to the raw item strings
emb_layer0=hub.KerasLayer('https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1', dtype =tf.string, input_shape =[], output_shape=[emb_d0],name='emb0')
#emb1..emb3: trainable embeddings whose vocabulary size is the width of the matching one-hot input
#NOTE(review): keras Embedding looks up integer INDICES, but these layers are applied to one-hot
#matrices, so every 0/1 entry is used as an index (only table rows 0 and 1 are ever read) and the
#output gains an extra axis of size n_columns. Feeding the integer category codes instead is
#probably what was intended - confirm.
emb_layer1=keras.layers.Embedding(input_dim=input_customer_hot.shape[-1], output_dim = emb_d1,name='emb1')
emb_layer2=keras.layers.Embedding(input_dim=input_country_hot.shape[-1], output_dim = emb_d2,name='emb2')
emb_layer3=keras.layers.Embedding(input_dim=input_item_code_hot.shape[-1], output_dim = emb_d3,name='emb3')

#Batch Normalize~~~~~~~~~~~~~~~~~~
def bn_layer():
  """Build a new BatchNormalization layer with default settings.

  A fresh layer is created on every call, so no two call sites share statistics or weights.
  """
  layer = keras.layers.BatchNormalization()
  return layer

#level 1: each input goes through its own embedding layer, then its own batch-normalization layer
outL1_0 = bn_layer()(emb_layer0(input_item))
outL1_1 = bn_layer()(emb_layer1(input_customer_hot))
outL1_2 = bn_layer()(emb_layer2(input_country_hot))
outL1_3 = bn_layer()(emb_layer3(input_item_code_hot))

In [None]:
#~~~~~~~~~~~dense layers between embedding and flatten layers~~~~~~~~~~~~~~
from tensorflow.keras.constraints import max_norm

def dense_layer():
  """Build a new 100-unit ReLU Dense layer.

  Kernel: He-normal initialisation, L1 regularisation and a max-norm constraint of 1e-4.
  A fresh layer is created on every call, so branches never share weights.
  """
  return keras.layers.Dense(
      units=100,
      activation='relu',
      kernel_initializer='he_normal',
      kernel_regularizer='l1',
      kernel_constraint=max_norm(.0001),
  )

#level 2: one Dense + BatchNormalization per branch, applied to the embedding outputs
outL2_0=bn_layer()(dense_layer()(outL1_0))
outL2_1=bn_layer()(dense_layer()(outL1_1))
outL2_2=bn_layer()(dense_layer()(outL1_2))
outL2_3=bn_layer()(dense_layer()(outL1_3))

# ~~~~~~~~~~~~~~~~~FLATTEN~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#level 3: collapse every branch to one vector per sample so the branches can be concatenated later
outL3_0 = keras.layers.Flatten(name='flat0')(outL2_0)
outL3_1 = keras.layers.Flatten(name='flat1')(outL2_1)
outL3_2 = keras.layers.Flatten(name='flat2')(outL2_2)
outL3_3 = keras.layers.Flatten(name='flat3')(outL2_3)

#Batch Normalize~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~DENSE~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#level 4: a second Dense + BatchNormalization per branch, on the flattened vectors
outL4_0=bn_layer()(dense_layer()(outL3_0))
outL4_1=bn_layer()(dense_layer()(outL3_1))
outL4_2=bn_layer()(dense_layer()(outL3_2))
outL4_3=bn_layer()(dense_layer()(outL3_3))
#no 4 (there is no branch for input 4)
#no 5 (there is no branch for input 5)

In [None]:
#~~~~~~~~~CONCATENATE 1~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#join the four per-branch outputs along axis 1 into a single feature vector per sample
concat1 = keras.layers.Concatenate(axis=1)([outL4_0,outL4_1,outL4_2,outL4_3])

In [None]:
#~~~~~~~~~~~EXTRA DENSE~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#two more Dense + BatchNormalization stages on the merged branch features
outL5 = bn_layer()(dense_layer()(concat1))
outL6 = bn_layer()(dense_layer()(outL5))
# outL7 = keras.layers.Dense(1, activation=None)(outL6)#***not used #output of this layer represents the features learned from item names and category codes and countries

In [None]:
#~~~~~~~~CONCATENATE 2~~~~~~~~~~~
#appends the current total_sales to the learned features outL6 (outL7 is commented out and not used),
#so the output layer can learn a weighted combination of both; this tensor is also returned by the model as the embedding
concat2=keras.layers.Concatenate(axis=1)([outL6,input_total_sales])

In [None]:
#~~~~~~~~OUTPUT~~~~~~~~~~~~~~~~~~~~~~
#softmax over n_classes: one probability per target bin / pseudo-label
output = keras.layers.Dense(n_classes, activation='softmax',name='out')(concat2)

In [None]:
#~~~~~~~~~COMPILE~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#NOTE(review): clear_session() runs AFTER the layers above were created; it is normally called
#before building a graph. Left in place to keep behaviour unchanged - confirm it is still wanted here.
K.clear_session()
#two outputs: concat2 (the embedding that gets clustered) and output (class probabilities)
model= keras.Model(inputs=[input_item,input_customer_hot,input_country_hot,input_item_code_hot,input_total_sales], outputs=[concat2,output])
#`learning_rate` is the current argument name; `lr` is a deprecated alias
optimizer=keras.optimizers.Nadam(learning_rate=0.01,clipvalue=.01, clipnorm=.25)      #minimum lr=0.005   ~~~~~~~~~~~~~~~~~~~~~~   ******set lr here*****
#losses are matched to outputs by position: no loss on the embedding (concat2), cross-entropy on the
#softmax head. The previous order put the cross-entropy on the embedding and nothing on the class output.
model.compile(optimizer=optimizer, loss=[None,'sparse_categorical_crossentropy'],loss_weights=[0.0, 1.])
keras.utils.plot_model(model, "my_fashion_mnist_model.png", show_shapes=True)

In [None]:
#convenience aliases for the feature frame and the binned target
#NOTE(review): the training loop further down reads item/customer_hot/.../target_bins directly and `y` is
#reassigned in the plotting cell, so these aliases look unused - confirm before relying on them
X= data
y= target_bins

In [None]:
#class labels as a NumPy array so they can be indexed with integer position arrays;
#np.asarray is a no-op on an array, so this cell can be re-run (`.values` fails on the second run)
target_bins = np.asarray(target_bins)

In [None]:
#item descriptions as a NumPy array so they can be indexed with integer position arrays;
#np.asarray is a no-op on an array, so this cell can be re-run (`.values` fails on the second run)
item = np.asarray(item)


In [None]:
def random_batch(X, y, batch_size=32):
    """Draw one random mini-batch.

    X is a sequence of equally long arrays (one per model input) and y the matching label
    array. `batch_size` row positions are drawn uniformly WITH replacement from the length
    of X[0]; the same positions are applied to every array in X and to y.

    Returns ([x_batch, ...], y_batch).
    """
    n_rows = len(X[0])
    picks = np.random.randint(n_rows, size=batch_size)
    batch_inputs = []
    for arr in X:
        batch_inputs.append(arr[picks])
    return batch_inputs, y[picks]
#cross-entropy between integer class labels and predicted class probabilities, called directly in the custom training loop
loss_fn = keras.losses.sparse_categorical_crossentropy
#running mean of the per-step loss, shown on the progress bar
mean_loss = keras.metrics.Mean()

In [None]:
#inspect the second-to-last layer of the model
model.layers[-2]

In [None]:
#number of labels; should equal the number of feature rows
len(target_bins)

In [None]:
#shape of the label array
target_bins.shape

In [None]:
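#TODO: instead of drawing random batches, iterate sequentially over the shuffled instances so that every
#instance passes through the model once per epoch and its embedding can be collected for the next epoch.
#--- This would be faster than the current implementation, which runs the model over all instances again at the end of each epoch.
#--- The only tradeoff is that the collected embeddings come from slightly different weights (the model is updated between batches), so instances are slightly dependent.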

In [None]:

import faiss
from sklearn.cluster import KMeans
from tqdm.notebook import trange
from collections import OrderedDict

#DeepCluster-style training. Every epoch:
#  1. train the classifier on the current labels (the binned targets in epoch 1, pseudo-labels afterwards)
#  2. embed ALL instances, PCA-whiten and L2-normalise the embedding
#  3. k-means the result; the cluster ids become the labels for the next epoch

metrics = []#[keras.metrics.MeanAbsoluteError()]
n_epochs = 3
batch_size = 1024
n_steps = max(1, len(data)//batch_size) #at least one step even with fewer rows than batch_size
embedding = None
y_target = target_bins #initial target - pseudo labels (clusters) used on subsequent iterations
#pseudo-labels are fed to a softmax with n_classes units, so the cluster count must match it
kmeans = KMeans(n_clusters=n_classes)

#one array per model input, in the order the inputs were passed to keras.Model:
#item, customer_hot, country_hot, item_code_hot, total_sales.
#`date` is NOT a model input; passing it as well shifted it into the total_sales slot.
model_inputs = [item,customer_hot.values,country_hot.values,item_code_hot.values,total_sales]


with trange(1, n_epochs + 1, desc="All epochs") as epochs:
    for epoch in epochs:
        with trange(1, n_steps + 1, desc="Epoch {}/{}".format(epoch, n_epochs)) as steps:
            for step in steps:
                #batch_size must be passed explicitly, otherwise random_batch falls back to its default of 32
                X_batch,y_batch= random_batch(model_inputs,y_target,batch_size=batch_size)

                with tf.GradientTape() as tape:
                    #training=True so BatchNormalization uses batch statistics and updates its moving averages;
                    #the embedding output is not needed during the gradient step
                    _, y_pred = model(X_batch, training=True)

                    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
                    #add the layers' regularisation losses (L1 on the Dense kernels)
                    loss = tf.add_n([main_loss] + model.losses)
                gradients = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))
                #weight constraints are not applied by apply_gradients; enforce them by hand
                for variable in model.variables:
                    if variable.constraint is not None:
                        variable.assign(variable.constraint(variable))
                status = OrderedDict()
                mean_loss(loss)
                status["loss"] = mean_loss.result().numpy()
                for metric in metrics:
                    metric(y_batch, y_pred)
                    status[metric.name] = metric.result().numpy()
                steps.set_postfix(status)

        #generate embedding and predictions for full dataset
        embedding,epoch_predictions = model.predict(model_inputs)
        #faiss works on contiguous float32 matrices
        embedding = np.ascontiguousarray(embedding, dtype=np.float32)

        # Apply PCA with whitening; the input width is read from the embedding instead of being hard-coded
        mat = faiss.PCAMatrix(d_in=embedding.shape[1], d_out=15, eigen_power=-0.5)
        mat.train(embedding)
        embedding_pca = mat.apply_py(embedding)

        #l2 normalize pca embedded instances (the floor avoids 0/0 = NaN for an all-zero row)
        norm = np.linalg.norm(embedding_pca, axis=1)
        embedding_l2 = embedding_pca / np.maximum(norm, 1e-12)[:, np.newaxis]

        #Kmeans cluster to get pseudo labels
        kmeans.fit(embedding_l2)
        y_target = kmeans.predict(embedding_l2)

        for metric in [mean_loss] + metrics:
            metric.reset_states()

In [None]:
#final epoch's embedding of the training instances: (number of instances, width of the model's concat2 output)
embedding.shape

In [None]:
#~~~~~~PLOT CLUSTERED REPRESENTATIONS OF INSTANCES IN 3D, LABEL EACH BY PRODUCT NAME~~~~~~~

In [None]:
%matplotlib inline
from mpl_toolkits import mplot3d
from sklearn.cluster import KMeans
import random 
def visualize_emb(embeddings, dim_reduce, labels,label_name):
  """Reduce `embeddings` to 3-D, split the points into two k-means clusters and plot them.

  Parameters
  ----------
  embeddings : 2-D array, one row per instance. Row i is reported alongside row i of the
      global `data` frame (presumably the callers pass the first rows of an encoding of
      `data` - TODO confirm).
  dim_reduce : object with ``fit_transform`` returning at least 3 columns
      (e.g. ``PCA(n_components=3)``).
  labels : sequence of text labels, one per row, drawn next to the points.
  label_name : text appended to the plot title.

  Prints the item/total_sales rows of both clusters and their mean total_sales,
  then shows one 3-D scatter plot.
  """
  embedded_3d = dim_reduce.fit_transform(embeddings)

  kmeans = KMeans(n_clusters=2)
  kmeans.fit(embedded_3d)
  clustered_instances = kmeans.predict(embedded_3d)
  #cluster id 1 -> True ("cluster 1"), cluster id 0 -> False ("cluster 2")
  clustered_instances = np.array(clustered_instances).astype(bool)

  #take as many rows as there are embedded points instead of relying on a global n_points
  rows = data[['item','total_sales']].iloc[:len(embedded_3d)]
  cluster1 = rows[clustered_instances]
  cluster2 = rows[~clustered_instances]

  print('cluster 1:',cluster1)
  print('cluster 2:',cluster2)
  print('mean 1:', np.mean(cluster1['total_sales']),
        'mean 2:', np.mean(cluster2['total_sales']))

  fig = plt.figure()
  #add_subplot attaches the 3-D axes to the figure; a bare Axes3D(fig) is no longer added automatically in recent Matplotlib
  ax = fig.add_subplot(111, projection='3d')

  xs = embedded_3d[:,0]
  ys = embedded_3d[:,1]
  zs = embedded_3d[:,2]

  #all three components are plotted (previously the third one was computed but never drawn)
  ax.scatter(xs, ys, zs)
  ax.set_title('embedding: '+str(label_name))

  #Axes.annotate only accepts 2-D points; Axes3D.text places a label at (x, y, z)
  for i, txt in enumerate(labels):
    ax.text(xs[i]+random.uniform(-.001,.001),
            ys[i]+random.uniform(-.001,.001),
            zs[i]+random.uniform(-.001,.001),
            str(txt))#adjust random uniform range to add variability to annotation location
  #show once, after every label has been added (it used to be called inside the loop)
  plt.show()


#plot only the first n_points instances so the text labels stay readable
n_points = 100
#item descriptions used as point labels (note: this rebinds the name `y`)
y = data['item'][:n_points]
from sklearn.decomposition import PCA
#the same 3-component PCA object is refit inside every visualize_emb call
pca = PCA(n_components=3)
# visualize_emb(customer_hot[:n_points],tsne, y.values, 'tsne customer')
visualize_emb(country_hot[:n_points].values,pca,  y.values,'pca country')
visualize_emb(item_code_hot[:n_points].values,pca, y.values, 'pca item code')