# Exploratory Data Analysis

In [None]:
import os, sys, import_ipynb, ipynb
# Switch into the project's code directory unless the process is already
# there (presumably so the project-local import that follows resolves --
# TODO confirm).
if os.getcwd() != 'C:\\Users\\admin\\Desktop\\retail_data_analysis\\code':
    os.chdir('..//code')
else:
    print("Already in code directory")
        
# import python libraries
from importLibraries import *

# Compute Gower distance
#Ref: https://github.com/matchado/Misc/blob/master/gower_dist.py
def gower_distance(X):
    """
    Return the pairwise Gower distance between the rows of a pandas DataFrame.

    The data frame is to contain the features along the columns. One distance
    matrix is computed per column and the matrices are averaged with equal
    weight.

    Columns of object, string or categorical dtype are treated as nominal
    variables; all other columns are treated as numeric variables.

    Distance used for:
    Nominal variables: 0 when two rows hold the same category, 1 otherwise.
        For a single nominal variable this equals the Dice distance between
        the one-hot encoded rows. Missing values compare equal to each other
        and different from every real category.
    Numeric variables: absolute difference (Manhattan distance on one column)
        divided by the range of the column. A constant column has range 0 and
        contributes a distance of 0 everywhere instead of dividing by zero.
        Missing numeric values are not handled and yield NaN distances.

    Parameters
    ----------
    X : pandas.DataFrame
        One row per observation, one column per feature.

    Returns
    -------
    numpy.ndarray
        Square (n_rows, n_rows) matrix of pairwise distances.
    """
    per_feature_distances = []
    for _, column in X.items():
        dtype = column.dtype
        is_nominal = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_nominal:
            # Integer codes make the category comparison a cheap broadcast.
            codes = pd.factorize(column)[0]
            feature_dist = (codes[:, None] != codes[None, :]).astype(float)
        else:
            values = column.to_numpy(dtype=float)
            feature_dist = np.abs(values[:, None] - values[None, :])
            value_range = np.ptp(values)
            # With a zero range every difference is already 0; skipping the
            # division avoids 0/0 = NaN poisoning the averaged result.
            if value_range != 0:
                feature_dist = feature_dist / value_range
        per_feature_distances.append(feature_dist)
    return np.array(per_feature_distances).mean(0)


# Plot fancy dendrogram
def fancy_dendrogram(*args, **kwargs):
    """
    Draw a dendrogram and annotate its merges with their distances.

    Thin wrapper around ``dendrogram``: all positional and keyword arguments
    are forwarded to it, except for the two extra keywords below, which are
    consumed here.

    Extra keyword arguments
    -----------------------
    max_d : number, optional
        Cut-off distance. When truthy, a horizontal black line is drawn at
        this height and, unless the caller passed ``color_threshold``
        explicitly, it is also used as ``color_threshold``.
    annotate_above : number, default 0
        Only merges whose height is strictly greater than this value get a
        marker and a distance label.

    When the caller forwards ``ax=...`` to ``dendrogram``, the title, axis
    labels, annotations and cut-off line are drawn on that same axes;
    otherwise the current pyplot axes is used. With ``no_plot=True`` nothing
    is drawn.

    Returns
    -------
    dict
        The data dictionary returned by ``dendrogram``.
    """
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)
    if kwargs.get('no_plot', False):
        return ddata

    # Decorate the axes the dendrogram was drawn on, not whichever axes
    # happens to be current in pyplot.
    ax = kwargs.get('ax')
    if ax is None:
        ax = plt.gca()

    ax.set_title('Hierarchical Clustering Dendrogram (truncated)')
    ax.set_xlabel('sample index or (cluster size)')
    ax.set_ylabel('distance')
    for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
        # Each link is a U shape: its horizontal bar spans i[1]..i[2] at
        # height d[1], so the marker goes at the middle of that bar.
        x = 0.5 * sum(i[1:3])
        y = d[1]
        if y > annotate_above:
            ax.plot(x, y, 'o', c=c)
            ax.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                        textcoords='offset points',
                        va='top', ha='center')
    if max_d:
        ax.axhline(y=max_d, c='k')
    return ddata



def fun_cluster():
    """
    Hierarchically cluster the first 400 pre-processed rows and plot the result.

    Steps:
    1. Change the working directory to ``../input`` and load
       ``input_data_preProcessed.csv``.
    2. Keep the columns Quantity, revenue, Country, noun_1 and nearestConcept
       of the first 400 rows and compute their pairwise Gower distance.
    3. Run average-linkage clustering on that distance matrix and show a
       truncated dendrogram.
    4. Plot the last 10 merge distances and their second difference (elbow
       method) and display the suggested number of clusters.
    5. Show the annotated dendrogram and cut the tree into 3 flat clusters.

    Side effects: changes the process working directory, opens matplotlib
    figures and calls ``display``.

    Returns
    -------
    numpy.ndarray
        Flat cluster label for each of the clustered rows, as returned by
        ``fcluster`` with ``criterion='maxclust'``.
    """
    # Local import: the condensed-matrix conversion is only needed here.
    from scipy.spatial.distance import squareform

    # import dataset
    os.chdir('../input')
    input_data_preProcessed_chunk = pd.read_csv(
        'input_data_preProcessed.csv',
        chunksize=10000,
        iterator=True,
        header=0,
        skipinitialspace=True,
        index_col=False,
    )
    input_data_preProcessed = pd.concat(input_data_preProcessed_chunk, ignore_index=True)
    tmpdf = input_data_preProcessed[['Quantity', 'revenue', 'Country', 'noun_1', 'nearestConcept']]

    display("Head of dataset : ", tmpdf.head())

    # The pairwise distance matrix is quadratic in the row count, so only the
    # first 400 rows are clustered.
    tmpdf = tmpdf.head(400)

    gowerDist = gower_distance(tmpdf)

    # Hierarchical clustering using the linkage function.
    # linkage() interprets a 2-D array as raw observation vectors, so the
    # square Gower matrix must be converted to condensed form first;
    # otherwise the tree is built from Euclidean distances between the rows
    # of the distance matrix rather than from the Gower distances themselves.
    condensed = squareform(gowerDist)
    linked = linkage(condensed, method='average', optimal_ordering=True)

    # Plot typical dendrogram.
    # Horizontal lines are cluster merges, vertical lines indicate which
    # clusters were part of the merge forming the new cluster, and the height
    # of a horizontal line is the distance at which the merge happened.
    plt.figure(figsize=(10, 8))
    plt.title('Hierarchical Clustering Dendrogram truncated')
    plt.xlabel('cluster size')
    plt.ylabel('distance')
    dendrogram(
        linked,
        truncate_mode='lastp',  # show only the last p merged clusters
        p=30,
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_leaf_counts=True,
        show_contracted=True,  # to get a distribution impression in truncated branches
    )
    plt.show()

    # Elbow method to find automated cut
    # Ref https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
    last = linked[-10:, 2]
    last_rev = last[::-1]
    idxs = np.arange(1, len(last) + 1)
    plt.plot(idxs, last_rev)

    acceleration = np.diff(last, 2)  # 2nd derivative of the distances
    acceleration_rev = acceleration[::-1]
    plt.plot(idxs[:-2] + 1, acceleration_rev)
    plt.show()
    k = acceleration_rev.argmax() + 2  # if idx 0 is the max of this we want 2 clusters
    display("clusters:", k)

    # NOTE(review): max_d=2 and annotate_above=40 are kept as they were, but
    # Gower distances lie in [0, 1], so these thresholds probably need
    # retuning -- confirm.
    plt.figure(figsize=(10, 10))
    fancy_dendrogram(
        linked,
        truncate_mode='lastp',
        p=30,
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,
        annotate_above=40,
        max_d=2,
    )
    plt.show()

    # Retrieve the clusters.
    # NOTE(review): the elbow estimate k above is only displayed; the tree is
    # always cut into 3 clusters -- confirm whether k should be used here.
    clusters = fcluster(linked, 3, criterion='maxclust')
    return clusters

    
    