In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, BoxSelectTool, CDSView, BooleanFilter, Legend
from bokeh.models.mappers import CategoricalColorMapper
from bokeh.layouts import column
import colorcet as cc

In [2]:
output_notebook()

I'll be using the classic Iris [dataset](https://archive.ics.uci.edu/ml/datasets/iris) from UCI Machine Learning Repository.

In [3]:
df = pd.read_csv('data/iris.csv')

All the information about the cluster plot and its methods is encapsulated in the ClusterPlot class

In [4]:
class ClusterPlot:
    
    def __init__(self, orig_df, linkage_method, weights=None):               
        
        self.df = orig_df
        self.method = linkage_method
        self.num_df = orig_df.set_index('observation')[df.columns.values[df.dtypes != 'object']]
            
        if not weights is None:
            #build a cluster df
            
            self.cl_df = self.num_df.copy()
            
            for key, value in weights.items():
                if value > 1:
                    for i in range(value):
                        self.cl_df[f'dup_{key}_{i}'] = self.num_df[key]
        else:
            self.cl_df = self.num_df
        
        self.Z = linkage(self.cl_df, self.method)
        
        #dendrogram part of the "source"
        #scipy dendrogram dictionary object
        self.d = dendrogram(self.Z, no_plot=True) 
        #original observation index from left to right in the dendrogram and convert strings to int
        self.cl = [int(i) for i in self.d['ivl']]
        
        #dendrogram DFs
        #x-range is a linear range with step 10 starting at 5;
        #each step is a leaf; used for adding hover at the base of the graph
        self.c_df = pd.DataFrame(data=dict(dend_x = np.arange(5, len(self.cl) * 10 + 5, 10), dend_y=0), index = self.cl)
        self.d_df = pd.merge(self.df, self.c_df, how='left', left_index=True, right_index=True)
                
        #pcp part of the source
        self.num_aliases = list(range(len(self.num_df.columns.values)))
        self.pcp_xs = [self.num_aliases]*len(self.num_df)
        self.pcp_ys = [list(row.values) for index, row in self.num_df.iterrows()]
        self.pcp_df = pd.DataFrame(data={'pcp_xs':self.pcp_xs, 'pcp_ys':self.pcp_ys})
        
        #merge two DFs into one source
        self.source_df = pd.concat([self.d_df, self.pcp_df], axis=1)
        self.source_df['clusters'] = 0
        self.source_df['pcp_color'] = 'black'
        self.source = ColumnDataSource(self.source_df)        

    def _dendrogram(self):
        """
        Create an interactive dendrogram in Bokeh
        
        The set of 4 coordinates in self.d (icoord for x and dcoord for y)
                                     _
        are for each horseshoe link | |. 
        
        """
        xs = self.d['icoord']
        ys = self.d['dcoord']
            
        #circle coords for distnace tooltips; adjust cx for getting the mid-point between two clusters
        cx = [x[1] + ((x[2]-x[1]) / 2) for x in self.d['icoord']]
        cy = [y[1] for y in self.d['dcoord']]

        #draw the dendrogram with multiple renderers
        dend = figure(plot_width=800, plot_height=400, title='Zoomable dendrogram with tooltips')
        dend.title.align = 'center'
        box_select = BoxSelectTool()
        dend.add_tools(box_select)
        
        #draw the lines connecting all cluster links
        dend.multi_line(xs=xs, ys=ys, line_width=1, line_color='black')

        #invisible circles FTW!
        c1 = dend.circle(x=cx, y=cy, fill_color='black', size=10, fill_alpha=0, line_color=None)
        c1_hover = HoverTool(renderers=[c1], tooltips=[('Distance', '@y'),])
        c1_hover.toggleable = False
        dend.add_tools(c1_hover)

        #add anohter circle renderer at the base of each cluster
        c2 = dend.circle(x='dend_x', y='dend_y', fill_color='black', size=10,
                         fill_alpha=0, line_color=None, source=self.source)
        
        c2_hover = HoverTool(renderers=[c2], tooltips=[('Species', '@species'),
                                                       ('Observation', '@observation'),])
        c2_hover.toggleable = False
        dend.add_tools(c2_hover)

        dend.grid.visible = False
        dend.xaxis.visible = False
        dend.yaxis.minor_tick_line_alpha = 0
        dend.yaxis.axis_label = 'Distance'

        return dend
    
    def _pcp(self, **kwargs):
        """
        Parallel Coordinates Plot to visualise clusters within original dataset

        You need to provide the constructor with a few things:
            - max_d is the optional distance at which to cut the HAC dendrogram
            - norm is whether to use Normalize function or not: True/False

        By default, the axes for the PCP will be taken from the column headers in DF;
        You can overwrite them by adding in an optional parameter 'domains=[]'. The 
        number of user-set domains must match the number of columns used to create the
        linkage matrix.
        """
        df = self.num_df

        #if ('domains' in kwargs) & (len(kwargs['domains']) == len(df.columns.values)):
        domains = self.num_df.columns.values
        #implement custom domains passed as keyword argument

        def normalize(df):
            result = df.copy()
            for feature_name in df.columns:
                max_value = df[feature_name].max()
                min_value = df[feature_name].min()
                result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
            return result

        if 'norm' in kwargs:
            df = normalize(df)
            self.source.data['pcp_ys'] = [list(row.values) for index, row in df.iterrows()]

        #add clusters and colours to the df
        #full list of palettes: https://bokeh.pydata.org/en/latest/docs/reference/palettes.html

        if 'max_d' in kwargs:
            self.source.data['clusters'] = fcluster(self.Z, kwargs['max_d'], criterion='distance')
            cl_colors = pd.Series(fcluster(self.Z, kwargs['max_d'], criterion='distance'))

        palette = cc.glasbey[0:self.source.data['clusters'].max()]
        cl = list(cl_colors.unique())
        color_dict = dict(zip(cl, palette))

        self.source.data['color'] = [color_dict.get(v, None) for v in self.source.data['clusters']]

        #multiline doesn't work with a categorical x-axis so we need to alias x-axis values.
        #y_range bounds depend on whether the dataset was normalised or not - TO DO
        p = figure(plot_width=800, plot_height=400, x_range=(0,len(domains)-1), y_range=(0,1))
        
        p.title.text = 'Parallel coordinates plot in Bokeh'
        p.title.align = 'center'

        #format x and y axis, including overwriting labels
        p.xaxis.ticker = self.num_aliases
        p.xaxis.major_label_overrides = dict(zip(self.num_aliases, domains))
        p.yaxis.axis_label = "Y-Axis"
        p.yaxis.minor_tick_line_color = None

        #create renderers: segment for PCP "axis" and multi-line for actual lines
        p.segment(x0=self.num_aliases, y0=[0]*len(domains), x1=self.num_aliases,
                  y1=[1]*len(domains), color="grey", line_width=2)
        
        #tooltip renderer for PCP lines
        pcp_tooltips_df = df.copy()
        pcp_tooltips_df['x'] = 0
        pcp_tooltips_df['species'] = self.df['species'].values
        pcp_tooltips_df.reset_index(inplace=True)

        pcp_tooltips_source = ColumnDataSource(pcp_tooltips_df)
        
        pcp_tooltips_renderer = p.circle(x='x', y='sepal_length', size=5, color='grey', source=pcp_tooltips_source)
        
        pcp_tooltips = HoverTool(renderers=[pcp_tooltips_renderer], tooltips=[('Species', '@species'),
                                                       ('Observation', '@observation'),])
        
        p.add_tools(pcp_tooltips)
        
        #PCP multilines - one renderer per cluster for legends / colours     
        for cluster in pd.Series(self.source.data['clusters']).unique():
            
            booleans = [True if cl == cluster else False for cl in self.source.data['clusters']]
            view = CDSView(source=self.source, filters=[BooleanFilter(booleans)])  
        
            _ = p.multi_line(xs='pcp_xs',
                         ys='pcp_ys',
                         line_width=1,
                         line_alpha=0.4,
                         line_color='color',
                         muted_alpha=0.05,
                         muted_color='grey',
                         source=self.source,
                         view=view)
            _.muted = True
        
        legend_items = [(f"Cluster {cluster}", [p.renderers[2+i]]) for 
                        i, cluster in enumerate(pd.Series(self.source.data['clusters']).unique())]

        legend = Legend(items=legend_items, click_policy="mute", orientation="horizontal", location="bottom_left")
        
        p.add_layout(legend, 'below')

        p.grid.visible = False

        return p

The ClusterPlot class accepts an optional weights argument. Some clustering algorithms are more sensitive to weighted data than others. For example, this [paper](http://mayaackerman.info/pub/weightedClustering.pdf) by Margareta Ackerman, Shai Ben-David, Simina Branzei, and David Loker suggests that Ward algorithm is weight-sensitive so we'll use that.

Lets set default weights of 1 to start with...

In [5]:
weights = dict(zip(df.columns.values[:-1], [1]*4))

weights

{'sepal_length': 1, 'sepal_width': 1, 'petal_length': 1, 'petal_width': 1}

Create an instance of the ClusterPlot with our data and default weights...

In [6]:
cp = ClusterPlot(df, 'ward', weights=weights)

Generate the dendrogram and a parallel coordinates plot...

In [7]:
show(column(cp._dendrogram(), cp._pcp(norm=True, max_d=7)))

You can see that the three clusters follow the classic split. But what if you wanted to emphasize the `sepal_width` feature and bias the clustering towards that? The expected result would that the red cluster would split into two. Lets start by changing the weight of `sepal_width` to 10.

In [8]:
weights['sepal_width'] = 10

weights

{'sepal_length': 1, 'sepal_width': 10, 'petal_length': 1, 'petal_width': 1}

In [9]:
cp = ClusterPlot(df, 'ward', weights=weights)

You can see that at the same distance, the cluster had split based on weight:

In [10]:
show(column(cp._dendrogram(), cp._pcp(norm=True, max_d=7)))

In [11]:
pd.DataFrame(cp.source.data)[['species', 'clusters']].groupby('clusters').size()

clusters
1    18
2    32
3    31
4    21
5    48
dtype: int64