In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, BoxSelectTool
from bokeh.models.mappers import CategoricalColorMapper
from bokeh.layouts import column

In [2]:
# Notebook housekeeping: switch off periodic autosave, keep matplotlib output
# inline, and make Bokeh render its figures in the notebook output cells.
%autosave 0
%matplotlib inline
output_notebook()

Autosave disabled


In [3]:
# Load the data set from a path relative to the working directory; the frame is
# handed to ClusterPlot, which clusters on its numeric columns.
df = pd.read_csv('iris.csv')

In [4]:
class ClusterPlot:
    """
    Hierarchical clustering of a DataFrame with two Bokeh views sharing one source.

    The numeric columns of the input frame are clustered with scipy's ``linkage``.
    ``_dendrogram`` draws the resulting tree and ``_pcp`` draws a parallel
    coordinates plot of the same observations. Both figures read from
    ``self.source`` (one row per observation).

    Attributes set by the constructor:
        df          - the frame passed in (not modified)
        method      - linkage method name passed to scipy
        num_df      - numeric columns of ``df`` (the clustering input)
        Z           - scipy linkage matrix
        num_aliases - integer x positions standing in for the numeric columns
        source      - ColumnDataSource with the original columns plus
                      dend_x, dend_y, pcp_xs, pcp_ys, clusters, pcp_color
    """

    #brewer palette family used to colour the clusters
    PALETTE_NAME = 'Set1'

    def __init__(self, orig_df, linkage_method):
        """
        Cluster ``orig_df`` and build the shared data source.

        orig_df        - DataFrame with at least one numeric column
        linkage_method - any method name accepted by scipy's ``linkage``
        """
        self.df = orig_df
        self.method = linkage_method
        #only numeric columns can take part in the distance computation
        self.num_df = orig_df.select_dtypes(include='number')
        self.Z = linkage(self.num_df.values, self.method)

        #dendrogram part of the "source"
        #scipy dendrogram dictionary object (coordinates only, nothing is drawn)
        self._dend = dendrogram(self.Z, no_plot=True)
        #'leaves' lists the row positions of the observations from left to right
        leaves = np.asarray(self._dend['leaves'], dtype=int)
        n_obs = len(self.num_df)

        #the i-th leaf from the left sits at x = 5 + 10 * i; assign by row position
        #so the result does not depend on the index labels of orig_df
        dend_x = np.empty(n_obs, dtype=int)
        dend_x[leaves] = np.arange(5, n_obs * 10 + 5, 10)
        dend_df = pd.DataFrame({'dend_x': dend_x, 'dend_y': 0}, index=self.df.index)

        #pcp part of the source: every observation is one line across all axes
        self.num_aliases = list(range(len(self.num_df.columns)))
        pcp_df = pd.DataFrame(
            {'pcp_xs': [list(self.num_aliases) for _ in range(n_obs)],
             'pcp_ys': self.num_df.values.tolist()},
            index=self.df.index)

        #merge everything into one source; all pieces carry the same index
        source_df = pd.concat([self.df, dend_df, pcp_df], axis=1)
        source_df['clusters'] = 0
        source_df['pcp_color'] = 'black'
        self.source = ColumnDataSource(source_df)

    @staticmethod
    def _normalize(df):
        """
        Min-max scale every column of ``df`` to [0, 1] and return a new frame.

        Constant columns (max == min) are mapped to 0 instead of NaN.
        """
        lowest = df.min()
        span = df.max() - lowest
        #avoid 0 / 0 for constant columns
        span = span.replace(0, 1)
        return (df - lowest) / span

    @classmethod
    def _cluster_palette(cls, n_colors):
        """
        Return ``n_colors`` colours from the brewer palette family.

        The family only defines a limited set of sizes, so smaller requests take a
        prefix of the smallest palette and larger requests cycle through the
        largest one.
        """
        family = bokeh.palettes.brewer[cls.PALETTE_NAME]
        size = min(max(n_colors, min(family)), max(family))
        base = list(family[size])
        return [base[i % len(base)] for i in range(n_colors)]

    def _dendrogram(self):
        """
        Create an interactive dendrogram in Bokeh

        Returns a figure with the tree, a hover tool showing the merge distance
        at each link and a hover tool showing the non-numeric columns of the
        original frame (or the row index when there are none) at each leaf.
        """
        d = self._dend
        #x axis (numerical)
        xs = d['icoord']
        #distance
        ys = d['dcoord']

        #circle coords for distance tooltips: every link has four points and
        #points 1 and 2 form the horizontal bar, so cx is the mid-point between
        #the two merged clusters and cy is the merge distance
        cx = [link[1] + ((link[2] - link[1]) / 2) for link in xs]
        cy = [link[2] for link in ys]

        #draw the dendrogram with multiple renderers
        dend = figure(plot_width=800, plot_height=400,
                      title='Zoomable dendrogram with tooltips')
        dend.title.align = 'center'
        box_select = BoxSelectTool()
        dend.add_tools(box_select)

        dend.multi_line(xs=xs, ys=ys, line_width=1, line_color='black')

        #invisible circles carry the distance tooltips
        c1 = dend.circle(x=cx, y=cy, fill_color='black', size=10, fill_alpha=0,
                         line_color=None)
        c1_hover = HoverTool(renderers=[c1], tooltips=[('Distance', '@y'),])
        c1_hover.toggleable = False
        dend.add_tools(c1_hover)

        #add another circle renderer at the base of each cluster
        c2 = dend.circle(x='dend_x', y='dend_y', fill_color='black', size=10,
                         fill_alpha=0, line_color=None, source=self.source)
        #describe each leaf by the columns that were not used for clustering
        label_cols = [col for col in self.df.columns if col not in self.num_df.columns]
        if label_cols:
            leaf_tips = [(str(col).title(), '@{%s}' % col) for col in label_cols]
        else:
            leaf_tips = [('Index', '$index')]
        c2_hover = HoverTool(renderers=[c2], tooltips=leaf_tips)
        c2_hover.toggleable = False
        dend.add_tools(c2_hover)

        dend.grid.visible = False
        dend.xaxis.visible = False
        dend.yaxis.minor_tick_line_alpha = 0
        dend.yaxis.axis_label = 'Distance'

        return dend

    def _pcp(self, **kwargs):
        """
        Parallel Coordinates Plot to visualise clusters within original dataset

        Optional keyword arguments:
            - max_d: distance at which to cut the HAC dendrogram. When given, the
              'clusters', 'color' and 'pcp_color' columns of the source are
              updated so every line is drawn in the colour of its cluster. When
              omitted, the source keeps its current colours (black initially).
            - norm: True to min-max scale every column to [0, 1], False (default)
              to plot the raw values.
            - domains: axis labels replacing the column headers of the numeric
              columns; the number of labels must match the number of columns
              used to create the linkage matrix.

        Raises ValueError when 'domains' has the wrong length.
        Returns the Bokeh figure.
        """
        norm = bool(kwargs.get('norm', False))
        max_d = kwargs.get('max_d')
        domains = kwargs.get('domains')

        n_axes = len(self.num_df.columns)
        if domains is None:
            domains = self.num_df.columns.values
        elif len(domains) != n_axes:
            raise ValueError('expected %d domains, got %d' % (n_axes, len(domains)))
        axis_labels = [str(name) for name in domains]

        #always rewrite pcp_ys so an earlier normalised call does not leak into this one
        values = self._normalize(self.num_df) if norm else self.num_df
        self.source.data['pcp_ys'] = values.values.tolist()

        #vertical extent of the axes depends on whether the data was normalised
        if norm:
            y_lo, y_hi = 0, 1
        else:
            y_lo = float(values.values.min())
            y_hi = float(values.values.max())
            if y_lo == y_hi:
                #degenerate range: give the plot some height
                y_lo, y_hi = y_lo - 0.5, y_hi + 0.5

        #add clusters and colours to the source
        #full list of palettes: https://bokeh.pydata.org/en/latest/docs/reference/palettes.html
        if max_d is not None:
            cluster_ids = fcluster(self.Z, max_d, criterion='distance')
            distinct = np.unique(cluster_ids)
            color_dict = dict(zip(distinct.tolist(), self._cluster_palette(len(distinct))))
            colors = [color_dict[c] for c in cluster_ids.tolist()]
            self.source.data['clusters'] = cluster_ids
            self.source.data['color'] = colors
            #pcp_color is the column the lines are actually drawn with
            self.source.data['pcp_color'] = list(colors)

        #multiline doesn't work with a categorical x-axis so we need to alias x-axis values.
        p = figure(plot_width=800, plot_height=400,
                   x_range=(0, n_axes - 1), y_range=(y_lo, y_hi))

        p.title.text = 'Parallel coordinates plot in Bokeh'
        p.title.align = 'center'

        #format x and y axis, including overwriting labels
        p.xaxis.ticker = self.num_aliases
        p.xaxis.major_label_overrides = dict(zip(self.num_aliases, axis_labels))
        p.yaxis.axis_label = "Y-Axis"
        p.yaxis.minor_tick_line_color = None

        #create renderers: segment for PCP "axis" and multi-line for actual lines
        p.segment(x0=self.num_aliases, y0=[y_lo] * n_axes, x1=self.num_aliases,
                  y1=[y_hi] * n_axes, color="grey", line_width=2)

        p.multi_line(xs='pcp_xs', ys='pcp_ys',
                     line_width=1, line_alpha=0.4, line_color='pcp_color',
                     source=self.source)

        p.grid.visible = False

        return p

In [5]:
# Cluster the loaded frame with Ward linkage and build the shared plot source.
cp = ClusterPlot(orig_df=df, linkage_method='ward')

In [6]:
# Stack the dendrogram above the parallel coordinates plot (normalised axes,
# tree cut at distance 3) and render both in the notebook.
dendrogram_fig = cp._dendrogram()
pcp_fig = cp._pcp(norm=True, max_d=3)
show(column(dendrogram_fig, pcp_fig))