In [2]:
#Import libraries

from functools import lru_cache

import pandas as pd
import numpy as np

from dash import Input, Output, dcc, html
from jupyter_dash import JupyterDash

from kmodes.kprototypes import KPrototypes

import plotly.express as px

In [3]:
#Load the modelling dataset; its features were already scaled and wrangled in the second notebook

doctor_data = pd.read_csv("Data/doctor_data_merge")
#Preview the first rows to confirm the file loaded as expected
doctor_data.head()

Unnamed: 0,Complaint Type,Qty,Region,Category,Rank,Incidence rate,R rate,Experience,Purchases
0,No Complaints,-0.404304,4 15,Specialist,Ambassador,10.941947,-0.29716,1.137682,2.796951
1,No Complaints,-0.404304,1 8 T4,General Practitioner,Ambassador,7.958297,-1.528994,-0.920733,1.975615
2,No Complaints,-0.404304,1 9 T4,Specialist,Ambassador,6.963747,0.565124,-0.920733,1.676947
3,No Complaints,-0.404304,1 10 T3,Specialist,Ambassador,5.720559,1.249476,-0.097367,1.303612
4,No Complaints,-0.404304,1 14 T4,Specialist,Ambassador,4.477371,-0.215038,0.365776,0.930277


In [4]:
#Load the raw doctor records; the chart callbacks below copy this frame and attach cluster labels to it
doctor_raw = pd.read_csv("Data/raw_doctors")
doctor_raw.head(2)

Unnamed: 0,DoctorID,Complaint Type,Qty,Region,Category,Rank,Incidence rate,R rate,Experience,Purchases
0,AHDCBA,No Complaints,0.0,4 15,Specialist,Ambassador,49.0,0.9,1.2,49
1,ABHAHF,No Complaints,0.0,1 8 T4,General Practitioner,Ambassador,37.0,0.0,0.0,38


In [5]:
#Instantiate the Dash application object that the layout and every callback below attach to
app = JupyterDash(__name__)

In [6]:
#Begin to create layout

app.layout = html.Div(
    children=[
        # Page title and the heading of the error-metric section
        html.H1("Doctors Clusters according to selected features"),
        html.H2("Error metric"),

        # Radio buttons: cluster with or without the Rank feature
        html.H3("With or without Rank"),
        dcc.RadioItems(
            id="On/Off Rank",
            options=[
                {"label": "Include Rank features", "value": True},
                {"label": "Without Rank feature", "value": False},
            ],
            value=True,
        ),

        # Slider: number of clusters passed to the model (2 to 16)
        html.H3("Slider for controlling number of clusters"),
        dcc.Slider(id="slider", min=2, max=16, step=1, value=2),

        # Placeholder filled by the print_error callback
        html.H3("Error Metric below"),
        html.H5("Note: It takes a little while to load up"),
        html.Div(id="error-metric"),

        # Grouped bar chart of per-cluster averages
        html.H4("Side by Side Bar Chart displaying Average Values for each cluster"),
        dcc.Graph(id="Bar-chart"),

        # Scatter plot coloured by cluster
        html.H4("Scatter plot of Incidence Rate against Experience, color coded by clusters"),
        dcc.Graph(id="Scatter"),

        # The two charts below are driven by the slider only
        html.H5("Note: The following plots do not take input from the radio button"),
        # Count of each rank inside each cluster
        dcc.Graph(id="countplot"),
        # Average purchases per rank inside each cluster
        dcc.Graph(id="rank-purchase"),
    ]
)

In [7]:
#Define business layer to generate column names based on with/without rank

def business_rank(include_rank = True):
    """Build the feature matrix and categorical column positions for clustering.

    Reads the module-level ``doctor_data`` frame. Column positions are
    derived from the column names, so they stay correct whether or not
    ``Rank`` is present and regardless of column order.

    Parameters
    ----------
    include_rank: bool, default=True
        If ``True`` the ``Rank`` column is kept as a feature.
        If ``False`` the ``Rank`` column is dropped first.

    Returns
    -------
    X: numpy.ndarray
        Object array holding the selected columns of ``doctor_data``;
        every non-categorical column is cast to float.
    cat_index: list of int
        Positions (within ``X``) of the categorical columns.
    """
    #Columns the clustering algorithm must treat as categorical
    categorical_cols = ("Complaint Type", "Region", "Category", "Rank")

    if include_rank:
        features = doctor_data
    else:
        features = doctor_data.drop(columns = "Rank")

    #Locate the categorical columns by name rather than by fixed position
    cat_index = [pos for pos, col in enumerate(features.columns)
                 if col in categorical_cols]

    #Explicit copy: the casts below must never write back into doctor_data
    X = features.to_numpy(dtype=object, copy=True)

    #Every remaining column is numeric, make sure it holds floats
    for pos in range(X.shape[1]):
        if pos not in cat_index:
            X[:, pos] = X[:, pos].astype(float)

    return X, cat_index

In [8]:
@lru_cache(maxsize=32)
def _fit_kprototypes(include_rank, k):
    """Fit K-Prototypes for one (include_rank, k) pair; return (labels, cost)."""
    data, cat_index = business_rank(include_rank=include_rank)
    model = KPrototypes(n_clusters=k, init="Huang", random_state = 42)
    clusters = model.fit_predict(data, categorical = cat_index)
    return clusters, model.cost_


def business_model(include_rank=True, k = 2, return_error = False):
    """
    Returns the cluster labels or the cost of a K-Prototypes fit

    The fit uses a fixed ``random_state``, so the result for a given
    ``(include_rank, k)`` pair is memoised: later calls with the same
    settings reuse it instead of training again. Re-run this cell to
    clear the cache if ``doctor_data`` is reloaded.

    Parameters
    ----------
    include_rank: bool, default=True
    If ``True`` the Rank column is part of the features
    If ``False`` the Rank column is excluded

    k: int, default=2
    Number of clusters the model is trained with

    return_error: bool, default=False
    If ``True`` returns the model cost (``cost_``)
    If ``False`` returns the cluster label of every row

    Returns
    -------
    The fitted model's ``cost_`` when ``return_error`` is ``True``,
    otherwise the array of cluster labels from ``fit_predict``.
    """
    clusters, error = _fit_kprototypes(include_rank, k)

    if return_error:
        return error
    #Return a copy so that callers cannot alter the memoised labels
    return clusters.copy()

In [8]:
#Sanity check: model cost with the Rank feature excluded and the default k=2
business_model(include_rank=False, return_error=True)

2378.349799664885

In [9]:
@app.callback(
    Output("error-metric", "children"),
    Input("On/Off Rank", "value"),
    Input("slider", "value")
)
def print_error(include_rank=True, k = 2):
    """Show the cost of the K-Prototypes fit for the current controls

    Parameters
    ----------
    include_rank: bool, default=True
        Value of the radio buttons; ``True`` keeps the Rank
        column, ``False`` leaves it out.
    k: int, default=2
        Value of the slider; number of clusters to fit.

    Returns
    -------
    An ``html.H3`` element containing the cost rounded to two decimals.
    """
    model_cost = business_model(include_rank=include_rank, k=k, return_error=True)
    rounded_cost = round(model_cost, 2)

    return html.H3(rounded_cost)

In [10]:
@app.callback(
    Output("Bar-chart", "figure"),
    Input("slider", "value"),
    Input("On/Off Rank", "value")
)
def serve_barchart(k=2, include_rank=True):
    """Grouped bar chart of the mean numerical features per cluster

    Parameters
    ----------
    k: int, default=2
        Value of the slider; number of clusters to fit.
    include_rank: bool, default=True
        Value of the radio buttons; ``True`` keeps the Rank
        column, ``False`` leaves it out.

    Returns
    -------
    A plotly figure with one bar group per cluster.
    """
    numeric_cols = ["Qty", "Incidence rate", "R rate", "Experience", "Purchases"]

    #Attach the cluster label of each row to a fresh copy of the raw records
    labelled = doctor_raw.assign(
        clusters=business_model(include_rank=include_rank, k=k, return_error=False)
    )

    #Average every numerical column inside each cluster
    cluster_means = labelled.groupby("clusters")[numeric_cols].mean()

    return px.bar(cluster_means, barmode="group")

In [11]:
#Preview the raw records to check the column names used by the plotting callbacks
doctor_raw.head()

Unnamed: 0,DoctorID,Complaint Type,Qty,Region,Category,Rank,Incidence rate,R rate,Experience,Purchases
0,AHDCBA,No Complaints,0.0,4 15,Specialist,Ambassador,49.0,0.9,1.2,49
1,ABHAHF,No Complaints,0.0,1 8 T4,General Practitioner,Ambassador,37.0,0.0,0.0,38
2,FDHFJ,No Complaints,0.0,1 9 T4,Specialist,Ambassador,33.0,1.53,0.0,34
3,BJJHCA,No Complaints,0.0,1 10 T3,Specialist,Ambassador,28.0,2.03,0.48,29
4,FJBEA,No Complaints,0.0,1 14 T4,Specialist,Ambassador,23.0,0.96,0.75,24


In [12]:
@app.callback(
    Output("Scatter", "figure"),
    Input("slider", "value"),
    Input("On/Off Rank", "value")
)
def serve_scatter(k=2, include_rank=True):
    """Returns scatter plot of Incidence rate against
    Experience, one colour per cluster

    Parameter
    --------
    k: int, default:2
    The number of clusters that a KP
    algorithm will use.
    include_rank: bool, default:True
    ``If True`` includes the rank column
    ``If False`` does not include rank column
    """
    clusters = business_model(include_rank=include_rank, k=k, return_error=False)

    #Add clusters to a copy of the original dataframe
    function_doctor = doctor_raw.copy()
    function_doctor["clusters"] = clusters
    #Cluster ids are nominal labels: store them as strings so plotly
    #assigns one discrete colour per cluster instead of a continuous colour bar
    function_doctor["clusters"] = function_doctor["clusters"].astype(str)

    #Keep the legend in numerical cluster order
    cluster_order = sorted(function_doctor["clusters"].unique(), key=int)

    #use plotly express to make plot
    fig = px.scatter(function_doctor, x="Incidence rate",
                     y = "Experience", color="clusters",
                     category_orders={"clusters": cluster_order})

    return fig

In [13]:
@app.callback(
    Output("countplot", "figure"),
    Input("slider", "value")
)
def serve_countplot(k = 2):
    """Grouped bar chart counting the doctors of each rank in each cluster

    The clustering always includes the Rank feature here.

    Parameters
    ----------
    k: int, default=2
        Value of the slider; number of clusters to fit.

    Returns
    -------
    A plotly figure with clusters on the x axis and one bar per rank.
    """
    #Label a copy of the raw records with their cluster
    labelled = doctor_raw.copy()
    labelled["clusters"] = business_model(include_rank=True, k=k, return_error=False)

    #Number of rows per (cluster, rank) pair, flattened into plain columns
    rank_counts = labelled.groupby("clusters")["Rank"].value_counts()
    rank_counts = rank_counts.rename("Count").to_frame().reset_index()

    return px.bar(rank_counts, x="clusters", y="Count",
                  color="Rank", barmode="group")

In [35]:
@app.callback(
    Output("rank-purchase", "figure"),
    Input("slider", "value")
)
def serve_rank_purchase(k = 2):
    """Grouped bar chart of the average purchases of each rank in each cluster

    The clustering always includes the Rank feature here.

    Parameters
    ----------
    k: int, default=2
        Value of the slider; number of clusters to fit.

    Returns
    -------
    A plotly figure with clusters on the x axis and one bar per rank.
    """
    #Label a copy of the raw records with their cluster
    labelled = doctor_raw.copy()
    labelled["clusters"] = business_model(include_rank=True, k=k, return_error=False)

    #Mean purchases per (cluster, rank) pair, flattened into plain columns
    mean_purchases = labelled.groupby(["clusters", "Rank"])["Purchases"].mean()
    mean_purchases = mean_purchases.rename("Avg_purchase").to_frame().reset_index()

    return px.bar(data_frame=mean_purchases, x="clusters", y="Avg_purchase",
                  color="Rank", barmode="group")

In [14]:
#Start the app on localhost; the cell output below reports the address it is running on
app.run_server(host = "localhost", mode = "external")

Dash app running on http://localhost:8050/
