# DATA1030 Interactive Classification Plots

In [1]:
# import libraries and packages
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost
import plotly
import plotly.graph_objs as go
import ipywidgets
from ipywidgets import interact





  from pandas import MultiIndex, Int64Index


## Function to generate contour plot data

This function fits a given ML classifier on given input data and calculates the predicted classification probabilities within a specified grid.
Returns a dictionary of 2d arrays of predicted probabilities within grid for a given ML algorithm, input data, and hyperparameter grid.

Inputs: 
- input data X
- input data y
- ML algorithm
- hyperparameter grid
- contour plot grid step size 'h'

Output:
- dictionary of 2D arrays of predicted probabilities, keyed by the string form of the sorted list of (hyperparameter name, value) pairs


In [2]:
def generate_all_data(X, y, ml_algo, param_grid, h):
    """Fit a classifier for every hyperparameter combination and score a mesh.

    For each combination yielded by ``ParameterGrid(param_grid)`` the
    classifier is configured with ``set_params``, fitted on ``(X, y)``, and
    column 1 of its ``predict_proba`` output is evaluated on a regular mesh
    that extends 0.5 beyond the range of ``X[:, 0]`` and ``X[:, 1]``.

    Parameters
    ----------
    X : 2D array
        Input features. The mesh is built from columns 0 and 1 and has two
        columns, so the classifier is queried with two features.
    y : array
        Targets passed to ``fit``.
    ml_algo : estimator
        Object exposing ``set_params``, ``fit`` and ``predict_proba``. It is
        reconfigured and refitted in place, so after the call it holds the
        last hyperparameter combination of the grid.
    param_grid : dict
        Hyperparameter grid, passed to ``ParameterGrid``.
    h : float
        Step size of the mesh; must be positive.

    Returns
    -------
    dict
        Maps ``str(sorted(params.items()))`` to the 2D array of predicted
        probabilities, shaped like the mesh.

    Raises
    ------
    ValueError
        If ``h`` is not positive.
    """
    # a zero or negative step cannot produce a usable mesh; fail early
    if h <= 0:
        raise ValueError(f"grid step size h must be positive, got {h!r}")

    # set plot range to slightly larger than data range
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

    # create meshgrid and flatten it once into an (n_points, 2) query matrix;
    # it is identical for every hyperparameter combination
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    mesh_points = np.c_[xx.ravel(), yy.ravel()]

    # dictionary of 2D arrays of predicted probabilities
    Z_dict = {}

    # loop through all hyperparameter combinations
    for params in ParameterGrid(param_grid):
        # configure and fit the classifier on the given input data
        ml_algo.set_params(**params)
        ml_algo.fit(X, y)

        # predicted probabilities (column 1 of predict_proba) on the mesh
        Z = ml_algo.predict_proba(mesh_points)[:, 1]

        # string key built from the sorted (name, value) pairs
        param_key = str(sorted(params.items()))
        Z_dict[param_key] = Z.reshape(xx.shape)

    return Z_dict


## Interactive Plot Function


This function plots an interactive contour plot of predicted classification probabilities against a scatterplot of input data for a given ML classifier's hyperparameter grid, input data, dictionary of 2D predicted probability arrays, and grid step size.

The interactive plot uses Plotly's FigureWidgets to create a variable number of sliders to adjust each of the ML classifier's hyperparameters. As the hyperparameter values are changed, the contour plot changes to reflect the ML model's classification probabilities for the given hyperparameter combination. Thus, the plot interactively shows how different hyperparameter combinations affect each ML model's classification. In addition, the 0.5 probability decision boundary is plotted in a bold black line.

Inputs: 
- hyperparameter grid
- input data X
- input data y
- dictionary of 2D arrays of predicted probabilities, keyed by the string form of the sorted list of (hyperparameter name, value) pairs
- contour plot grid step size 'h'

Output:
- interactive figure with a slider for each hyperparameter controlling the contour plot of predicted classification probabilities on top of a scatterplot of the input data

In [3]:
def plot_clf_contour(param_grid, X, y, Z_dict, h):
    """Build an interactive contour plot of predicted class probabilities.

    The figure holds three traces: a filled contour of the predicted
    probabilities (trace 0), a scatter plot of the input data coloured by
    ``y`` (trace 1), and a single black contour line at probability 0.5
    (trace 2). One widget is created per hyperparameter in ``param_grid``;
    changing a widget swaps in the matching array from ``Z_dict`` and
    updates the title.

    Parameters
    ----------
    param_grid : dict
        Maps each hyperparameter name to its sequence of options. A
        hyperparameter whose options are all strings gets a dropdown, any
        other gets a selection slider.
    X : 2D array
        Input features; columns 0 and 1 are plotted.
    y : array
        Values used to colour the scatter points.
    Z_dict : dict
        Maps ``str(sorted(...))`` of the (hyperparameter name, value) pairs
        to a 2D array of predicted probabilities, as built by
        ``generate_all_data``.
    h : float
        Step size of the mesh axes. It should be the step used to compute
        the arrays in ``Z_dict``, since the axes are rebuilt here from ``X``
        and ``h``.

    Returns
    -------
    plotly.graph_objs.FigureWidget
        The figure updated by the widgets.
    """
    # set plot range to slightly larger than data range
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    # mesh axes never change, so compute them once
    grid_x = np.arange(x_min, x_max, h)
    grid_y = np.arange(y_min, y_max, h)

    # create figure widget
    fig = go.FigureWidget()

    # trace 0: filled contour of predicted probabilities (z is set by update)
    fig.add_contour(
        x=grid_x,
        y=grid_y,
        colorscale="RdBu",
        contours=dict(size=0.05, start=0, end=1),
        colorbar={"title": 'predicted probability'},
    )

    # trace 1: scatter plot of the input data, coloured by y
    fig.add_scatter(
        x=X[:, 0],
        y=X[:, 1],
        mode='markers',
        marker=dict(
            color=y,
            colorscale=[[0, 'rgb(255,0,0)'], [1, 'rgb(0,0,255)']],
            size=8,
            line=dict(width=1),
        ),
    )

    # trace 2: single black contour line at 0.5 for the decision boundary
    fig.add_contour(
        x=grid_x,
        y=grid_y,
        colorscale=[[0, 'rgb(0,0,0)'], [1, 'rgb(0,0,0)']],
        contours={"showlabels": False, "coloring": "lines",
                  "start": 0.5, "end": 0.5},
        line=dict(width=5),
        ncontours=1,
        showscale=False,  # no second colour bar for the boundary line
    )

    # static layout: size, title placement, axis labels and font
    fig.update_layout(
        autosize=False,
        width=800,
        height=640,
        title={
            'y': 0.9,
            'x': 0.41,
            'xanchor': 'center',
            'yanchor': 'top'},
        xaxis_title="feature 1",
        yaxis_title="feature 2",
        font=dict(
            family="arial, monospace",
            size=16)
    )

    # one widget per hyperparameter: dropdown for all-string options,
    # selection slider otherwise
    sliders = {}
    for param_name, param_options in param_grid.items():
        if all(isinstance(option, str) for option in param_options):
            sliders[param_name] = ipywidgets.Dropdown(
                options=param_options, description=param_name)
        else:
            sliders[param_name] = ipywidgets.SelectionSlider(
                options=param_options, description=param_name,
                orientation='horizontal')

    # callback run once on creation and again on every widget change
    @interact(**sliders)
    def update(**selected):
        # key format must match the one used to build Z_dict
        param_key = str(sorted(selected.items()))
        Z = Z_dict[param_key]
        title = ", ".join(str(k) + " = " + str(v) for k, v in selected.items())
        with fig.batch_update():
            fig.data[0].z = Z
            fig.data[2].z = Z
            fig.layout.title.text = title

    return fig

## Input classification data

In [4]:
# shared settings: seed handed to the ML models and contour mesh step size
random_state = 2
h = 0.02

# two-moons toy data set for classification
X, y = make_moons(n_samples=200, noise=0.2, random_state=1)

# standardize the features
X = StandardScaler().fit_transform(X)

## Logistic Regression 

In [5]:
# logistic regression with elastic-net regularization
ML_algo = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    random_state=random_state,
)

# tune C and the l1 ratio (l1_ratio between 0 and 1)
param_grid = dict(
    C=[1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    l1_ratio=np.linspace(0, 1, 11),
)

# predicted-probability arrays for every hyperparameter combination
Z_dict = generate_all_data(X=X, y=y, ml_algo=ML_algo, param_grid=param_grid, h=h)

In [6]:
# interactive contour plot for the logistic regression grid
plot_clf_contour(param_grid=param_grid, X=X, y=y, Z_dict=Z_dict, h=h)

interactive(children=(SelectionSlider(description='C', options=(0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0), v…

FigureWidget({
    'data': [{'colorbar': {'title': {'text': 'predicted probability'}},
              'colorsca…

## Random Forest

In [7]:
# random forest; max_features=None so every split considers all (both) features
ML_algo = RandomForestClassifier(
    max_features=None,
    random_state=random_state,
)

# RF parameter grid (min_samples_split is another candidate, left untuned here)
param_grid = dict(
    criterion=["gini", "entropy", "log_loss"],
    max_depth=[1, 3, 5, 10],
    n_estimators=[1, 3, 10, 30, 100],
)

# predicted-probability arrays for every hyperparameter combination
Z_dict = generate_all_data(X=X, y=y, ml_algo=ML_algo, param_grid=param_grid, h=h)

In [8]:
# interactive contour plot for the random forest grid
plot_clf_contour(param_grid=param_grid, X=X, y=y, Z_dict=Z_dict, h=h)

interactive(children=(Dropdown(description='criterion', options=('gini', 'entropy', 'log_loss'), value='gini')…

FigureWidget({
    'data': [{'colorbar': {'title': {'text': 'predicted probability'}},
              'colorsca…

## Support Vector Machine (SVM)

In [9]:
# support vector classifier with probability estimates enabled
ML_algo = SVC(
    probability=True,
    random_state=random_state,
)

# SVC parameter grid:
#   gamma - kernel coefficient
#   C     - regularization parameter, inverse of strength of regularization
param_grid = dict(
    gamma=[1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2],
    C=[1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
)

# predicted-probability arrays; higher grid step size (0.04) to run faster
Z_dict = generate_all_data(X=X, y=y, ml_algo=ML_algo, param_grid=param_grid, h=0.04)

In [10]:
# interactive contour plot for the SVC grid (step 0.04, as used for Z_dict)
plot_clf_contour(param_grid=param_grid, X=X, y=y, Z_dict=Z_dict, h=0.04)

interactive(children=(SelectionSlider(description='gamma', options=(0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0…

FigureWidget({
    'data': [{'colorbar': {'title': {'text': 'predicted probability'}},
              'colorsca…

## XGBoost

In [15]:
# XGBoost classifier
ML_algo = xgboost.XGBClassifier(
    random_state=random_state,
    eval_metric='logloss',
    use_label_encoder=False,
)

# tune the maximum depth and the regularization parameters
param_grid = dict(
    max_depth=np.arange(1, 6, 1),
    reg_alpha=[1e-2, 1e-1, 1e0, 1e1, 1e2],
    reg_lambda=[1e-1, 1e0, 1e1, 1e2, 1e3, 1e4],
)

# predicted-probability arrays for every hyperparameter combination
Z_dict = generate_all_data(X=X, y=y, ml_algo=ML_algo, param_grid=param_grid, h=h)

In [16]:
# interactive contour plot for the XGBoost grid
plot_clf_contour(param_grid=param_grid, X=X, y=y, Z_dict=Z_dict, h=h)

interactive(children=(SelectionSlider(description='max_depth', options=(1, 2, 3, 4, 5), value=1), SelectionSli…

FigureWidget({
    'data': [{'colorbar': {'title': {'text': 'predicted probability'}},
              'colorsca…

## k-Nearest Neighbors (kNN)

In [13]:
# k-nearest-neighbors classifier
ML_algo = KNeighborsClassifier()

# tune the neighbor weighting (e.g. by distance) and the number of nearest neighbors
param_grid = dict(
    weights=['uniform', 'distance'],
    n_neighbors=np.arange(1, 30, 2),
)

# predicted-probability arrays for every hyperparameter combination
Z_dict = generate_all_data(X=X, y=y, ml_algo=ML_algo, param_grid=param_grid, h=h)

In [14]:
# interactive contour plot for the kNN grid
plot_clf_contour(param_grid=param_grid, X=X, y=y, Z_dict=Z_dict, h=h)

interactive(children=(Dropdown(description='weights', options=('uniform', 'distance'), value='uniform'), Selec…

FigureWidget({
    'data': [{'colorbar': {'title': {'text': 'predicted probability'}},
              'colorsca…