# "Low code" Machine Learning Classification Tool

## Setup

### Libraries and Imports

In [7]:
!pip install numpy pandas ipywidgets scikit-learn xgboost



In [8]:
# Data Manipulation
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Widgets Support
import ipywidgets as widgets
from ipywidgets import Layout, Tab, FloatSlider
from IPython.display import display, clear_output, FileLink

# Machine Learning Setup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import roc_curve, roc_auc_score

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

import time

### Functions

In [9]:
def button(description='Click', disabled=False, button_style='info', tooltip='Click to execute!', icon=''):
    """Create an ipywidgets Button painted with the notebook's blue colour.

    Args:
        description: Caption shown on the button.
        disabled: Whether the button is created in a disabled state.
        button_style: 'success', 'info', 'warning', 'danger' or ''.
        tooltip: Text passed to the widget as its tooltip.
        icon: FontAwesome icon name without the ``fa-`` prefix.

    Returns:
        The configured ``widgets.Button``.
    """
    settings = {
        'description': description,
        'disabled': disabled,
        'button_style': button_style,
        'tooltip': tooltip,
        'icon': icon,
    }
    styled_button = widgets.Button(**settings)
    # The explicit colour is applied on top of whatever button_style was chosen.
    styled_button.style.button_color = '#577ae4'
    return styled_button

def checkbox(value=False, description='Option 1', disabled=False, indent=False):
    """Create a single ipywidgets Checkbox.

    Args:
        value: Initial checked state.
        description: Label displayed next to the box.
        disabled: Whether the checkbox is created in a disabled state.
        indent: Forwarded to the widget's ``indent`` option.

    Returns:
        The configured ``widgets.Checkbox``.
    """
    return widgets.Checkbox(value=value, description=description, disabled=disabled, indent=indent)

# Labelled dropdown sized to its content
def dropdown(options, descrip):
    """Create an enabled ipywidgets Dropdown.

    Args:
        options: Choices offered by the dropdown.
        descrip: Label shown beside the control.

    Returns:
        The configured ``widgets.Dropdown`` with a ``max-content`` width.
    """
    content_width = {'width': 'max-content'}
    return widgets.Dropdown(options=options, description=descrip, layout=content_width, disabled=False)

# Multi-selection list control
def select_multiple(opt, descrip):
    """Create an enabled ipywidgets SelectMultiple list box.

    Args:
        opt: Choices offered by the list.
        descrip: Label shown beside the control.

    Returns:
        The configured ``widgets.SelectMultiple`` (10 rows, 400px x 150px).
    """
    box_layout = Layout(width='400px', height='150px')
    return widgets.SelectMultiple(
        options=opt,
        rows=10,
        description=descrip,
        disabled=False,
        layout=box_layout,
    )

def list_checkbox(list_description, list_check_default=None, list_disabled=None, list_indent=None, layout='vertical', columns=1, all_disabled=False, all_indent=False, all_selected=False):
    """Build a group of checkboxes arranged vertically or in a grid.

    Args:
        list_description: Labels, one per checkbox.
        list_check_default: Initial checked state per checkbox; defaults to
            ``all_selected`` for every item.
        list_disabled: Disabled flag per checkbox; defaults to ``all_disabled``.
        list_indent: Indent flag per checkbox; defaults to ``all_indent``.
        layout: 'vertical' (one column) or 'horizontal' (rows of ``columns``).
        columns: Checkboxes per row when ``layout`` is 'horizontal'.
        all_disabled: Fallback disabled flag when ``list_disabled`` is None.
        all_indent: Fallback indent flag when ``list_indent`` is None.
        all_selected: Fallback checked state when ``list_check_default`` is None.

    Returns:
        ``(container, checkboxes, checkbox_dict)``: the VBox holding the group,
        the list of checkbox widgets, and a label -> widget mapping.

    Raises:
        ValueError: If a per-item list differs in length from
            ``list_description``, if ``columns`` is not positive for a
            horizontal layout, or if ``layout`` is not recognised.
    """
    item_count = len(list_description)

    # Any per-item list that was not supplied is filled from its "all_*" flag.
    if list_check_default is None:
        list_check_default = [all_selected] * item_count
    if list_disabled is None:
        list_disabled = [all_disabled] * item_count
    if list_indent is None:
        list_indent = [all_indent] * item_count

    per_item_lists = (
        ("list_check_default", list_check_default),
        ("list_disabled", list_disabled),
        ("list_indent", list_indent),
    )
    for list_name, values in per_item_lists:
        if len(values) != item_count:
            raise ValueError(f"The length of list_description and {list_name} must be the same.")

    checkboxes = [
        checkbox(value=checked, description=label, disabled=is_disabled, indent=is_indented)
        for label, checked, is_disabled, is_indented
        in zip(list_description, list_check_default, list_disabled, list_indent)
    ]
    # With repeated labels the last widget wins in the mapping.
    checkbox_dict = dict(zip(list_description, checkboxes))

    if layout == 'vertical':
        container = widgets.VBox(checkboxes)
    elif layout == 'horizontal':
        if columns <= 0:
            raise ValueError("Number of columns must be greater than zero.")
        grid_rows = [
            widgets.HBox(checkboxes[start:start + columns])
            for start in range(0, len(checkboxes), columns)
        ]
        container = widgets.VBox(grid_rows)
    else:
        raise ValueError("Invalid layout specified. Use 'vertical' or 'horizontal'.")

    return container, checkboxes, checkbox_dict

In [10]:
def evaluate_performance(y_test, y_pred):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Binary averaging (scikit-learn's default) is used only when ``y_test``
    holds exactly two distinct labels and ``y_pred`` introduces no others.
    Every other case uses macro averaging. Looking at ``y_pred`` as well
    prevents a multi-class problem whose test split happens to contain two
    classes from being scored as binary, which scikit-learn rejects.

    Args:
        y_test: True labels; any 1-D array-like (Series, list, ndarray).
        y_pred: Predicted labels aligned with ``y_test``.

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall" and "F1-Score".
    """
    true_values = np.asarray(y_test).ravel()
    pred_values = np.asarray(y_pred).ravel()

    distinct_true = pd.unique(true_values)
    distinct_all = pd.unique(np.concatenate([true_values, pred_values]))

    is_binary = len(distinct_true) == 2 and len(distinct_all) == 2
    average = 'binary' if is_binary else 'macro'

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average=average),
        "Recall": recall_score(y_test, y_pred, average=average),
        "F1-Score": f1_score(y_test, y_pred, average=average),
    }

def _build_classifier(modeltype):
    """Return ``(estimator, param_grid)`` for a numeric model code, or None if the code is unknown."""
    if modeltype == 1:
        return LogisticRegression(max_iter=10000), {
            'C': [0.1, 1, 10, 100],
            'solver': ['liblinear', 'newton-cg', 'lbfgs', 'saga'],
            'penalty': ['l2'],
        }
    if modeltype == 2:
        return DecisionTreeClassifier(random_state=42), {
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [1.0, 'sqrt', 'log2'],
        }
    if modeltype == 3:
        return RandomForestClassifier(random_state=42), {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        }
    if modeltype == 4:
        return XGBClassifier(random_state=42), {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.01, 0.001],
        }
    if modeltype == 5:
        return KNeighborsClassifier(), {'n_neighbors': [3, 5, 7, 9, 11]}
    if modeltype == 6:
        return LinearSVC(max_iter=10000, dual='auto', random_state=42), {
            'C': [0.1, 1, 10, 100],
            'loss': ['hinge', 'squared_hinge'],
        }
    return None


def train_models(df, features, target, modeltype, test_size=0.2, GridSearch=False, scoring='accuracy'):
    """Train one classifier on a train/test split of ``df`` and score it.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        features: List of column names used as model inputs.
        target: Name of the column to predict.
        modeltype: Numeric model code: 1 LogisticRegression, 2 DecisionTree,
            3 RandomForest, 4 XGBoost, 5 KNeighbors, 6 LinearSVC.
        test_size: Fraction of rows held out for testing.
        GridSearch: When True, tune hyper-parameters with 5-fold GridSearchCV.
        scoring: Scorer name used by the grid search.

    Returns:
        ``(model, best_parameters, metrics, y_test, y_pred, y_pred_prob)``, or
        None (after printing an error) when ``modeltype`` is not recognised.
        ``best_parameters`` is None without grid search. ``y_pred_prob`` is
        column 1 of ``predict_proba`` or None when the estimator has no such
        method. ``metrics`` also carries "Execution Time (s)".
    """
    # Reject unknown codes before doing any work on the data.
    built = _build_classifier(modeltype)
    if built is None:
        print("Error, model not found")
        return None
    model, param_grid = built

    X = df[features]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    best_parameters = None
    start_time = time.time()

    if GridSearch:
        # The plain 'precision'/'recall'/'f1' scorers use binary averaging and
        # raise on multi-class targets; switch to the macro variants there.
        n_classes = len(pd.unique(np.asarray(y_train).ravel()))
        if scoring in ('precision', 'recall', 'f1') and n_classes > 2:
            scoring = f'{scoring}_macro'

        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=scoring, error_score='raise')
        grid_search.fit(X_train, y_train)
        best_parameters = grid_search.best_params_
        # best_estimator_ is already refit on the training data and keeps the
        # base settings (random_state, max_iter, dual) that rebuilding the
        # estimator from best_params_ alone would drop.
        model = grid_search.best_estimator_
    else:
        model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    execution_time = time.time() - start_time

    metrics = evaluate_performance(y_test, y_pred)
    metrics["Execution Time (s)"] = execution_time

    return model, best_parameters, metrics, y_test, y_pred, y_pred_prob


def plot_roc_curves(roc_info):
    """Plot the ROC curves of several models on one set of axes.

    Entries are skipped, with a printed explanation, when no probability
    scores are available or when the test labels do not contain exactly two
    classes (``roc_curve`` only handles binary targets).

    Args:
        roc_info: Iterable of ``(model_name, y_test, y_pred_prob)`` tuples,
            where ``y_pred_prob`` may be None.
    """
    fig, ax = plt.subplots()
    curves_drawn = 0

    for model_name, y_test, y_pred_prob in roc_info:
        if y_pred_prob is None:
            print(f"ROC curve is not available for model {model_name} as it does not support probability prediction.")
            continue

        if len(np.unique(np.asarray(y_test))) != 2:
            print(f"ROC curve is not available for model {model_name} as the test labels do not contain exactly two classes.")
            continue

        fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
        auc = roc_auc_score(y_test, y_pred_prob)
        ax.plot(fpr, tpr, lw=2, label=f'{model_name} (area = {auc:.2f})')
        curves_drawn += 1

    # Diagonal reference line of a random classifier.
    ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curves Comparison')
    # A legend without labelled curves only triggers a matplotlib warning.
    if curves_drawn:
        ax.legend(loc="lower right")
    plt.show()



def machine_learning_classification_models(df):
    """Display an interactive form for training and comparing classifiers on ``df``.

    The form lets the user pick feature columns, a target column, the test
    split size, whether to run a grid search (and with which scorer), and
    which models to train. Clicking "Train Models" trains each selected model
    through ``train_models``, shows a metrics table, writes it to
    ``model_metrics.csv`` in the working directory, offers a download link,
    and plots ROC curves.

    Args:
        df: DataFrame whose columns are offered as features and target.
    """
    column_names = df.columns.tolist()
    metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Execution Time (s)', 'Best Parameters']
    csv_file = 'model_metrics.csv'

    features = widgets.SelectMultiple(
        options=column_names,
        description='Features:',
        disabled=False,
        layout=widgets.Layout(width='400px', height='140px')
    )

    target = widgets.Dropdown(
        options=column_names,
        description='Target:',
        disabled=False,
        layout=widgets.Layout(width='400px')
    )

    grid_search = widgets.RadioButtons(
        options=['yes', 'no'],
        description='GridSearch:',
        disabled=False,
        value='no',
        layout=widgets.Layout(width='150px')
    )

    test_size_slider = FloatSlider(
        value=0.2,
        min=0.05,
        max=0.5,
        step=0.05,
        description='Test size %:',
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='.0%',
        layout=widgets.Layout(width='400px')
    )

    # A model's position in this list, plus one, is the numeric code that
    # train_models expects.
    models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost', 'kNN', 'SVM']
    models_widget, _, checkbox_dict = list_checkbox(models, layout='horizontal', columns=3, all_indent=True)

    scoring = widgets.Dropdown(
        options=['accuracy', 'precision', 'recall', 'f1'],
        description='Scoring',
        disabled=True,
        layout=widgets.Layout(width='247px')
    )

    status_label = widgets.Label(value="")
    output = widgets.Output()

    def on_grid_search_change(change):
        # The scorer is only relevant while grid search is enabled.
        scoring.disabled = change['new'] != 'yes'

    grid_search.observe(on_grid_search_change, names='value')

    def on_button_clicked(b):
        with output:
            clear_output()
            # The try block lives inside the output context so that failures
            # are reported in the output area and reflected in the status label.
            try:
                selected_features = list(features.value)
                selected_target = target.value
                use_grid_search = grid_search.value == 'yes'
                selected_models = [code for code, name in enumerate(models, 1) if checkbox_dict[name].value]
                selected_scoring = scoring.value if use_grid_search else 'accuracy'
                test_size = test_size_slider.value

                if not selected_features:
                    status_label.value = "Please select at least one feature."
                    return

                if not selected_target:
                    status_label.value = "Please select a target."
                    return

                if selected_target in selected_features:
                    # Training on the target itself would leak the label.
                    status_label.value = "The target column cannot also be selected as a feature."
                    return

                if not selected_models:
                    status_label.value = "Please select at least one model."
                    return

                status_label.value = "Execution in progress, please wait..."
                records = []
                roc_info = []

                for modeltype in selected_models:
                    model_name = models[modeltype - 1]
                    _, best_parameters, metrics, y_test, _, y_pred_prob = train_models(
                        df, selected_features, selected_target, modeltype,
                        test_size, use_grid_search, selected_scoring
                    )
                    records.append({'Model': model_name, **metrics, 'Best Parameters': best_parameters})
                    roc_info.append((model_name, y_test, y_pred_prob))

                # Build the table once instead of appending row by row.
                metrics_df = pd.DataFrame(records, columns=metric_columns)
                metrics_df.to_csv(csv_file, index=False)

                status_label.value = "Execution completed successfully."
                display(metrics_df)
                display(FileLink(csv_file))
                plot_roc_curves(roc_info)

            except Exception as e:
                status_label.value = "Execution failed, see the message below."
                print(f"An error occurred: {e}")

    classificator_button = button(description="Train Models", tooltip="Configure the desired options and click 'Train Models' to start training the models.")
    classificator_button.on_click(on_button_clicked)
    message = widgets.Label(value="Check models:")

    display(features, target, test_size_slider)
    ui = widgets.HBox([grid_search, scoring])
    display(ui)
    display(message, models_widget, classificator_button, status_label, output)

## Data preparation

Run one of the following two options to load a simple dataset for testing the interface. Both options store the data in `df`, so the option executed last is the one the interface will use.

### a. Binary Classification

In [11]:
from sklearn.datasets import load_breast_cancer

# Load scikit-learn's breast cancer dataset into `df`: one column per
# feature, plus the class labels in a trailing 'type' column.
breast_cancer = load_breast_cancer()

df = pd.DataFrame(
    breast_cancer.data,
    columns=breast_cancer.feature_names,
).assign(type=breast_cancer.target)

In [12]:
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,1.0950,0.9053,8.589,153.40,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.01860,0.01340,0.01389,0.003532,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.006150,0.04006,0.03832,0.02058,0.02250,0.004571,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,0.4956,1.1560,3.445,27.23,0.009110,0.07458,0.05661,0.01867,0.05963,0.009208,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.011490,0.02461,0.05688,0.01885,0.01756,0.005115,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,1.1760,1.2560,7.673,158.70,0.010300,0.02891,0.05198,0.02454,0.01114,0.004239,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,0.7655,2.4630,5.203,99.04,0.005769,0.02423,0.03950,0.01678,0.01898,0.002498,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,0.4564,1.0750,3.425,48.55,0.005903,0.03731,0.04730,0.01557,0.01318,0.003892,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,0.7260,1.5950,5.772,86.22,0.006522,0.06158,0.07117,0.01664,0.02324,0.006185,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


### b. Multi-Class Classification

In [None]:
from sklearn.datasets import load_iris

# Load scikit-learn's iris dataset into `df`: one column per feature, plus
# the class labels in a trailing 'species' column.
iris = load_iris()

df = pd.DataFrame(
    iris.data,
    columns=iris.feature_names,
).assign(species=iris.target)

In [None]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## Classifiers Interface

In [13]:
machine_learning_classification_models(df)

SelectMultiple(description='Features:', layout=Layout(height='140px', width='400px'), options=('mean radius', …

Dropdown(description='Target:', layout=Layout(width='400px'), options=('mean radius', 'mean texture', 'mean pe…

FloatSlider(value=0.2, continuous_update=False, description='Test size %:', layout=Layout(width='400px'), max=…

HBox(children=(RadioButtons(description='GridSearch:', index=1, layout=Layout(width='150px'), options=('yes', …

Label(value='Check models:')

VBox(children=(HBox(children=(Checkbox(value=False, description='Logistic Regression'), Checkbox(value=False, …

Button(button_style='info', description='Train Models', style=ButtonStyle(button_color='#577ae4'), tooltip="Co…

Label(value='')

Output()