# Comparison with APL library on OpenML benchmark datasets

In this notebook we compare our library with the official SAP HANA Automated Predictive Library (APL; details here: https://help.sap.com/viewer/cb31bd99d09747089754a0ba75067ed2/2.5.0.0/en-US). In most cases, our library achieves higher accuracy. However, there is always room for improvement.

In [None]:
from hana_automl.utils.perfomance import Benchmark
from hana_automl.utils.connection import connection_context
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hana_automl.utils.cleanup import clean
%load_ext jupyternotify
import time

In [None]:
# Shared notebook state used by every cell below.
GRADIENT = False  # forwarded as grad_boost= to the runs in the first part
b = Benchmark(connection_context)

# Scores are appended one per dataset, in the order the cells are executed.
apl_acc, hana_acc = [], []
datasets = [
    'kr-vs-kp.csv',
    'australian.csv',
    'phoneme.csv',
    'adult.csv',
    'blood.csv',
    'sylvine.csv',
    'credit.csv',
    'kc1.csv',
]
df = pd.DataFrame()

In [None]:
def plot_results(task: str, boosting: bool):
    """Draw a grouped bar chart of APL vs. HANA AutoML scores per dataset.

    Reads the notebook-level lists ``datasets``, ``apl_acc`` and ``hana_acc``
    and rebinds the notebook-level ``df`` to a frame holding the same values
    (columns ``APL``, ``HANA AutoML``, ``Dataset``).

    Args:
        task: label used as the first part of the chart title.
        boosting: selects the "with"/"without Gradient Boosting" title suffix.

    Raises:
        ValueError: if the three result lists do not have the same length.
    """
    global df

    if not len(datasets) == len(apl_acc) == len(hana_acc):
        raise ValueError(
            f'Result lists are out of sync: {len(datasets)} datasets, '
            f'{len(apl_acc)} APL scores, {len(hana_acc)} HANA AutoML scores'
        )

    # Build a fresh frame on every call. Assigning columns onto the frame left
    # over from a previous section fails as soon as the number of datasets
    # differs from the number of rows already stored.
    df = pd.DataFrame({
        'APL': apl_acc,
        'HANA AutoML': hana_acc,
        'Dataset': datasets,
    })

    x = np.arange(len(datasets))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots(figsize=(20, 10))
    rects1 = ax.bar(x - width / 2, apl_acc, width, label='APL')
    rects2 = ax.bar(x + width / 2, hana_acc, width, label='HANA AutoML')

    # Axis label, title and one tick per dataset.
    ax.set_ylabel('Accuracy')
    mode = 'with' if boosting else 'without'
    ax.set_title(f'{task} (APL {mode} Gradient Boosting)')
    ax.set_xticks(x)
    ax.set_xticklabels(datasets)
    ax.legend()

    # Print each bar's value above it.
    ax.bar_label(rects1)
    ax.bar_label(rects2)

    plt.show()
    
def finish(benchmark, cooldown=180):
    """Record the scores of the latest benchmark run, then clean up and pause.

    Appends ``benchmark.apl_accuracy`` and ``benchmark.automl_accuracy`` to the
    notebook-level lists ``apl_acc`` and ``hana_acc``, calls ``clean()`` and
    then sleeps.

    Args:
        benchmark: object exposing ``apl_accuracy`` and ``automl_accuracy``
            (the notebook passes its ``Benchmark`` instance ``b``).
        cooldown: seconds to sleep after cleanup. Defaults to 180, the value
            that used to be hard-coded; pass 0 to skip the pause.
    """
    apl_acc.append(benchmark.apl_accuracy)
    hana_acc.append(benchmark.automl_accuracy)
    clean()
    # NOTE(review): presumably lets the HANA instance settle between runs — confirm.
    if cooldown > 0:
        time.sleep(cooldown)

## Just APL, without Gradient Boosting enabled:
Don't know what Gradient Boosting is? Check here: https://machinelearningmastery.com/gentle-introduction-gradient-boosting-algorithm-machine-learning/ . We use benchmark datasets from OpenML to compare model accuracy.

## Classification

In [None]:
# clean() is called once up front here, in addition to the call inside finish().
clean()
b.run(
    './data/benchmark/cls/kr-vs-kp.csv',
    task='cls',
    label='class',
    categorical=['class'],
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Benchmark on australian.csv, then store the scores via finish().
b.run(
    './data/benchmark/cls/australian.csv',
    task='cls',
    label='A15',
    categorical=['A15', 'A1', 'A4', 'A5', 'A6', 'A8', 'A9', 'A11', 'A12'],
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Benchmark on phoneme.csv, then store the scores via finish().
b.run(
    './data/benchmark/cls/phoneme.csv',
    task='cls',
    label='Class',
    categorical=['Class'],
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
%%notify
# Benchmark on adult.csv, then store the scores via finish().
b.run(
    './data/benchmark/cls/adult.csv',
    task='cls',
    label='class',
    categorical=[
        'workclass',
        'education',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native-country',
        'class',
    ],
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
%%notify
# Benchmark on blood.csv, then store the scores via finish().
b.run(
    './data/benchmark/cls/blood.csv',
    task='cls',
    label='Class',
    categorical=['Class'],
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# b.run('./data/benchmark/cls/higgs.csv', task='cls', label='class', categorical=['class'], grad_boost=GRADIENT)
# apl_acc.append(b.apl_accuracy)
# hana_acc.append(b.automl_accuracy)
# clean()

In [None]:
# Benchmark on sylvine.csv, then store the scores via finish().
b.run(
    './data/benchmark/cls/sylvine.csv',
    task='cls',
    label='class',
    categorical=['class'],
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Benchmark on credit.csv, then store the scores via finish().
b.run(
    './data/benchmark/cls/credit.csv',
    task='cls',
    label='class',
    categorical=[
        'class',
        'checking_status',
        'credit_history',
        'purpose',
        'savings_status',
        'employment',
        'personal_status',
        'other_parties',
        'property_magnitude',
        'other_payment_plans',
        'housing',
        'job',
        'own_telephone',
        'foreign_worker',
    ],
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Benchmark on kc1.csv, then store the scores via finish().
b.run(
    './data/benchmark/cls/kc1.csv',
    task='cls',
    label='defects',
    categorical=['defects'],
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Chart the classification scores collected by the cells above.
plot_results('Classification', boosting=GRADIENT)

## Regression

In [None]:
# Start the regression round with empty score lists and its own dataset labels.
apl_acc, hana_acc = [], []
datasets = [
    'elevators.csv',
    'wine_quality.csv',
    'baseball.csv',
    'boston.csv',
    'tecator.csv',
    'space_ga.csv',
    'pol.csv',
    'quake.csv',
]

In [None]:
# Dataset source: https://www.openml.org/d/216
b.run(
    './data/benchmark/reg/elevators.csv',
    task='reg',
    label='Goal',
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Dataset source: https://www.openml.org/d/287
b.run(
    './data/benchmark/reg/wine_quality.csv',
    task='reg',
    label='quality',
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Dataset source: https://www.openml.org/d/41021
b.run(
    './data/benchmark/reg/baseball.csv',
    task='reg',
    label='RS',
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Benchmark on boston.csv, then store the scores via finish().
# grad_boost uses the notebook-wide GRADIENT flag like every other run in this
# section (the previous `G` was an undefined name and raised NameError).
b.run('./data/benchmark/reg/boston.csv', task='reg', label='MEDV', grad_boost=GRADIENT)
finish(b)

In [None]:
# Benchmark on tecator.csv, then store the scores via finish().
b.run(
    './data/benchmark/reg/tecator.csv',
    task='reg',
    label='fat',
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Benchmark on space_ga.csv, then store the scores via finish().
b.run(
    './data/benchmark/reg/space_ga.csv',
    task='reg',
    label='ln(VOTES/POP)',
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Benchmark on pol.csv, then store the scores via finish().
b.run(
    './data/benchmark/reg/pol.csv',
    task='reg',
    label='foo',
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Benchmark on quake.csv, then store the scores via finish().
b.run(
    './data/benchmark/reg/quake.csv',
    task='reg',
    label='col_4',
    grad_boost=GRADIENT,
)
finish(b)

In [None]:
# Chart the regression scores collected above. The title suffix follows the
# same GRADIENT flag the runs were executed with, so it stays correct if the
# flag is flipped.
plot_results(task='Regression', boosting=GRADIENT)

## Gradient boosting enabled

## Classification

In [None]:
# Start the gradient-boosting classification round with empty score lists.
apl_acc, hana_acc = [], []
datasets = [
    'kr-vs-kp.csv',
    'australian.csv',
    'phoneme.csv',
]

In [None]:
# Dataset source: https://www.openml.org/d/3
b.run(
    './data/benchmark/cls/kr-vs-kp.csv',
    task='cls',
    label='class',
    categorical=['class'],
    grad_boost=True,
)
# Scores are stored directly here; finish() is not used in this section.
apl_acc.append(b.apl_accuracy)
hana_acc.append(b.automl_accuracy)

In [None]:
# Dataset source: https://www.openml.org/d/40981
b.run(
    './data/benchmark/cls/australian.csv',
    task='cls',
    label='A15',
    categorical=['A15', 'A1', 'A4', 'A5', 'A6', 'A8', 'A9', 'A11', 'A12'],
    grad_boost=True,
)
# Scores are stored directly here; finish() is not used in this section.
apl_acc.append(b.apl_accuracy)
hana_acc.append(b.automl_accuracy)

In [None]:
# Dataset source: https://www.openml.org/d/1489
b.run(
    './data/benchmark/cls/phoneme.csv',
    task='cls',
    label='Class',
    categorical=['Class'],
    grad_boost=True,
)
# Scores are stored directly here; finish() is not used in this section.
apl_acc.append(b.apl_accuracy)
hana_acc.append(b.automl_accuracy)

In [None]:
# Chart the gradient-boosting classification scores collected above.
plot_results('Classification', boosting=True)

## Regression

In [None]:
# Start the gradient-boosting regression round with empty score lists.
apl_acc, hana_acc = [], []
datasets = [
    'elevators.csv',
    'wine_quality.csv',
    'baseball.csv',
]

In [None]:
# Dataset source: https://www.openml.org/d/216
b.run(
    './data/benchmark/reg/elevators.csv',
    task='reg',
    label='Goal',
    grad_boost=True,
)
# Scores are stored directly here; finish() is not used in this section.
apl_acc.append(b.apl_accuracy)
hana_acc.append(b.automl_accuracy)

In [None]:
# Dataset source: https://www.openml.org/d/287
b.run(
    './data/benchmark/reg/wine_quality.csv',
    task='reg',
    label='quality',
    grad_boost=True,
)
# Scores are stored directly here; finish() is not used in this section.
apl_acc.append(b.apl_accuracy)
hana_acc.append(b.automl_accuracy)

In [None]:
# Dataset source: https://www.openml.org/d/41021
b.run(
    './data/benchmark/reg/baseball.csv',
    task='reg',
    label='RS',
    grad_boost=True,
)
# Scores are stored directly here; finish() is not used in this section.
apl_acc.append(b.apl_accuracy)
hana_acc.append(b.automl_accuracy)

In [None]:
# Chart the gradient-boosting regression scores collected above.
plot_results('Regression', boosting=True)