In [3]:
!pip3 install auto-sklearn

Collecting auto-sklearn
  Downloading auto-sklearn-0.13.0.tar.gz (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 3.8 MB/s 
Collecting scipy>=1.7.0
  Downloading scipy-1.7.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (28.5 MB)
[K     |████████████████████████████████| 28.5 MB 52 kB/s 
Collecting scikit-learn<0.25.0,>=0.24.0
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.3 MB/s 
Collecting distributed<2021.07,>=2.2.0
  Downloading distributed-2021.6.2-py3-none-any.whl (722 kB)
[K     |████████████████████████████████| 722 kB 54.4 MB/s 
Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
Collecting threadpoolctl
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Collecting ConfigSpace<0.5,>=0.4.14
  Downloading ConfigSpace-0.4.19-cp37-cp37m-manylinux2014_x86_64.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 21.8 MB/s 
[?25hCollecting pynisher

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


# Metrics

In *Auto-sklearn*, a model is optimized over a metric, which can be either
built-in or custom. It is also possible to calculate multiple metrics
per run. The following example shows how to calculate built-in and
self-defined metrics for a classification problem.


In [2]:
from autosklearn.experimental.askl2 import AutoSklearn2Classifier

In [3]:
import autosklearn.classification
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.metrics
from autosklearn.metrics import balanced_accuracy, precision, recall, f1


def error(solution, prediction):
    """Return the fraction of positions where the prediction is wrong.

    Parameters
    ----------
    solution : array-like
        Ground-truth labels.
    prediction : array-like
        Predicted labels, aligned position-by-position with ``solution``.

    Returns
    -------
    float
        Mean of the element-wise ``solution != prediction`` comparison,
        i.e. the misclassification rate (0.0 means every label matches).
    """
    # Coerce to ndarrays first: comparing two plain Python lists with `!=`
    # yields a single bool rather than an element-wise mask, which would
    # make the "rate" collapse to 0.0 or 1.0.
    solution = np.asarray(solution)
    prediction = np.asarray(prediction)
    return np.mean(solution != prediction)


def get_metric_result(cv_results):
    """Summarise the successful runs recorded in ``cv_results``.

    Builds a DataFrame from ``cv_results``, keeps only the rows whose
    ``status`` equals ``"Success"``, and returns the rank, the chosen
    classifier, the mean test score and every column whose key starts
    with ``metric_``.
    """
    base_columns = ['rank_test_scores', 'param_classifier:__choice__', 'mean_test_score']
    metric_columns = [name for name in cv_results.keys() if name.startswith('metric_')]

    frame = pd.DataFrame.from_dict(cv_results)
    succeeded = frame['status'] == "Success"
    return frame.loc[succeeded, base_columns + metric_columns]

## Data Loading



In [None]:
from google.colab import drive
from sklearn.model_selection import train_test_split

# Mount Google Drive so the project CSV is reachable from the Colab runtime.
drive.mount('/content/drive', force_remount=True)

# Load the full feature table. The encoding is spelled with the canonical
# codec name 'utf-8' (the previous 'utf=8' was a typo).
data = pd.read_csv(
    r'/content/drive/MyDrive/msc-project-data/features_with_outcome.csv',
    encoding='utf-8',
)

# Target column and feature matrix; 'org_uuid' is dropped along with the target.
y_data = data['outcome']
X_data = data.drop(columns=['org_uuid', 'outcome'])

# Stratified 80/20 train/test split. A fixed random_state makes the split
# (and therefore every downstream score) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=42,
)

## Build and fit a classifier



In [None]:
# Wrap the custom `error` function as an auto-sklearn scorer: lower values
# are better and 0 is the optimum.
error_rate = autosklearn.metrics.make_scorer(
    score_func=error,
    name='custom_error',
    greater_is_better=False,
    optimum=0,
    needs_threshold=False,
    needs_proba=False,
)

# Metrics handed to the classifier as `scoring_functions`.
extra_metrics = [balanced_accuracy, precision, recall, f1, error_rate]

cls = autosklearn.classification.AutoSklearnClassifier(
    scoring_functions=extra_metrics,
    per_run_time_limit=30,
    time_left_for_this_task=120,
)

# NOTE(review): the test split is also passed to fit(); confirm it is only
# used for reporting and does not influence model selection.
cls.fit(X_train, y_train, X_test, y_test)

## Get the Score of the final ensemble



In [None]:
# Predict on the test split and report plain accuracy.
predictions = cls.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
print("Accuracy score", test_accuracy)

# Print the per-run metric table for the successful runs.
print("#" * 80)
print("Metric results")
metric_table = get_metric_result(cls.cv_results_)
print(metric_table.to_string(index=False))