In [None]:
!pip3 install auto-sklearn

In [None]:
%matplotlib inline

In [1]:
import autosklearn.classification
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.metrics
from autosklearn.metrics import accuracy, balanced_accuracy, precision, recall, f1, roc_auc

# Define Score Function


In [2]:
def error(solution, prediction):
    """Return the fraction of positions where `prediction` differs from `solution`.

    The two arguments are compared with `!=` and the mean of the resulting
    mismatch indicator is returned, i.e. the misclassification rate.
    """
    mismatches = solution != prediction
    return np.mean(mismatches)

def get_metric_result(cv_results):
    """Tabulate the successful runs recorded in a `cv_results` mapping.

    Builds a DataFrame from `cv_results`, keeps only the rows whose 'status'
    equals "Success", and returns the rank, chosen classifier and mean test
    score columns followed by every column whose key starts with 'metric_'.
    """
    frame = pd.DataFrame.from_dict(cv_results)
    succeeded = frame.loc[frame['status'] == "Success"]
    metric_columns = [name for name in cv_results.keys() if name.startswith('metric_')]
    selected = ['rank_test_scores', 'param_classifier:__choice__', 'mean_test_score'] + metric_columns
    return succeeded[selected]

# Wrap `error` as an auto-sklearn scorer named 'custom_error'.
# It is declared as a loss: smaller values are better and the optimum is 0.
# Neither class probabilities nor decision thresholds are requested.
error_rate = autosklearn.metrics.make_scorer(
    score_func=error,
    name='custom_error',
    greater_is_better=False,
    optimum=0,
    needs_threshold=False,
    needs_proba=False,
)

In [17]:
# Define the result-reporting helper
def evaluate_model(model, X=None, y=None):
    """Print evaluation metrics and auto-sklearn search diagnostics for `model`.

    Parameters
    ----------
    model
        A fitted estimator providing `predict`, `predict_proba`, `cv_results_`,
        `leaderboard()`, `sprint_statistics()`, `show_models()` and
        `get_models_with_weights()`.
    X, y
        Features and true labels to evaluate on. When omitted, the
        notebook-level `X_test` / `y_test` are used, so existing
        `evaluate_model(model)` calls behave as before.

    Returns
    -------
    None. Everything is written to stdout.

    Notes
    -----
    Assumes a binary target whose positive class is column 1 of
    `predict_proba` (precision/recall/F1 use their default settings and the
    ROC AUC reads `[:, 1]`).
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_predict = model.predict(X)
    # Computed once and reused for both log loss and ROC AUC.
    y_predict_proba = model.predict_proba(X)

    print("Accuracy score: ", sklearn.metrics.accuracy_score(y, y_predict))
    print('Balanced Accuracy Score: ', sklearn.metrics.balanced_accuracy_score(y, y_predict))
    print("Log Loss: ", sklearn.metrics.log_loss(y, y_predict_proba))
    print("Precision Score: ", sklearn.metrics.precision_score(y, y_predict))
    print("Recall Score: : ", sklearn.metrics.recall_score(y, y_predict))
    print("F1 Score: ", sklearn.metrics.f1_score(y, y_predict))
    print("F1 (beta=2) Score: ", sklearn.metrics.fbeta_score(y, y_predict, beta=2))
    print("ROC Auc Score: ", sklearn.metrics.roc_auc_score(y, y_predict_proba[:, 1]))
    print('Confusion Matrix: \n', sklearn.metrics.confusion_matrix(y, y_predict))

    # Per-configuration cross-validation scores for every recorded metric.
    print("#" * 80)
    print("Metric results")
    print(get_metric_result(model.cv_results_).to_string(index=False))

    # Search diagnostics reported by the auto-sklearn estimator itself.
    print(model.leaderboard())
    print(model.sprint_statistics())
    print(model.show_models())
    print(model.get_models_with_weights())

## Data Loading



In [4]:
from google.colab import drive
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split (and therefore every score reported
# below) is the same after a kernel restart.
RANDOM_STATE = 42

# Mount Google Drive so the project CSV is reachable from the Colab runtime.
drive.mount('/content/drive', force_remount=True)

# Load the full feature table (features plus the 'outcome' label).
data = pd.read_csv(
    r'/content/drive/MyDrive/msc-project-data/features_with_outcome.csv',
    encoding='utf-8',
)

# 'outcome' is the target; 'org_uuid' is dropped from the features along with it.
y_data = data['outcome']
X_data = data.drop(columns=['org_uuid', 'outcome'])

# Hold out 20% as the test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=RANDOM_STATE,
)

Mounted at /content/drive


# Print a list of available classifiers

In [None]:
import autosklearn.pipeline.components.classification

# Print the key of every component that ClassifierChoice reports, one per line.
classifier_choice = autosklearn.pipeline.components.classification.ClassifierChoice
for classifier_name in classifier_choice.get_components():
    print(classifier_name)

adaboost
bernoulli_nb
decision_tree
extra_trees
gaussian_nb
gradient_boosting
k_nearest_neighbors
lda
liblinear_svc
libsvm_svc
mlp
multinomial_nb
passive_aggressive
qda
random_forest
sgd


## Print a list of available metrics

In [None]:
# Show the names in autosklearn.metrics.CLASSIFICATION_METRICS as a bulleted list.
metric_names = autosklearn.metrics.CLASSIFICATION_METRICS
print("Available CLASSIFICATION metrics autosklearn.metrics.*:")
print("\t*" + "\n\t*".join(metric_names))

Available CLASSIFICATION metrics autosklearn.metrics.*:
	*accuracy
	*balanced_accuracy
	*roc_auc
	*average_precision
	*log_loss
	*precision
	*precision_macro
	*precision_micro
	*precision_samples
	*precision_weighted
	*recall
	*recall_macro
	*recall_micro
	*recall_samples
	*recall_weighted
	*f1
	*f1_macro
	*f1_micro
	*f1_samples
	*f1_weighted


## Build and fit a classifier (random forest) with no target metric



In [5]:
# Baseline search: random forest only, no `metric` argument, so auto-sklearn
# optimises its default classification metric.
automl = autosklearn.classification.AutoSklearnClassifier(
    # Overall budget; per_run_time_limit is left at the library default.
    time_left_for_this_task=3600,
    # Search space: a single estimator family and no feature preprocessing.
    include_estimators=['random_forest'],
    include_preprocessors=['no_preprocessing'],
    # Each candidate is scored with 5-fold cross-validation.
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    # Ensemble of size 1.
    ensemble_size=1,
    # Additional metrics recorded for every run (shown in cv_results_).
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
)

In [6]:
# Run the baseline search on the training split; the test split is handed to
# fit() as X_test / y_test.
automl.fit(X_train, y_train, X_test=X_test, y_test=y_test)

AutoSklearnClassifier(ensemble_size=1, include_estimators=['random_forest'],
                      include_preprocessors=['no_preprocessing'],
                      per_run_time_limit=360, resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 5},
                      scoring_functions=[accuracy, balanced_accuracy, precision,
                                         recall, f1, custom_error, roc_auc])

## Get the Score of the final ensemble



In [18]:
# Report test-set metrics and search diagnostics for the baseline model.
evaluate_model(model=automl)

Accuracy score:  0.7878524190741385
Balanced Accuracy Score:  0.5775387417322171
Log Loss:  0.4759214984523444
Precision Score:  0.6482939632545932
Recall Score: :  0.18543543543543545
F1 Score:  0.28838295388207824
F1 (beta=2) Score:  0.21632510071816433
ROC Auc Score:  0.73370093587017
Confusion Matrix: 
 [[4280  134]
 [1085  247]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
                3               random_forest         0.784953         0.784953                  0.592316          0.594704       0.233068   0.334429             0.215047        0.739731
                9               random_forest         0.769288         0.769288                  0.503581          0.745087       0.008069   0.015958             0.230712        0.735676
       

# Build and fit classifier (random forest) with F1 as target metric

In [8]:
# Same random-forest search as the baseline, but optimising F1.
automl1 = autosklearn.classification.AutoSklearnClassifier(
    # Overall budget; per_run_time_limit is left at the library default.
    time_left_for_this_task=3600,
    # Target metric for the search.
    metric=autosklearn.metrics.f1,
    # Search space: a single estimator family and no feature preprocessing.
    include_estimators=['random_forest'],
    include_preprocessors=['no_preprocessing'],
    # Each candidate is scored with 5-fold cross-validation.
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    # Ensemble of size 1.
    ensemble_size=1,
    # Additional metrics recorded for every run (shown in cv_results_).
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
)

In [None]:
# Run the F1-optimised search on the training split; the test split is handed
# to fit() as X_test / y_test.
automl1.fit(X_train, y_train, X_test=X_test, y_test=y_test)

In [19]:
# Report test-set metrics and search diagnostics for the F1-optimised model.
evaluate_model(model=automl1)

Accuracy score:  0.7057083188304908
Balanced Accuracy Score:  0.6739933748882548
Log Loss:  0.5793671308007703
Precision Score:  0.4101151727591387
Recall Score: :  0.6148648648648649
F1 Score:  0.49203965154701107
F1 (beta=2) Score:  0.5590443686006825
ROC Auc Score:  0.7348969665179451
Confusion Matrix: 
 [[3236 1178]
 [ 513  819]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               12               random_forest         0.493269         0.702058                  0.675184          0.407418       0.625068   0.493269             0.297942        0.739891
                7               random_forest         0.502510         0.717158                  0.681738          0.424613       0.615686   0.502510             0.282842        0.747412
       

# Build and fit classifier (random forest) with recall as target metric

In [11]:
# Same random-forest search as the baseline, but optimising recall.
automl2 = autosklearn.classification.AutoSklearnClassifier(
    # Overall budget; per_run_time_limit is left at the library default.
    time_left_for_this_task=3600,
    # Target metric for the search.
    metric=autosklearn.metrics.recall,
    # Search space: a single estimator family and no feature preprocessing.
    include_estimators=['random_forest'],
    include_preprocessors=['no_preprocessing'],
    # Each candidate is scored with 5-fold cross-validation.
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    # Ensemble of size 1.
    ensemble_size=1,
    # Additional metrics recorded for every run (shown in cv_results_).
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
)

In [12]:
# Run the recall-optimised search on the training split; the test split is
# handed to fit() as X_test / y_test.
automl2.fit(X_train, y_train, X_test=X_test, y_test=y_test)

AutoSklearnClassifier(ensemble_size=1, include_estimators=['random_forest'],
                      include_preprocessors=['no_preprocessing'], metric=recall,
                      per_run_time_limit=360, resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 5},
                      scoring_functions=[accuracy, balanced_accuracy, precision,
                                         recall, f1, custom_error, roc_auc])

In [20]:
# Report test-set metrics and search diagnostics for the recall-optimised model.
evaluate_model(model=automl2)

Accuracy score:  0.6928297946397494
Balanced Accuracy Score:  0.6700666457123186
Log Loss:  0.588719778275839
Precision Score:  0.3971496437054632
Recall Score: :  0.6276276276276276
F1 Score:  0.486470759383183
F1 (beta=2) Score:  0.5623570563702409
ROC Auc Score:  0.733087442902803
Confusion Matrix: 
 [[3145 1269]
 [ 496  836]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               27               random_forest         0.091762         0.779949                  0.539735          0.697542       0.091762   0.162003             0.220051        0.744662
               24               random_forest         0.227249         0.785214                  0.590455          0.598705       0.227249   0.329166             0.214786        0.742091
           

# Build and fit classifier (random forest) with ROC AUC as target metric



In [14]:
# Same random-forest search as the baseline, but optimising ROC AUC.
automl3 = autosklearn.classification.AutoSklearnClassifier(
    # Overall budget; per_run_time_limit is left at the library default.
    time_left_for_this_task=3600,
    # Target metric for the search.
    metric=autosklearn.metrics.roc_auc,
    # Search space: a single estimator family and no feature preprocessing.
    include_estimators=['random_forest'],
    include_preprocessors=['no_preprocessing'],
    # Each candidate is scored with 5-fold cross-validation.
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    # Ensemble of size 1.
    ensemble_size=1,
    # Additional metrics recorded for every run (shown in cv_results_).
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
)

In [None]:
# Run the ROC-AUC-optimised search on the training split; the test split is
# handed to fit() as X_test / y_test.
automl3.fit(X_train, y_train, X_test=X_test, y_test=y_test)

In [21]:
# Report test-set metrics and search diagnostics for the ROC-AUC-optimised model.
evaluate_model(model=automl3)

Accuracy score:  0.7098851374869475
Balanced Accuracy Score:  0.6754015002768967
Log Loss:  0.5763551702889685
Precision Score:  0.41467142129393786
Recall Score: :  0.6111111111111112
F1 Score:  0.4940819423368741
F1 (beta=2) Score:  0.5582224660540392
ROC Auc Score:  0.7353983741330818
Confusion Matrix: 
 [[3265 1149]
 [ 518  814]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               19               random_forest         0.742870         0.785170                  0.588723          0.601060       0.222370   0.324325             0.214830        0.742870
               17               random_forest         0.744231         0.707149                  0.682757          0.414617       0.637268   0.502295             0.292851        0.744231
       