In [None]:
 with!pip3 install auto-sklearn

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import autosklearn.classification
import numpy as np
import pandas as pd
import sklearn.metrics
from autosklearn.metrics import accuracy, balanced_accuracy, precision, recall, f1, roc_auc

# Define Score Function


In [None]:
def error(solution, prediction):
    """Return the fraction of predictions that differ from the true labels.

    Parameters
    ----------
    solution : array-like
        Ground-truth labels.
    prediction : array-like
        Predicted labels, in the same order as ``solution``.

    Returns
    -------
    numpy.float64
        Mean of the element-wise ``solution != prediction`` comparison,
        i.e. 0.0 when every label matches and 1.0 when none do.
    """
    # Coerce both inputs to arrays so the comparison is element-wise even for
    # plain Python lists (``list != list`` yields a single bool) and purely
    # positional for pandas Series (no index alignment).
    solution = np.asarray(solution)
    prediction = np.asarray(prediction)
    return np.mean(solution != prediction)

def get_metric_result(cv_results):
    """Tabulate the successful runs recorded in ``cv_results``.

    The mapping is turned into a DataFrame, rows whose ``status`` is not
    ``"Success"`` are dropped, and only the rank, classifier choice, mean
    test score and every ``metric_*`` column are returned.
    """
    frame = pd.DataFrame.from_dict(cv_results)
    succeeded = frame['status'] == "Success"

    # Fixed summary columns first, then one column per additional metric.
    metric_columns = [key for key in cv_results.keys() if key.startswith('metric_')]
    wanted = ['rank_test_scores', 'param_classifier:__choice__', 'mean_test_score'] + metric_columns

    return frame.loc[succeeded, wanted]

# Wrap `error` as an auto-sklearn scorer named 'custom_error'.
# It is declared as a loss (greater_is_better=False) whose best value is 0,
# and it takes neither probabilities nor decision thresholds as input.
error_rate = autosklearn.metrics.make_scorer(
    score_func=error,
    name='custom_error',
    greater_is_better=False,
    optimum=0,
    needs_threshold=False,
    needs_proba=False,
)

In [None]:
# Define the result-reporting function
def evaluate_model(model, X=None, y=None):
    """Print hold-out metrics and auto-sklearn run summaries for a fitted model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``, ``predict_proba``,
        ``cv_results_``, ``leaderboard()``, ``sprint_statistics()``,
        ``show_models()`` and ``get_models_with_weights()``.
    X, y : optional
        Features and true labels to score against. When omitted, the
        notebook-level ``X_test`` and ``y_test`` are used, which is the
        behaviour of a plain ``evaluate_model(model)`` call.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_pred = model.predict(X)
    # Compute the probabilities once and reuse them for both log loss and ROC AUC.
    y_proba = model.predict_proba(X)

    print("Accuracy score: ", sklearn.metrics.accuracy_score(y, y_pred))
    print('Balanced Accuracy Score: ', sklearn.metrics.balanced_accuracy_score(y, y_pred))
    print("Log Loss: ", sklearn.metrics.log_loss(y, y_proba))
    print("Precision Score: ", sklearn.metrics.precision_score(y, y_pred))
    print("Recall Score: : ", sklearn.metrics.recall_score(y, y_pred))
    print("F1 Score: ", sklearn.metrics.f1_score(y, y_pred))
    print("F1 (beta=2) Score: ", sklearn.metrics.fbeta_score(y, y_pred, beta=2))
    # NOTE(review): assumes a binary target whose positive class is the second
    # predict_proba column - confirm against the label encoding.
    print("ROC Auc Score: ", sklearn.metrics.roc_auc_score(y, y_proba[:, 1]))
    print('Confusion Matrix: \n', sklearn.metrics.confusion_matrix(y, y_pred))

    print("#" * 80)
    print("Metric results")
    print(get_metric_result(model.cv_results_).to_string(index=False))

    # Summaries reported by the fitted auto-sklearn object itself.
    print(model.leaderboard())
    print(model.sprint_statistics())
    print(model.show_models())
    print(model.get_models_with_weights())

## Data Loading



In [None]:
from google.colab import drive
from sklearn.model_selection import train_test_split

# Mount Google Drive so the feature file is readable from the Colab runtime.
drive.mount('/content/drive', force_remount=True)

# Load the feature table, which also carries the 'outcome' label column.
data = pd.read_csv(r'/content/drive/MyDrive/msc-project-data/features_with_outcome.csv', encoding='utf-8')

# Separate the label from the features; 'org_uuid' is dropped from the features too.
y_data = data['outcome']
X_data = data.drop(columns=['org_uuid', 'outcome'])

# Hold out 20% of the rows as a test set, stratified on the label.
# A fixed random_state keeps the split identical across kernel restarts so
# that the models trained below are evaluated on the same rows every run.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, stratify=y_data, random_state=RANDOM_STATE
)

Mounted at /content/drive


# Print a list of available classifiers

In [None]:
import autosklearn.pipeline.components.classification

# Print the name of every component returned by ClassifierChoice.get_components(), one per line.
classifier_components = autosklearn.pipeline.components.classification.ClassifierChoice.get_components()
for name in classifier_components:
    print(name)

adaboost
bernoulli_nb
decision_tree
extra_trees
gaussian_nb
gradient_boosting
k_nearest_neighbors
lda
liblinear_svc
libsvm_svc
mlp
multinomial_nb
passive_aggressive
qda
random_forest
sgd


## Print a list of available metrics

In [None]:
# Print a header line, then each entry of CLASSIFICATION_METRICS as a tab-indented bullet.
metric_bullet = "\t*"
print("Available CLASSIFICATION metrics autosklearn.metrics.*:")
print(metric_bullet + ("\n" + metric_bullet).join(autosklearn.metrics.CLASSIFICATION_METRICS))

Available CLASSIFICATION metrics autosklearn.metrics.*:
	*accuracy
	*balanced_accuracy
	*roc_auc
	*average_precision
	*log_loss
	*precision
	*precision_macro
	*precision_micro
	*precision_samples
	*precision_weighted
	*recall
	*recall_macro
	*recall_micro
	*recall_samples
	*recall_weighted
	*f1
	*f1_macro
	*f1_micro
	*f1_samples
	*f1_weighted


## Build and fit a classifier - gradient boosting - with no target metric



In [None]:
# Baseline run: `metric` is not passed, so auto-sklearn optimises its default
# target; `per_run_time_limit` is likewise left at its default.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=3600,
    # Restrict the search to gradient boosting with no feature preprocessing.
    include_estimators=['gradient_boosting'],
    include_preprocessors=['no_preprocessing'],
    # Score each candidate with 5-fold cross-validation.
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    ensemble_size=1,
    # Extra metrics recorded for every run in addition to the optimised one.
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
)

In [None]:
# Run the search on the training split; the hold-out split is handed over as X_test / y_test.
automl.fit(X=X_train, y=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Report hold-out test metrics, the cross-validation metric table and the auto-sklearn run summaries for the baseline model.
evaluate_model(automl)

Accuracy score:  0.7868082144100244
Balanced Accuracy Score:  0.5902261572855139
Log Loss:  0.46595426571772963
Precision Score:  0.6094069529652352
Recall Score: :  0.22372372372372373
F1 Score:  0.32729269632070296
F1 (beta=2) Score:  0.2561457796114836
ROC Auc Score:  0.7529379458751909
Confusion Matrix: 
 [[4223  191]
 [1034  298]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               14           gradient_boosting         0.781037         0.781037                  0.608892          0.553432       0.287860   0.378663             0.218963        0.731817
                4           gradient_boosting         0.789870         0.789870                  0.591453          0.634466       0.221429   0.328150             0.210130        0.754460
     

# Build and fit classifier (gradient boosting) with f1 as target metric

In [None]:
# Same search as the baseline, but with F1 as the metric auto-sklearn optimises;
# `per_run_time_limit` is left at its default.
automl1 = autosklearn.classification.AutoSklearnClassifier(
    metric=autosklearn.metrics.f1,
    time_left_for_this_task=3600,
    # Restrict the search to gradient boosting with no feature preprocessing.
    include_estimators=['gradient_boosting'],
    include_preprocessors=['no_preprocessing'],
    # Score each candidate with 5-fold cross-validation.
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    ensemble_size=1,
    # Extra metrics recorded for every run in addition to the optimised one.
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
)

In [None]:
# Run the F1-targeted search on the training split; the hold-out split is handed over as X_test / y_test.
automl1.fit(X=X_train, y=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Report hold-out test metrics, the cross-validation metric table and the auto-sklearn run summaries for the F1-targeted model.
evaluate_model(automl1)

Accuracy score:  0.7044900800556909
Balanced Accuracy Score:  0.6821118241032151
Log Loss:  0.5815350332984409
Precision Score:  0.41167953667953666
Recall Score: :  0.6403903903903904
F1 Score:  0.5011750881316098
F1 (beta=2) Score:  0.5763513513513512
ROC Auc Score:  0.7488415579149607
Confusion Matrix: 
 [[3195 1219]
 [ 479  853]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               33           gradient_boosting         0.378663         0.781037                  0.608892          0.553432       0.287860   0.378663             0.218963        0.731817
               32           gradient_boosting         0.381096         0.768548                  0.607575          0.501445       0.307379   0.381096             0.231452        0.700774
       

# Build and fit classifier (gradient boosting) with recall as target metric

In [None]:
# Same search as the baseline, but with recall as the metric auto-sklearn optimises;
# `per_run_time_limit` is left at its default.
automl2 = autosklearn.classification.AutoSklearnClassifier(
    metric=autosklearn.metrics.recall,
    time_left_for_this_task=3600,
    # Restrict the search to gradient boosting with no feature preprocessing.
    include_estimators=['gradient_boosting'],
    include_preprocessors=['no_preprocessing'],
    # Score each candidate with 5-fold cross-validation.
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    ensemble_size=1,
    # Extra metrics recorded for every run in addition to the optimised one.
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
)

In [None]:
# Run the recall-targeted search on the training split; the hold-out split is handed over as X_test / y_test.
automl2.fit(X=X_train, y=y_train, X_test=X_test, y_test=y_test)

AutoSklearnClassifier(ensemble_size=1, include_estimators=['gradient_boosting'],
                      include_preprocessors=['no_preprocessing'], metric=recall,
                      per_run_time_limit=360, resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 5},
                      scoring_functions=[accuracy, balanced_accuracy, precision,
                                         recall, f1, custom_error, roc_auc])

In [None]:
# Report hold-out test metrics, the cross-validation metric table and the auto-sklearn run summaries for the recall-targeted model.
evaluate_model(automl2)

Accuracy score:  0.6936999651931779
Balanced Accuracy Score:  0.6766613124225267
Log Loss:  0.592243672130228
Precision Score:  0.40027958993476237
Recall Score: :  0.6448948948948949
F1 Score:  0.4939620471535365
F1 (beta=2) Score:  0.5746588172330747
ROC Auc Score:  0.7442084699107807
Confusion Matrix: 
 [[3127 1287]
 [ 473  859]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               28           gradient_boosting         0.287860         0.781037                  0.608892          0.553432       0.287860   0.378663             0.218963        0.731817
                1           gradient_boosting         0.664290         0.694182                  0.683748          0.403249       0.664290   0.501851             0.305818        0.748867
        

# Build and fit classifier (gradient boosting) with ROC AUC as target metric

In [None]:
# Same search as the baseline, but with ROC AUC as the metric auto-sklearn optimises;
# `per_run_time_limit` is left at its default.
automl3 = autosklearn.classification.AutoSklearnClassifier(
    metric=autosklearn.metrics.roc_auc,
    time_left_for_this_task=3600,
    # Restrict the search to gradient boosting with no feature preprocessing.
    include_estimators=['gradient_boosting'],
    include_preprocessors=['no_preprocessing'],
    # Score each candidate with 5-fold cross-validation.
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    ensemble_size=1,
    # Extra metrics recorded for every run in addition to the optimised one.
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
)

In [None]:
# Run the ROC-AUC-targeted search on the training split; the hold-out split is handed over as X_test / y_test.
automl3.fit(X=X_train, y=y_train, X_test=X_test, y_test=y_test)

AutoSklearnClassifier(ensemble_size=1, include_estimators=['gradient_boosting'],
                      include_preprocessors=['no_preprocessing'],
                      metric=roc_auc, per_run_time_limit=360,
                      resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 5},
                      scoring_functions=[accuracy, balanced_accuracy, precision,
                                         recall, f1, custom_error, roc_auc])

In [None]:
# Report hold-out test metrics, the cross-validation metric table and the auto-sklearn run summaries for the ROC-AUC-targeted model.
evaluate_model(automl3)

Accuracy score:  0.7878524190741385
Balanced Accuracy Score:  0.5895953157507303
Log Loss:  0.46666757551391796
Precision Score:  0.6194503171247357
Recall Score: :  0.21996996996996998
F1 Score:  0.3246537396121884
F1 (beta=2) Score:  0.2525426650577487
ROC Auc Score:  0.7521786058827291
Confusion Matrix: 
 [[4234  180]
 [1039  293]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               37           gradient_boosting         0.731817         0.781037                  0.608892          0.553432       0.287860   0.378663             0.218963        0.731817
               44           gradient_boosting         0.700774         0.768548                  0.607575          0.501445       0.307379   0.381096             0.231452        0.700774
      