In [None]:
!pip3 install auto-sklearn

In [None]:
%matplotlib inline

In [None]:
import autosklearn.classification
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.metrics
from autosklearn.metrics import accuracy, balanced_accuracy, precision, recall, f1, roc_auc

# Define Score Function


In [None]:
def error(solution, prediction):
    """Return the misclassification rate.

    Compares ``solution`` (true labels) with ``prediction`` element-wise and
    returns the mean of the resulting mismatch mask, i.e. the fraction of
    entries that disagree.
    """
    mismatches = solution != prediction
    return np.mean(mismatches)

def get_metric_result(cv_results):
    """Summarise the successful runs recorded in a ``cv_results`` mapping.

    Keeps only rows whose ``status`` is ``"Success"`` and returns the rank,
    the chosen classifier, the mean test score and every column whose key
    starts with ``metric_``.
    """
    metric_columns = [name for name in cv_results.keys() if name.startswith('metric_')]
    selected = ['rank_test_scores', 'param_classifier:__choice__', 'mean_test_score'] + metric_columns

    frame = pd.DataFrame.from_dict(cv_results)
    succeeded = frame['status'] == "Success"
    return frame[succeeded][selected]

# Wrap `error` as an auto-sklearn scorer named 'custom_error' so it can be
# listed in `scoring_functions` alongside the built-in metrics.
# It is declared as a loss: lower is better (greater_is_better=False) and the
# best possible value is 0. Both needs_proba and needs_threshold are False, so
# it is not asking for probabilities or decision thresholds.
error_rate = autosklearn.metrics.make_scorer(
    name='custom_error',
    score_func=error,
    optimum=0,
    greater_is_better=False,
    needs_proba=False,
    needs_threshold=False
)

In [None]:
# Report hold-out metrics and search diagnostics for a fitted model.
def evaluate_model(model, X=None, y=None):
    """Print hold-out metrics and auto-sklearn search diagnostics for ``model``.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``, ``predict_proba``,
        ``cv_results_``, ``leaderboard``, ``sprint_statistics``,
        ``show_models`` and ``get_models_with_weights``.
    X, y : optional
        Features and true labels to evaluate on. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used, so existing
        ``evaluate_model(model)`` calls keep working.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_pred = model.predict(X)
    # Computed once and reused for both log loss and ROC AUC.
    y_proba = model.predict_proba(X)

    print("Accuracy score: ", sklearn.metrics.accuracy_score(y, y_pred))
    print('Balanced Accuracy Score: ', sklearn.metrics.balanced_accuracy_score(y, y_pred))
    print ("Log Loss: ", sklearn.metrics.log_loss(y, y_proba))
    print("Precision Score: ", sklearn.metrics.precision_score(y, y_pred))
    print("Recall Score: : ", sklearn.metrics.recall_score(y, y_pred))
    print("F1 Score: ", sklearn.metrics.f1_score(y, y_pred))
    print("F1 (beta=2) Score: ", sklearn.metrics.fbeta_score(y, y_pred, beta=2))
    # Column 1 is taken as the positive-class probability.
    # NOTE(review): assumes a binary target whose positive class is listed second.
    print("ROC Auc Score: ", sklearn.metrics.roc_auc_score(y, y_proba[:, 1]))
    print ('Confusion Matrix: \n', sklearn.metrics.confusion_matrix (y, y_pred))

    print("#" * 80)
    print("Metric results")
    print(get_metric_result(model.cv_results_).to_string(index=False))

    # Inside a function a bare expression is not displayed by the notebook,
    # so each diagnostic has to be printed explicitly to be visible.
    print(model.leaderboard().to_string())
    print(model.sprint_statistics())
    print(model.show_models())
    print(model.get_models_with_weights())

## Data Loading



In [None]:
from google.colab import drive
from sklearn.model_selection import train_test_split

# Mount Google Drive so the project CSV is reachable from the Colab runtime.
drive.mount('/content/drive', force_remount=True)

# Load the full feature table (one row per organisation, with its outcome label).
DATA_PATH = r'/content/drive/MyDrive/msc-project-data/features_with_outcome.csv'
data = pd.read_csv(DATA_PATH, encoding='utf-8')

# Target column, and the feature matrix without the identifier or the target.
y_data = data['outcome']
X_data = data.drop(columns=['org_uuid', 'outcome'])

# Hold out 20% as a test set, stratified on the outcome.
# A fixed seed makes the split (and every metric reported below) reproducible
# across kernel restarts.
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=RANDOM_SEED,
)

Mounted at /content/drive


# Print a list of available classifiers

In [None]:
import autosklearn.pipeline.components.classification

# Print the name of every classifier component auto-sklearn can choose from,
# one per line.
classifier_choice = autosklearn.pipeline.components.classification.ClassifierChoice
for component_name in classifier_choice.get_components():
    print(component_name)

adaboost
bernoulli_nb
decision_tree
extra_trees
gaussian_nb
gradient_boosting
k_nearest_neighbors
lda
liblinear_svc
libsvm_svc
mlp
multinomial_nb
passive_aggressive
qda
random_forest
sgd


## Print a list of available metrics

In [None]:
# Print the built-in classification metric names as a tab-indented bullet list.
metric_bullet = "\t*"
metric_names = autosklearn.metrics.CLASSIFICATION_METRICS
print("Available CLASSIFICATION metrics autosklearn.metrics.*:")
print(metric_bullet + ("\n" + metric_bullet).join(metric_names))

Available CLASSIFICATION metrics autosklearn.metrics.*:
	*accuracy
	*balanced_accuracy
	*roc_auc
	*average_precision
	*log_loss
	*precision
	*precision_macro
	*precision_micro
	*precision_samples
	*precision_weighted
	*recall
	*recall_macro
	*recall_micro
	*recall_samples
	*recall_weighted
	*f1
	*f1_macro
	*f1_micro
	*f1_samples
	*f1_weighted


## Build and fit a classifier (ensemble) with no target metric



In [None]:
# Baseline search: no `metric` is passed, so auto-sklearn optimises its default.
# NOTE(review): in the recorded output mean_test_score equals metric_accuracy,
# which suggests the default is accuracy — confirm for the installed version.
#
# - time_left_for_this_task=3600: overall budget for the search.
# - resampling_strategy='cv' with 5 folds: candidates are scored by 5-fold CV.
# - include_preprocessors=['no_preprocessing']: feature preprocessing is disabled.
# - scoring_functions: extra metrics recorded per run (the metric_* columns of
#   cv_results_); they do not change what is optimised.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=3600,
    #per_run_time_limit=720,
    #include_estimators =['gradient_boosting'],
    resampling_strategy ='cv',
    resampling_strategy_arguments={'folds': 5},
    #ensemble_size=1,
    include_preprocessors = ['no_preprocessing'],
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
    #metric=autosklearn.metrics.recall
)

In [None]:
# Fit the classifier: run the search on the training split.
# NOTE(review): X_test / y_test are passed as fit's optional hold-out arguments;
# per the auto-sklearn docs they are only used to track test performance, not
# for training — confirm for the installed version.
automl.fit(X_train, y_train, X_test, y_test)

AutoSklearnClassifier(include_preprocessors=['no_preprocessing'],
                      per_run_time_limit=360, resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 5},
                      scoring_functions=[accuracy, balanced_accuracy, precision,
                                         recall, f1, custom_error, roc_auc])

## Get the Score of the final ensemble



In [None]:
# Report test-set metrics and search diagnostics for the baseline model.
evaluate_model(automl)

Accuracy score:  0.7923773059519665
Balanced Accuracy Score:  0.5893952969734574
Log Loss:  0.4680562249259098
Precision Score:  0.6643026004728132
Recall Score: :  0.21096096096096095
F1 Score:  0.3202279202279202
F1 (beta=2) Score:  0.24430533820205177
ROC Auc Score:  0.7578182509650566
Confusion Matrix: 
 [[4272  142]
 [1051  281]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               27                         sgd         0.631304         0.631304                  0.469042          0.180540       0.166444   0.172958             0.368696        0.421670
               28          passive_aggressive         0.620861         0.620861                  0.582336          0.337617       0.510453   0.381005             0.379139        0.630803
      

# Build and fit classifier - ensemble - with f1 as target metric

In [None]:
# Set up classifier with F1 as the target metric (metric=autosklearn.metrics.f1).
# All other settings are the same as the baseline search: 3600 budget, 5-fold CV,
# no feature preprocessing, and the same extra scoring_functions recorded per run.
automl1 = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=3600,
    #per_run_time_limit=720,
    #include_estimators =['gradient_boosting'],
    resampling_strategy ='cv',
    resampling_strategy_arguments={'folds': 5},
    #ensemble_size=1,
    include_preprocessors = ['no_preprocessing'],
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
    metric=autosklearn.metrics.f1
)

In [None]:
# Run the F1-targeted search on the training split.
# NOTE(review): X_test / y_test are fit's optional hold-out arguments; per the
# auto-sklearn docs they only track test performance — confirm.
automl1.fit(X_train,y_train,X_test,y_test)

AutoSklearnClassifier(include_preprocessors=['no_preprocessing'], metric=f1,
                      per_run_time_limit=360, resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 5},
                      scoring_functions=[accuracy, balanced_accuracy, precision,
                                         recall, f1, custom_error, roc_auc])

In [None]:
# Report test-set metrics and search diagnostics for the F1-targeted model.
evaluate_model(automl1)

Accuracy score:  0.7312913331012878
Balanced Accuracy Score:  0.6874997448740086
Log Loss:  0.585400010860937
Precision Score:  0.4419496166484118
Recall Score: :  0.6058558558558559
F1 Score:  0.5110829639012033
F1 (beta=2) Score:  0.5640201285993849
ROC Auc Score:  0.7582943160650456
Confusion Matrix: 
 [[3395 1019]
 [ 525  807]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               35                         sgd         0.172958         0.631304                  0.469042          0.180540       0.166444   0.172958             0.368696        0.421670
               28               random_forest         0.337676         0.785997                  0.593843          0.597468       0.235500   0.337676             0.214003        0.734419
         

# Build and fit classifier - ensemble - with recall as target metric

In [None]:
# Set up classifier with recall as the target metric
# (metric=autosklearn.metrics.recall).
# All other settings are the same as the baseline search: 3600 budget, 5-fold CV,
# no feature preprocessing, and the same extra scoring_functions recorded per run.
automl2 = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=3600,
    #per_run_time_limit=720,
    #include_estimators =['gradient_boosting'],
    resampling_strategy ='cv',
    resampling_strategy_arguments={'folds': 5},
    #ensemble_size=1,
    include_preprocessors = ['no_preprocessing'],
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
    metric=autosklearn.metrics.recall
)

In [None]:
# Run the recall-targeted search on the training split.
# NOTE(review): X_test / y_test are fit's optional hold-out arguments; per the
# auto-sklearn docs they only track test performance — confirm.
automl2.fit(X_train,y_train,X_test,y_test)

AutoSklearnClassifier(include_preprocessors=['no_preprocessing'], metric=recall,
                      per_run_time_limit=360, resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 5},
                      scoring_functions=[accuracy, balanced_accuracy, precision,
                                         recall, f1, custom_error, roc_auc])

In [None]:
# Report test-set metrics and search diagnostics for the recall-targeted model.
evaluate_model(automl2)

Accuracy score:  0.4718064740689175
Balanced Accuracy Score:  0.5938278559483815
Log Loss:  0.6700164959324784
Precision Score:  0.2811616551015163
Recall Score: :  0.8213213213213213
F1 Score:  0.41891633161018565
F1 (beta=2) Score:  0.5933398416314133
ROC Auc Score:  0.7124121176001557
Confusion Matrix: 
 [[1617 2797]
 [ 238 1094]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
                3           gradient_boosting         0.662035         0.693007                  0.682195          0.401821       0.662035   0.500004             0.306993        0.747493
               24                         sgd         0.331582         0.443236                  0.404263          0.160647       0.331582   0.216334             0.556764        0.353826
       

# Build and fit classifier - ensemble - with ROC AUC as target metric

In [None]:
# Set up classifier with ROC AUC as the target metric
# (metric=autosklearn.metrics.roc_auc).
# All other settings are the same as the baseline search: 3600 budget, 5-fold CV,
# no feature preprocessing, and the same extra scoring_functions recorded per run.
automl3 = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=3600,
    #per_run_time_limit=720,
    #include_estimators =['gradient_boosting'],
    resampling_strategy ='cv',
    resampling_strategy_arguments={'folds': 5},
    #ensemble_size=1,
    include_preprocessors = ['no_preprocessing'],
    scoring_functions=[accuracy, balanced_accuracy, precision, recall, f1, error_rate, roc_auc],
    metric=autosklearn.metrics.roc_auc
)

In [None]:
# Run the ROC-AUC-targeted search on the training split.
# NOTE(review): X_test / y_test are fit's optional hold-out arguments; per the
# auto-sklearn docs they only track test performance — confirm.
automl3.fit(X_train,y_train,X_test,y_test)

AutoSklearnClassifier(include_preprocessors=['no_preprocessing'],
                      metric=roc_auc, per_run_time_limit=360,
                      resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 5},
                      scoring_functions=[accuracy, balanced_accuracy, precision,
                                         recall, f1, custom_error, roc_auc])

In [None]:
# Report test-set metrics and search diagnostics for the ROC-AUC-targeted model.
evaluate_model(automl3)

Accuracy score:  0.7923773059519665
Balanced Accuracy Score:  0.6004034732512304
Log Loss:  0.5320959522819054
Precision Score:  0.6370808678500987
Recall Score: :  0.2424924924924925
F1 Score:  0.3512778684067428
F1 (beta=2) Score:  0.2767780634104542
ROC Auc Score:  0.7583230602600789
Confusion Matrix: 
 [[4230  184]
 [1009  323]]
################################################################################
Metric results
 rank_test_scores param_classifier:__choice__  mean_test_score  metric_accuracy  metric_balanced_accuracy  metric_precision  metric_recall  metric_f1  metric_custom_error  metric_roc_auc
               34                         sgd         0.421670         0.631304                  0.469042          0.180540       0.166444   0.172958             0.368696        0.421670
               27                         qda         0.691916         0.768853                  0.502971          0.647824       0.007131   0.014099             0.231147        0.691916
        

In [52]:
# Show the (weight, pipeline) pairs that make up the F1-targeted ensemble.
automl1.get_models_with_weights()

[(0.14,
  SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'gradient_boosting', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'robust_scaler', 'feature_preprocessor:__choice__': 'no_preprocessing', 'classifier:gradient_boosting:early_stop': 'valid', 'classifier:gradient_boosting:l2_regularization': 1e-10, 'classifier:gradient_boosting:learning_rate': 0.10966957560713958, 'classifier:gradient_boosting:loss': 'auto', 'classifier:gradient_boosting:max_bins': 255, 'classifier:gradient_boosting:max_depth': 'None', 'classifier:gradient_boosting:max_leaf_nodes': 50, 'classifier:gradient_boosting:min_samples_leaf': 37, 'classifier:gradient_boosting:scoring': 'l

In [None]:
!pip install shap

In [56]:
import shap

In [None]:
# Build a SHAP explainer for the F1-targeted ensemble.
# NOTE(review): only the estimator object is passed here — no prediction
# callable and no background data / masker. shap.Explainer may not recognise
# an AutoSklearnClassifier directly and could raise; something like
# `automl1.predict_proba` plus a sample of X_train is probably needed — confirm.
shap.Explainer(automl1)