In [1]:
from pandas import DataFrame as df
from io import BytesIO
import pickle
import json
import pandas as pd
import io
import json
import re
import time

from azureml.automl.runtime.shared.model_wrappers import TruncatedSVDWrapper, SparseNormalizer, XGBoostClassifier, LightGBMClassifier
from azureml.train.automl.run import AutoMLRun
from azureml.core import Workspace, Experiment, Datastore, Dataset
from azureml.train.automl import AutoMLConfig
from azureml.core.compute_target import AbstractComputeTarget

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def get_models_from_ensemble(experiment, id):
    automl = AutoMLRun(experiment, id)
    details = automl.get_details()
    return json.loads(details['properties']['ensembled_run_ids'].replace("'",'"')), json.loads(details['properties']['ensemble_weights'].replace("'",'"'))

def get_model_from_string(model_string, parameters):
    if model_string == "LogisticRegression":
        return LogisticRegression(**parameters)
    elif model_string == "GradientBoostingClassifier":
        return GradientBoostingClassifier(**parameters) 
    elif model_string == "RandomForestClassifier":
        return RandomForestClassifier(**parameters) 
    elif model_string == "ExtraTreesClassifier":
        return ExtraTreesClassifier(**parameters)  
    elif model_string == "XGBoostClassifier":
        return XGBClassifier(**parameters)
        #return XGBoostClassifier(**parameters) 
    elif model_string == "LightGBMClassifier":
        #return LightGBMClassifier(**parameters)
        parameters['min_data_in_leaf'] = float(parameters['min_data_in_leaf'])
        return LGBMClassifier(**parameters)
    elif model_string == "SVCWrapper":
        parameters.update(dict(probability=True))
        return SVC(**parameters)    
    elif model_string == "TruncatedSVDWrapper":
        return TruncatedSVDWrapper(**parameters)
    elif model_string == "MaxAbsScaler":
        return MaxAbsScaler(**parameters)
    elif model_string == "StandardScaler":
        return StandardScaler(**parameters)    
    elif model_string == "SparseNormalizer":
        return SparseNormalizer(**parameters)   
    
def get_model_object(experiment, id):
    automl = AutoMLRun(experiment, id)
    automl_details = automl.get_details()
    pipeline = json.loads(automl_details['properties']['pipeline_spec'])['objects']
    specs = pipeline
    pipeline = [(item['class_name'], get_model_from_string(item['class_name'], item['param_kwargs'])) for item in pipeline]
    
    return pipeline, specs

def automated_check_and_run(previous_run_id, ds):
    # Check if previous run is completed
    while True:
        automl = AutoMLRun(experiment, previous_run_id)
        details = automl.get_details()
        if details["status"] == 'Completed':
            break
        time.sleep(120)
        
def from_url(url):
    return re.search("(AutoML.*)\?wsid=", url).group(1)
def get_details(url):
    id = re.search("(AutoML.*)\?wsid=", url).group(1)
    model = AutoMLRun(experiment, id)
    details = model.get_details()
    keys = details.keys()
    keys = [key for key in keys if key != "inputDatasets"]
    return [details[key] for key in keys]
    
    # Submit new run with dataset ds
    compute_target = AbstractComputeTarget(compute_target_type="Machine Learning compute", name="brca")
    automl_classifier = AutoMLConfig(
        task='classification',       
        primary_metric='AUC_weighted',
        training_data=ds,
        label_column_name="label",
        n_cross_validations=5,
        compute_target=compute_target
    )
    run = experiment.submit(automl_classifier, show_output=False)
    print("Now the new run has started, let upload new data")
    return run

ws = Workspace.from_config()
datastore_name = 'workspaceblobstore'
datastore = Datastore.get(ws, datastore_name)
experiment_name = 'brca'
experiment = Experiment(ws, experiment_name)

## Get AutoML results from Azure portal

In [2]:
# Get datasource
def get_automl_results(url):
    automl_id = from_url(url)
    automl_run = AutoMLRun(experiment, automl_id)
    automl_details = automl_run.get_details()
    startTimeUtc = automl_details['startTimeUtc']
    endTimeUtc = automl_details['endTimeUtc']
    automl_dataset_str = str(automl_details['inputDatasets'][0]['dataset'])
    automl_dataset_dict = json.loads(automl_dataset_str[14:])
    datasource = re.search(",.'(.*)'",automl_dataset_dict['source'][0]).group(1)
    # get child
    models_in_automl = [child_run.get_details() for child_run in automl_run.get_children()]
    models_in_automl
    for model in models_in_automl:
        try:
            model.pop("inputDatasets")
        except:
            pass
    results = dict(automl=automl_id,datasource=datasource, models=models_in_automl, startTimeUtc=startTimeUtc, endTimeUtc=endTimeUtc)
    return results

In [8]:
automl_urls = [
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_f1add042-43b6-47ae-9045-ee0b0e1f63d0?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_cd4b0346-9a83-45d8-b145-cdc1b580b463?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_11192533-c814-4034-8c3a-372ce61e2340?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_574777b7-d459-4c43-abfc-a2cd1dec228f?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_1c798080-4213-4372-b2d2-e8cd95b96dbc?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_9b16ca9c-953c-419b-9078-52553936b864?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_8bd49342-a15a-497b-801e-e78177e4d900?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_a6f3cdd5-65a1-4f14-9efd-203ec3fd2ae1?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_9464535e-a45d-4313-9a72-71e575d4df1d?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_1037f5bb-fb68-4ca1-b1e8-23b98152c06f?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_04d8d5a1-712e-4d1c-a138-e29287aa3d8a?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_eab8f1ba-9378-4a1a-a76e-6c439d44956f?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_dda3c01e-e609-40f0-91d3-bf4d294b0215?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_fca7acec-0377-4bc5-be4e-e4ca397584e0?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_cb6c3cef-cfbc-45dc-89cd-34279d5737e3?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_7ef6bce3-e5a9-47d5-852b-d5ef15b469a3?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_802c0c49-6c62-4321-bb0d-8349743945f1?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_15e2edad-940c-46f4-a1b0-ca3b3b3e6482?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_5ac421d7-3807-407d-9224-ef673d598b0e?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
    "https://ml.azure.com/experiments/id/730e5556-f857-4985-8e73-2236a1d25343/runs/AutoML_cdb4c08b-d797-4ed1-882f-5266e11e433f?wsid=/subscriptions/7c5fa516-2253-44b4-8d98-e605134ce2bf/resourcegroups/transcriptomic/workspaces/transcriptomic&tid=c152cb07-614e-4abb-818a-f035cfa91a77",
]
automl_results = []

In [9]:
for i, url in enumerate(automl_urls):
    automl_results.append(get_automl_results(url))
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [13]:
result_df = pd.concat([df(automl) for automl in automl_results])\
    .assign(model_id = lambda x: x['models'].map(lambda models: models['runId']))\
    .assign(task = lambda x: x['datasource'].str.split("/",expand=True)[1])\
    .assign(fold = lambda x: x['datasource'].str.split("/",expand=True)[2].astype('int'))\
    .assign(AutoML_startTimeUtc = lambda x: pd.to_datetime(x['startTimeUtc']))\
    .assign(AutoML_endTimeUtc = lambda x: pd.to_datetime(x['endTimeUtc']))\
    .drop(columns=["automl", "datasource"])\
    .assign(time_train = lambda x: pd.to_timedelta(x['AutoML_endTimeUtc']-x['AutoML_startTimeUtc']).dt.seconds)\
    .assign(pipeline_spec = lambda x: x['models'].map(lambda models: models['properties']['pipeline_spec']))\
    .assign(objects = lambda x: x['pipeline_spec'].map(lambda ps: json.loads(ps)['objects'] ))\
    .assign(model_name = lambda x: x['objects'].map(lambda objects: [object["class_name"] for object in objects]))\
    .assign(model_params = lambda x: x['objects'].map(lambda objects: [object["param_kwargs"] for object in objects]))\
    .assign(score = lambda x: x['models'].map(lambda models: float(models['properties']['score']) if "score" in models['properties'] else None))\
    .assign(not_ensemble_lgbm = lambda x: x["model_name"].map(lambda models: len([model for model in models if re.match(".*(Ensemble|LightGBMClassifier).*", model)])) == 0)\
    .assign(model_name = lambda x: x['model_name'].map(lambda model_name: ", ".join(model_name)))\
    .assign(pk_file = lambda x: "tmp/tmp_data_" + x['task'] + ".pk")\
    .loc[:, ["model_id", "model_name","not_ensemble_lgbm","task", "fold", "AutoML_startTimeUtc", "AutoML_endTimeUtc", "time_train","model_params", "score", "pk_file"]]

## Filtering top models without Ensemble and LightGBM

In [14]:
top_models = result_df.query("not_ensemble_lgbm").groupby(["task","fold"])['score'].max().to_frame()\
                .merge(result_df, how="inner", left_on="score", right_on="score")
top_models.to_pickle("top_models.pk")

## Evalulating models for c-statistic with nested cross-validation

In [16]:
results = []   

for i in top_models.index:
    model = top_models.iloc[i]
    with open(model['pk_file'], "rb") as f:
        X, y, outer = pickle.loads(f.read())
    outer_folds = [(train_index, test_index) for train_index, test_index in outer.split(X)]

    fold = model['fold']
    model_id = model['model_id']
    model_name = model["model_name"]
    print(f"Evaluating model {i}: {model_name}")
    try:
        pipeline, specs = get_model_object(experiment, model_id)

        train_index, test_index = outer_folds[fold]
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        train = pd.concat([X_train, y_train], axis=1)

        clf = Pipeline(pipeline)
        clf.fit(X_train, y_train)        
        y_test_predicted = clf.predict_proba(X_test)[:,1]
        score = roc_auc_score(y_test, y_test_predicted)
        result = dict(model_name=model_name, fold=fold, model_id=model_id, cv_score=score, error=None)
        results.append(result)
        print(f"successfully evaluated!, The score is {score}, original score is {model['score']}")
    except Exception as err:
        print("Error!!")
        results.append(dict(model_name=model_name, fold=fold, model_id=model_id, cv_score=None, error=str(err)))
        
    with open("results_tmp.pk", "wb") as f:
        f.write(pickle.dumps(results))

Evaluating model 0: MaxAbsScaler, XGBoostClassifier




successfully evaluated!, The score is 0.7298136645962733, original score is 0.6682322497066197
Evaluating model 1: StandardScaler, SVCWrapper
successfully evaluated!, The score is 0.6391734972677596, original score is 0.6580676828625699
Evaluating model 2: MaxAbsScaler, ExtraTreesClassifier
successfully evaluated!, The score is 0.5988838507150331, original score is 0.6769838120532803
Evaluating model 3: StandardScaler, XGBoostClassifier




successfully evaluated!, The score is 0.6579606440071557, original score is 0.6794583630039288
Evaluating model 4: RobustScaler, ExtraTreesClassifier
successfully evaluated!, The score is 0.6239697802197803, original score is 0.6636758307624472
Evaluating model 5: StandardScaler, SVCWrapper
successfully evaluated!, The score is 0.8446346280447663, original score is 0.8956565899484114
Evaluating model 6: StandardScaler, LogisticRegression




successfully evaluated!, The score is 0.8758169934640523, original score is 0.9041014867367408
Evaluating model 7: MaxAbsScaler, LogisticRegression
successfully evaluated!, The score is 0.8886287625418061, original score is 0.9142174369747899
Evaluating model 8: SparseNormalizer, XGBoostClassifier




successfully evaluated!, The score is 0.9002713704206241, original score is 0.9008537265429506
Evaluating model 9: MinMaxScaler, LogisticRegression
successfully evaluated!, The score is 0.9292857142857144, original score is 0.8958086574816437
Evaluating model 10: RobustScaler, SVCWrapper
successfully evaluated!, The score is 0.5760233918128655, original score is 0.7036008868994408
Evaluating model 11: MaxAbsScaler, LogisticRegression




successfully evaluated!, The score is 0.6588145896656534, original score is 0.7044431810363438
Evaluating model 12: TruncatedSVDWrapper, LogisticRegression




successfully evaluated!, The score is 0.6808, original score is 0.6806693805410108
Evaluating model 13: StandardScaler, SVCWrapper
successfully evaluated!, The score is 0.6653343023255813, original score is 0.7023299710559018
Evaluating model 14: SparseNormalizer, RandomForestClassifier
successfully evaluated!, The score is 0.68733153638814, original score is 0.6937567746464709
Evaluating model 15: SparseNormalizer, XGBoostClassifier




successfully evaluated!, The score is 0.7315051020408163, original score is 0.783437326766595
Evaluating model 16: MinMaxScaler, LogisticRegression




successfully evaluated!, The score is 0.7089460784313725, original score is 0.779120693006251
Evaluating model 17: StandardScaler, LogisticRegression
successfully evaluated!, The score is 0.7702205882352942, original score is 0.7621818870771524
Evaluating model 18: StandardScaler, LogisticRegression
successfully evaluated!, The score is 0.805458768873403, original score is 0.7749326388203153
Evaluating model 19: SparseNormalizer, XGBoostClassifier




successfully evaluated!, The score is 0.6974148061104584, original score is 0.7693286225228722


In [21]:
top_models_cv = df(results)\
.merge(top_models.drop(columns=["model_name", "fold", "pk_file"]), left_on="model_id", right_on="model_id")
top_models_cv

Unnamed: 0,model_name,fold,model_id,cv_score,error,score,not_ensemble_lgbm,task,AutoML_startTimeUtc,AutoML_endTimeUtc,time_train,model_params
0,"MaxAbsScaler, XGBoostClassifier",0,AutoML_1037f5bb-fb68-4ca1-b1e8-23b98152c06f_1,0.73,,0.67,True,O_clr_train_LUAD_stage,2021-08-31 11:13:35.217396+00:00,2021-08-31 13:55:20.661617+00:00,9705,"[{}, {'tree_method': 'auto'}]"
1,"StandardScaler, SVCWrapper",1,AutoML_9464535e-a45d-4313-9a72-71e575d4df1d_22,0.64,,0.66,True,O_clr_train_LUAD_stage,2021-08-31 12:38:12.388853+00:00,2021-08-31 15:16:51.044860+00:00,9518,"[{'with_mean': False, 'with_std': True}, {'C':..."
2,"MaxAbsScaler, ExtraTreesClassifier",2,AutoML_a6f3cdd5-65a1-4f14-9efd-203ec3fd2ae1_19,0.6,,0.68,True,O_clr_train_LUAD_stage,2021-08-31 12:54:30.704156+00:00,2021-08-31 15:24:49.195618+00:00,9018,"[{}, {'bootstrap': False, 'class_weight': 'bal..."
3,"StandardScaler, XGBoostClassifier",3,AutoML_11192533-c814-4034-8c3a-372ce61e2340_47,0.66,,0.68,True,O_clr_train_LUAD_stage,2021-08-31 17:44:24.245852+00:00,2021-08-31 21:21:16.337197+00:00,13012,"[{'with_mean': False, 'with_std': False}, {'bo..."
4,"RobustScaler, ExtraTreesClassifier",4,AutoML_cd4b0346-9a83-45d8-b145-cdc1b580b463_14,0.62,,0.66,True,O_clr_train_LUAD_stage,2021-08-31 17:47:41.161570+00:00,2021-08-31 21:13:00.362666+00:00,12319,"[{'quantile_range': [10, 90], 'with_centering'..."
5,"StandardScaler, SVCWrapper",0,AutoML_8bd49342-a15a-497b-801e-e78177e4d900_11,0.84,,0.9,True,O_clr_train_UCEC_grade,2021-08-31 13:56:10.816688+00:00,2021-08-31 16:18:39.934154+00:00,8549,"[{'with_mean': True, 'with_std': True}, {'C': ..."
6,"StandardScaler, LogisticRegression",1,AutoML_9b16ca9c-953c-419b-9078-52553936b864_28,0.88,,0.9,True,O_clr_train_UCEC_grade,2021-08-31 15:56:55.131983+00:00,2021-08-31 19:05:55.511415+00:00,11340,"[{'with_mean': True, 'with_std': True}, {'C': ..."
7,"MaxAbsScaler, LogisticRegression",2,AutoML_1c798080-4213-4372-b2d2-e8cd95b96dbc_41,0.89,,0.91,True,O_clr_train_UCEC_grade,2021-08-31 15:57:05.933055+00:00,2021-08-31 19:43:57.610999+00:00,13611,"[{}, {'C': 719.6856730011514, 'class_weight': ..."
8,"SparseNormalizer, XGBoostClassifier",3,AutoML_574777b7-d459-4c43-abfc-a2cd1dec228f_34,0.9,,0.9,True,O_clr_train_UCEC_grade,2021-08-31 16:28:31.434909+00:00,2021-08-31 20:11:52.916876+00:00,13401,"[{'norm': 'l2'}, {'booster': 'gbtree', 'colsam..."
9,"MinMaxScaler, LogisticRegression",4,AutoML_f1add042-43b6-47ae-9045-ee0b0e1f63d0_39,0.93,,0.9,True,O_clr_train_UCEC_grade,2021-08-31 19:12:18.850488+00:00,2021-08-31 22:59:35.447278+00:00,13636,"[{}, {'C': 4714.8663634573895, 'class_weight':..."


In [15]:
top_models_cv.to_csv("top_models_transcriptomic.csv")
top_models_cv.to_pickle("top_models_transcriptomic.pk")
top_models_cv.to_pickle("top_models_transcriptomic.pk")
result_df.to_pickle("all_models_transcriptomic.pk")