In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from pipeline_utils import (
    COMPUTE, CONCURRENT_TRIALS, DELAY_EVALUATION, EVALUATION_INTERVAL, GOAL, METRIC, SAMPLING_ALGORITHM, TIMEOUT, TIMEOUT_PLUS, TOTAL_TRIALS, #SWEEP
    SUBSCRIPTION, RESOURCE_GROUP, WS_NAME,  #AUTHENTICATE
    GBC_PATH, NBC_PATH, RFC_PATH, SVC_PATH, XGB_PATH, MULT_NBC_PATH,    #COMPONENTS PATHS
    PEARSON_PATH,    #FEAT SEL PATHS
    PREP_DATA_PATH,  #PREP DATA PATH
    GBC_BY_PEARSON, NBC_BY_PEARSON, RFC_BY_PEARSON, SVC_BY_PEARSON, XGB_BY_PEARSON, MULT_NBC_BY_PEARSON,   #PIPELINES
    PEARSON,   #FEAT SEL
    N_FEATURES, TRAIN_DATA, TEST_DATA, DATA_VERSION, #PIPELINE VALUES
    get_experiment_names,   #FUNCTIONS
)

# Credential object handed to the workspace client below.
credential = DefaultAzureCredential()

# Workspace client shared by every later cell; the subscription, resource
# group and workspace name all come from pipeline_utils.
ml_client = MLClient(
    workspace_name=WS_NAME,
    resource_group_name=RESOURCE_GROUP,
    subscription_id=SUBSCRIPTION,
    credential=credential,
)

In [2]:
# The registered asset name is the dataset file name up to its first dot;
# both assets are fetched at the same DATA_VERSION.
data_to_train = ml_client.data.get(
    version=DATA_VERSION,
    name=TRAIN_DATA.split('.')[0],
)
print(f"Data to train asset URI: {data_to_train.path} - name: {TRAIN_DATA.split('.')[0]}")

data_to_test = ml_client.data.get(
    version=DATA_VERSION,
    name=TEST_DATA.split('.')[0],
)
print(f"Data to test asset URI: {data_to_test.path} - name: {TEST_DATA.split('.')[0]}")

Data to train asset URI: azureml://subscriptions/da6ec459-95c4-4f18-8440-d275df8d38b7/resourcegroups/tcc-exp-rg/workspaces/tcc-experiments/datastores/workspaceblobstore/paths/LocalUpload/3cfdebd258ea2db7363c55bb841ca887/vrex_encoded_tf_idf_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017.csv - name: vrex_encoded_tf_idf_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017
Data to test asset URI: azureml://subscriptions/da6ec459-95c4-4f18-8440-d275df8d38b7/resourcegroups/tcc-exp-rg/workspaces/tcc-experiments/datastores/workspaceblobstore/paths/LocalUpload/3992df39ce41e34367e629d6656f4e9f/vrex_encoded_tf_idf_2018_2019_2020_2021.csv - name: vrex_encoded_tf_idf_2018_2019_2020_2021


In [3]:
# importing the Component Package
from azure.ai.ml import load_component

def _register_component(source_path):
    """Load the component spec at `source_path` and pass it to `ml_client.create_or_update`.

    Returns whatever `create_or_update` returns for that component.
    """
    return ml_client.create_or_update(load_component(source=source_path))


# Shared steps used by every pipeline below.
data_prep_component = _register_component(PREP_DATA_PATH)
feat_sel_component = _register_component(PEARSON_PATH)

# One training component per classifier, registered in the same order as before.
train_gbc = _register_component(GBC_PATH)
train_nbc = _register_component(NBC_PATH)
train_rfc = _register_component(RFC_PATH)
train_svc = _register_component(SVC_PATH)
train_xgb = _register_component(XGB_PATH)
mult_nbc_train = _register_component(MULT_NBC_PATH)


In [4]:

# the dsl decorator tells the sdk that we are defining an Azure Machine Learning pipeline
from azure.ai.ml import dsl, Input, Output
from azure.ai.ml.sweep import Choice, Uniform, MedianStoppingPolicy

In [5]:
# GBC pipeline: data preparation -> feature selection -> hyperparameter sweep
# over the GBC training component.
#
# Inputs (supplied when the pipeline is instantiated):
#   data_to_train, data_to_test        -- forwarded to the data-prep component.
#   feature_quantity                   -- forwarded to the feature-selection component.
#   flag_remove_null_values,
#   flag_remove_values_by_percentage,
#   percentage_to_remove_column        -- forwarded to the data-prep component.
#
# The function body has no return statement.
# NOTE(review): the local names of the steps below may be picked up by the DSL
# as step names in the submitted job -- confirm before renaming them.
@dsl.pipeline(
    name=GBC_BY_PEARSON,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_gbc_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
    flag_remove_null_values,
    flag_remove_values_by_percentage,
    percentage_to_remove_column,
):

    # Step 1: data preparation on the raw train/test inputs.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=flag_remove_null_values,
        flag_remove_values_by_percentage=flag_remove_values_by_percentage,
        percentage_to_remove_column=percentage_to_remove_column,
    )

    # Step 2: feature selection consumes the prepared train/test outputs.
    feat_sel_job = feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training node fed by the feature-selected data. Both
    # hyperparameters are given as search spaces instead of fixed values:
    # a discrete choice for n_estimators and a uniform range for learning rate.
    train_gbc_job = train_gbc(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel,  
        n_estimators_to_gbc=Choice(values=[50, 100, 200]),
        learning_rate_to_gbc=Uniform(min_value=0.01, max_value=0.3),
    )

    # Convert the training node into a sweep over the search spaces above.
    sweep_step_to_gbc = train_gbc_job.sweep(
        compute=COMPUTE,
        sampling_algorithm=SAMPLING_ALGORITHM,
        primary_metric=METRIC,
        goal=GOAL,
    )

    # Trial budget is twice TOTAL_TRIALS; concurrency and timeout come from pipeline_utils.
    sweep_step_to_gbc.set_limits(max_total_trials=2*TOTAL_TRIALS, max_concurrent_trials=CONCURRENT_TRIALS, timeout=TIMEOUT)

    # Early termination uses a median stopping policy configured from pipeline_utils.
    sweep_step_to_gbc.early_termination = MedianStoppingPolicy(delay_evaluation=DELAY_EVALUATION, evaluation_interval=EVALUATION_INTERVAL)


In [6]:
# NBC pipeline: data preparation -> feature selection -> NBC training.
# Unlike the other pipelines in this notebook, no hyperparameters are passed
# to the training component and no sweep is configured.
#
# Inputs (supplied when the pipeline is instantiated):
#   data_to_train, data_to_test        -- forwarded to the data-prep component.
#   feature_quantity                   -- forwarded to the feature-selection component.
#   flag_remove_null_values,
#   flag_remove_values_by_percentage,
#   percentage_to_remove_column        -- forwarded to the data-prep component.
#
# The function body has no return statement.
# NOTE(review): the local names of the steps below may be picked up by the DSL
# as step names in the submitted job -- confirm before renaming them.
@dsl.pipeline(
    name=NBC_BY_PEARSON,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_nbc_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
    flag_remove_null_values,
    flag_remove_values_by_percentage,
    percentage_to_remove_column,
):

    # Step 1: data preparation on the raw train/test inputs.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=flag_remove_null_values,
        flag_remove_values_by_percentage=flag_remove_values_by_percentage,
        percentage_to_remove_column=percentage_to_remove_column,
    )

    # Step 2: feature selection consumes the prepared train/test outputs.
    feat_sel_job = feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training on the feature-selected data. The result is bound to a
    # local that is not used again in this function.
    train_nbc_job = train_nbc(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel,  
    )

In [7]:
# RFC pipeline: data preparation -> feature selection -> hyperparameter sweep
# over the RFC training component.
#
# Inputs (supplied when the pipeline is instantiated):
#   data_to_train, data_to_test        -- forwarded to the data-prep component.
#   feature_quantity                   -- forwarded to the feature-selection component.
#   flag_remove_null_values,
#   flag_remove_values_by_percentage,
#   percentage_to_remove_column        -- forwarded to the data-prep component.
#
# The function body has no return statement.
# NOTE(review): the local names of the steps below may be picked up by the DSL
# as step names in the submitted job -- confirm before renaming them.
@dsl.pipeline(
    name=RFC_BY_PEARSON,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_rfc_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
    flag_remove_null_values,
    flag_remove_values_by_percentage,
    percentage_to_remove_column,
):

    # Step 1: data preparation on the raw train/test inputs.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=flag_remove_null_values,
        flag_remove_values_by_percentage=flag_remove_values_by_percentage,
        percentage_to_remove_column=percentage_to_remove_column,
    )

    # Step 2: feature selection consumes the prepared train/test outputs.
    feat_sel_job = feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training node fed by the feature-selected data; n_estimators is
    # the only swept hyperparameter, given as a discrete choice.
    train_rfc_job = train_rfc(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel, 
        n_estimators_to_rfc=Choice(values=[50, 100, 200]),
    )

    # Convert the training node into a sweep over the search space above.
    sweep_step_to_rfc = train_rfc_job.sweep(
        compute=COMPUTE,
        sampling_algorithm=SAMPLING_ALGORITHM,
        primary_metric=METRIC,
        goal=GOAL,
    )

    # Trial budget is TOTAL_TRIALS (unscaled); early termination uses a median stopping policy.
    sweep_step_to_rfc.set_limits(max_total_trials=TOTAL_TRIALS, max_concurrent_trials=CONCURRENT_TRIALS, timeout=TIMEOUT)
    sweep_step_to_rfc.early_termination = MedianStoppingPolicy(delay_evaluation=DELAY_EVALUATION, evaluation_interval=EVALUATION_INTERVAL)

In [8]:
# SVC pipeline: data preparation -> feature selection -> hyperparameter sweep
# over the SVC training component.
#
# Inputs (supplied when the pipeline is instantiated):
#   data_to_train, data_to_test        -- forwarded to the data-prep component.
#   feature_quantity                   -- forwarded to the feature-selection component.
#   flag_remove_null_values,
#   flag_remove_values_by_percentage,
#   percentage_to_remove_column        -- forwarded to the data-prep component.
#
# The function body has no return statement.
# NOTE(review): the local names of the steps below may be picked up by the DSL
# as step names in the submitted job -- confirm before renaming them.
@dsl.pipeline(
    name=SVC_BY_PEARSON,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_svc_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
    flag_remove_null_values,
    flag_remove_values_by_percentage,
    percentage_to_remove_column,
):

    # Step 1: data preparation on the raw train/test inputs.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=flag_remove_null_values,
        flag_remove_values_by_percentage=flag_remove_values_by_percentage,
        percentage_to_remove_column=percentage_to_remove_column,
    )

    # Step 2: feature selection consumes the prepared train/test outputs.
    feat_sel_job = feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training node fed by the feature-selected data.
    # "precomputed" is intentionally not part of the kernel search space: with
    # scikit-learn's SVC that kernel expects a square (n_samples x n_samples)
    # kernel matrix as input, not a table of selected feature columns, so any
    # trial sampling it would fail and burn part of the trial budget.
    # NOTE(review): assumes the SVC component passes kernel_to_svc straight to
    # sklearn.svm.SVC -- confirm against the component source.
    train_svc_job = train_svc(
        train_data=feat_sel_job.outputs.train_data_feat_sel,
        test_data=feat_sel_job.outputs.test_data_feat_sel,
        kernel_to_svc=Choice(values=["linear", "rbf", "poly", "sigmoid"]),
        gamma_to_svc=Choice(values=["scale", "auto"]),
    )

    # Convert the training node into a sweep over the search spaces above.
    sweep_step_to_svc = train_svc_job.sweep(
        compute=COMPUTE,
        sampling_algorithm=SAMPLING_ALGORITHM,
        primary_metric=METRIC,
        goal=GOAL,
    )

    # Trial budget is four times TOTAL_TRIALS and the timeout is twice TIMEOUT_PLUS.
    sweep_step_to_svc.set_limits(max_total_trials=4*TOTAL_TRIALS, max_concurrent_trials=CONCURRENT_TRIALS, timeout=2*TIMEOUT_PLUS)
    # Early termination uses a median stopping policy configured from pipeline_utils.
    sweep_step_to_svc.early_termination = MedianStoppingPolicy(delay_evaluation=DELAY_EVALUATION, evaluation_interval=EVALUATION_INTERVAL)

In [9]:
# XGB pipeline: data preparation -> feature selection -> hyperparameter sweep
# over the XGB training component.
#
# Inputs (supplied when the pipeline is instantiated):
#   data_to_train, data_to_test        -- forwarded to the data-prep component.
#   feature_quantity                   -- forwarded to the feature-selection component.
#   flag_remove_null_values,
#   flag_remove_values_by_percentage,
#   percentage_to_remove_column        -- forwarded to the data-prep component.
#
# The function body has no return statement.
# NOTE(review): the local names of the steps below may be picked up by the DSL
# as step names in the submitted job -- confirm before renaming them.
@dsl.pipeline(
    name=XGB_BY_PEARSON,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_xgb_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
    flag_remove_null_values,
    flag_remove_values_by_percentage,
    percentage_to_remove_column,
):

    # Step 1: data preparation on the raw train/test inputs.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=flag_remove_null_values,
        flag_remove_values_by_percentage=flag_remove_values_by_percentage,
        percentage_to_remove_column=percentage_to_remove_column,
    )

    # Step 2: feature selection consumes the prepared train/test outputs.
    feat_sel_job = feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training node fed by the feature-selected data. Both
    # hyperparameters are given as search spaces instead of fixed values:
    # a discrete choice for n_estimators and a uniform range for learning rate.
    train_xgb_job = train_xgb(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel,   
        n_estimators_to_xgb=Choice(values=[100, 500, 1000]),
        learning_rate_to_xgb=Uniform(min_value=0.01, max_value=0.3),
    )

    # Convert the training node into a sweep over the search spaces above.
    sweep_step_to_xgb = train_xgb_job.sweep(
        compute=COMPUTE,
        sampling_algorithm=SAMPLING_ALGORITHM,
        primary_metric=METRIC,
        goal=GOAL,
    )

    # Trial budget is twice TOTAL_TRIALS with the TIMEOUT_PLUS timeout; early
    # termination uses a median stopping policy.
    sweep_step_to_xgb.set_limits(max_total_trials=2*TOTAL_TRIALS, max_concurrent_trials=CONCURRENT_TRIALS, timeout=TIMEOUT_PLUS)
    sweep_step_to_xgb.early_termination = MedianStoppingPolicy(delay_evaluation=DELAY_EVALUATION, evaluation_interval=EVALUATION_INTERVAL)

In [10]:
# MULT_NBC pipeline: data preparation -> feature selection -> hyperparameter
# sweep over the MULT_NBC training component.
#
# Inputs (supplied when the pipeline is instantiated):
#   data_to_train, data_to_test        -- forwarded to the data-prep component.
#   feature_quantity                   -- forwarded to the feature-selection component.
#   flag_remove_null_values,
#   flag_remove_values_by_percentage,
#   percentage_to_remove_column        -- forwarded to the data-prep component.
#
# The function body has no return statement.
# NOTE(review): the local names of the steps below may be picked up by the DSL
# as step names in the submitted job -- confirm before renaming them.
@dsl.pipeline(
    name=MULT_NBC_BY_PEARSON,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_mult_nbc_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
    flag_remove_null_values,
    flag_remove_values_by_percentage,
    percentage_to_remove_column,
):

    # Step 1: data preparation on the raw train/test inputs.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=flag_remove_null_values,
        flag_remove_values_by_percentage=flag_remove_values_by_percentage,
        percentage_to_remove_column=percentage_to_remove_column,
    )

    # Step 2: feature selection consumes the prepared train/test outputs.
    feat_sel_job = feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training node fed by the feature-selected data; alpha and
    # fit_prior are both given as discrete choices to sweep over.
    mult_nbc_train_job = mult_nbc_train(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel,  
        alpha=Choice(values=[0.01, 0.1, 0.5, 1.0, 2, 5]),
        fit_prior=Choice(values=[True, False]),
    )

    # Convert the training node into a sweep over the search spaces above.
    sweep_step_to_mult_nbc = mult_nbc_train_job.sweep(
        compute=COMPUTE,
        sampling_algorithm=SAMPLING_ALGORITHM,
        primary_metric=METRIC,
        goal=GOAL,
    )

    # Trial budget is twice TOTAL_TRIALS with the TIMEOUT_PLUS timeout.
    sweep_step_to_mult_nbc.set_limits(max_total_trials=2*TOTAL_TRIALS, max_concurrent_trials=CONCURRENT_TRIALS, timeout=TIMEOUT_PLUS)

    # Early termination uses a median stopping policy configured from pipeline_utils.
    sweep_step_to_mult_nbc.early_termination = MedianStoppingPolicy(delay_evaluation=DELAY_EVALUATION, evaluation_interval=EVALUATION_INTERVAL)


In [11]:
# Pipeline builders in the order their jobs are appended for each feature count.
# NOTE(review): the submission cell pairs `pipelines` with experiment names by
# position (zip), so this order presumably has to match the order produced by
# get_experiment_names() -- confirm before reordering or adding entries.
pipeline_builders = (
    train_mult_nbc_pipeline,
    train_nbc_pipeline,
    train_gbc_pipeline,
    train_rfc_pipeline,
    train_svc_pipeline,
    train_xgb_pipeline,
)

pipelines = []

# For every feature count, instantiate each pipeline once with identical
# settings: the registered train/test assets as uri_file inputs and all
# data-prep filtering options turned off.
for n_feature in N_FEATURES:
    for build_pipeline in pipeline_builders:
        pipelines.append(build_pipeline(
            # A fresh Input object is created for every pipeline instance.
            data_to_train=Input(type="uri_file", path=data_to_train.path),
            data_to_test=Input(type="uri_file", path=data_to_test.path),
            feature_quantity=n_feature,
            flag_remove_null_values=False,
            flag_remove_values_by_percentage=False,
            percentage_to_remove_column=0,
        ))


In [12]:
# Materialise the names so they can be counted before anything is submitted.
experiment_names = list(get_experiment_names(PEARSON))

# zip() stops at the shorter sequence, so a count mismatch would silently skip
# pipelines or pair them with the wrong experiment name. Fail fast instead of
# discovering that after long-running jobs have been submitted.
if len(experiment_names) != len(pipelines):
    raise ValueError(
        "Expected one experiment name per pipeline, got "
        f"{len(experiment_names)} names for {len(pipelines)} pipelines"
    )

# Submit each pipeline under its own experiment name, pairing by position.
for pipeline, experiment_name in zip(pipelines, experiment_names):
    pipeline_job = ml_client.jobs.create_or_update(
        pipeline,
        experiment_name=experiment_name,
    )
    # Stream this job's logs before moving on to the next submission.
    ml_client.jobs.stream(pipeline_job.name)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Pearson_MultGaussianNB_N_FEAT_10_DATE_2024_07_02_13_33_39
Pearson_GaussianNB_N_FEAT_10_DATE_2024_07_02_13_33_39
Pearson_GradientBoostingClassifier_N_FEAT_10_DATE_2024_07_02_13_33_39
Pearson_RandomForestClassifier_N_FEAT_10_DATE_2024_07_02_13_33_39
Pearson_SVC_N_FEAT_10_DATE_2024_07_02_13_33_39
Pearson_XGBoost_N_FEAT_10_DATE_2024_07_02_13_33_39
Pearson_MultGaussianNB_N_FEAT_5_DATE_2024_07_02_13_33_39
Pearson_GaussianNB_N_FEAT_5_DATE_2024_07_02_13_33_39
Pearson_GradientBoostingClassifier_N_FEAT_5_DATE_2024_07_02_13_33_39
Pearson_RandomForestClassifier_N_FEAT_5_DATE_2024_07_02_13_33_39
Pearson_SVC_N_FEAT_5_DATE_2024_07_02_13_33_39
Pearson_XGBoost_N_FEAT_5_DATE_2024_07_02_13_33_39
RunId: serene_beard_vbvwxyzywc
Web View: https://ml.azure.com/runs/serene_beard_vbvwxyzywc?wsid=/subscriptions/da6ec459-95c4-4f18-8440-d275df8d38b7/resourcegroups/tcc-exp-rg/workspaces/tcc-experiments

Streaming logs/azureml/executionlogs.txt

[2024-07-02 13:33:44Z] Submitting 1 runs, first five are: 0e595859:6db

Bad pipe message: %s [b'\x98\x93s[\xce|\xed\xfa2\xddbu\xdc\xef\xfd\xc9t\x04 \xc0\x9a\x8c;gv\xcc\x01\xb2\xc3\xac\n\xe7\x89f\x1c\x02\xdf;\x93\x8a\xef\xdf\x8b\x90\xbb\x8a\xb6gD\x85J\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n']
Bad pipe message: %s [b'\xa2\x988\xb3X\xffY\xf5\xd5o\xed']
Bad pipe message: %s [b'\x02\x06\x95\xa6\xa3 AgK7\xd4\t\xfet)Ic\xd6\x8b=\xce\xdf\xc2', b'Ut\xcf~\xec8y\rU\xf3V\xd7U\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00', b' \xca>\xe0\x90\xf17\xbf+\x14\x15P@\x80\x9c(JC)\x12\xbdc\xe

[2024-07-02 15:31:11Z] Completing processing run id 9d29965c-da53-477a-95d0-86f69eb4eacf.

Execution Summary
RunId: yellow_jelly_fdbqpw7jw6
Web View: https://ml.azure.com/runs/yellow_jelly_fdbqpw7jw6?wsid=/subscriptions/da6ec459-95c4-4f18-8440-d275df8d38b7/resourcegroups/tcc-exp-rg/workspaces/tcc-experiments

RunId: calm_whistle_kbxzzlzjp4
Web View: https://ml.azure.com/runs/calm_whistle_kbxzzlzjp4?wsid=/subscriptions/da6ec459-95c4-4f18-8440-d275df8d38b7/resourcegroups/tcc-exp-rg/workspaces/tcc-experiments

Streaming logs/azureml/executionlogs.txt

[2024-07-02 15:31:20Z] Completing processing run id 76f71ce4-c4bc-4138-9073-a50b7b7d6eac.
[2024-07-02 15:31:21Z] Completing processing run id 341578c1-0d8c-4cb7-930f-0a0ff2eae8a2.
[2024-07-02 15:31:22Z] Submitting 1 runs, first five are: 1fb77cf9:ddd39daf-f58c-43d1-bb0e-d5f6c86ca487
[2024-07-02 15:35:01Z] Completing processing run id ddd39daf-f58c-43d1-bb0e-d5f6c86ca487.

Execution Summary
RunId: calm_whistle_kbxzzlzjp4
Web View: https://ml.