In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from pipeline_utils import (
    COMPUTE, CONCURRENT_TRIALS, DELAY_EVALUATION, EVALUATION_INTERVAL, GOAL, METRIC, SAMPLING_ALGORITHM, TIMEOUT, TIMEOUT_PLUS, TOTAL_TRIALS, #SWEEP
    SUBSCRIPTION, RESOURCE_GROUP, WS_NAME,  #AUTHENTICATE
    NBC_PATH,    #COMPONENTS PATHS
    GINI_PATH, INFOGAIN_PATH, PEARSON_PATH, SPEARMAN_PATH,    #FEAT SEL PATHS
    PREP_DATA_PATH,  #PREP DATA PATH
    NBC_BY_GINI, NBC_BY_INFOGAIN, NBC_BY_PEARSON, NBC_BY_SPEARMAN   #PIPELINES
)

# Build the credential object that is handed to the workspace client below.
credential = DefaultAzureCredential()

# Workspace coordinates, all imported from pipeline_utils.
workspace_scope = {
    "subscription_id": SUBSCRIPTION,
    "resource_group_name": RESOURCE_GROUP,
    "workspace_name": WS_NAME,
}

# Handle to the workspace; every later cell talks to Azure ML through it.
ml_client = MLClient(credential=credential, **workspace_scope)

In [2]:
# CSV file names of the training / test data; the part before the first "."
# is used as the data-asset name when querying the workspace.
TRAIN_DATAS = [
    "vrex_encoded_tf_idf_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017.csv"
]

TEST_DATAS = [
    "vrex_encoded_tf_idf_2018_2019_2020_2021.csv"
]

# Asset version requested for every lookup.
version = "v1"

# Fetched data assets, in the same order as the name lists above.
arr_data_to_train = []
arr_data_to_test = []

# Train and test names are paired positionally; zip stops at the shorter list.
for to_train, to_test in zip(TRAIN_DATAS, TEST_DATAS):
    train_asset_name = to_train.split(".")[0]
    test_asset_name = to_test.split(".")[0]

    data_to_train = ml_client.data.get(name=train_asset_name, version=version)
    arr_data_to_train.append(data_to_train)
    print(f"Data to train asset URI: {data_to_train.path} - name: {train_asset_name}")

    data_to_test = ml_client.data.get(name=test_asset_name, version=version)
    arr_data_to_test.append(data_to_test)
    print(f"Data to test asset URI: {data_to_test.path} - name: {test_asset_name}")

Data to train asset URI: azureml://subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests/datastores/workspaceblobstore/paths/LocalUpload/3cfdebd258ea2db7363c55bb841ca887/vrex_encoded_tf_idf_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017.csv - name: vrex_encoded_tf_idf_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017
Data to test asset URI: azureml://subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests/datastores/workspaceblobstore/paths/LocalUpload/3992df39ce41e34367e629d6656f4e9f/vrex_encoded_tf_idf_2018_2019_2020_2021.csv - name: vrex_encoded_tf_idf_2018_2019_2020_2021


In [3]:
# importing the Component Package
from azure.ai.ml import load_component


def _register_component(source):
    """Load the component found at ``source`` and hand it to ``ml_client.create_or_update``.

    Returns whatever ``create_or_update`` returns for that component.
    """
    return ml_client.create_or_update(load_component(source=source))


# Data preparation step.
data_prep_component = _register_component(PREP_DATA_PATH)

# Feature-selection steps, one per ranking method.
gini_feat_sel_component = _register_component(GINI_PATH)
infogain_feat_sel_component = _register_component(INFOGAIN_PATH)
spearman_feat_sel_component = _register_component(SPEARMAN_PATH)
pearson_feat_sel_component = _register_component(PEARSON_PATH)

# Training step.
train_nbc = _register_component(NBC_PATH)

In [4]:

# the dsl decorator tells the sdk that we are defining an Azure Machine Learning pipeline
from azure.ai.ml import dsl, Input, Output
from azure.ai.ml.sweep import Choice, Uniform, MedianStoppingPolicy

In [5]:
# Pipeline chaining three components: data_prep_component ->
# gini_feat_sel_component -> train_nbc.
#
# Parameters:
#   data_to_train, data_to_test: forwarded unchanged to data_prep_component.
#   feature_quantity: forwarded unchanged to gini_feat_sel_component.
#
# The function has no return statement, so it declares no pipeline-level
# outputs of its own.
#
# NOTE(review): "data_perp" in the description looks like a typo for
# "data_prep"; it is a runtime string and is left untouched here.
@dsl.pipeline(
    name=NBC_BY_GINI,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_nbc_by_gini_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
):

    # Step 1: data preparation. Both removal flags are off and the percentage
    # threshold is 0 -- presumably this leaves rows/columns unfiltered; confirm
    # against the component definition.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=False,
        flag_remove_values_by_percentage=False,
        percentage_to_remove_column=0,
    )

    # Step 2: feature selection on the prepared train/test outputs.
    feat_sel_job = gini_feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training on the feature-selected outputs. The result is bound to
    # a name but not read again in this function.
    # NOTE(review): the azure.ai.ml DSL is believed to derive node names from
    # these local variable names -- confirm before renaming or removing them.
    train_nbc_job = train_nbc(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel,  
    )

In [6]:


# Submit the Gini pipeline, then stream the job's logs into the cell output.
#
# The input assets are read from arr_data_to_train / arr_data_to_test (the
# lists filled by the data-asset lookup loop) instead of that loop's leftover
# iteration variables. [-1] selects the last fetched pair, which is exactly
# what the leaked loop variables held, but it cannot silently pick up a stale
# value from an earlier run of the loop.
pipeline_job = ml_client.jobs.create_or_update(
    train_nbc_by_gini_pipeline(
        data_to_train=Input(type="uri_file", path=arr_data_to_train[-1].path),
        data_to_test=Input(type="uri_file", path=arr_data_to_test[-1].path),
        feature_quantity=20,
    ),
    experiment_name="train_nbc_by_gini_pipeline",
)

ml_client.jobs.stream(pipeline_job.name)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


RunId: calm_toe_yx9ghgqcjt
Web View: https://ml.azure.com/runs/calm_toe_yx9ghgqcjt?wsid=/subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests

Streaming logs/azureml/executionlogs.txt

[2024-06-16 19:32:22Z] Submitting 1 runs, first five are: 4ffa5437:9176a18e-5878-4c30-8e5c-dd9833793c3b
[2024-06-16 19:32:24Z] Completing processing run id 9176a18e-5878-4c30-8e5c-dd9833793c3b.
[2024-06-16 19:32:24Z] Submitting 1 runs, first five are: ee92fee0:fc996b13-ad30-49a0-836c-fd52bb5fa5b6
[2024-06-16 19:36:44Z] Completing processing run id fc996b13-ad30-49a0-836c-fd52bb5fa5b6.
[2024-06-16 19:36:45Z] Submitting 1 runs, first five are: 393defe3:e4a7b528-a5fa-4388-ba6a-c5e42cb79af5
[2024-06-16 19:42:29Z] Completing processing run id e4a7b528-a5fa-4388-ba6a-c5e42cb79af5.

Execution Summary
RunId: calm_toe_yx9ghgqcjt
Web View: https://ml.azure.com/runs/calm_toe_yx9ghgqcjt?wsid=/subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests

In [7]:
# Pipeline chaining three components: data_prep_component ->
# infogain_feat_sel_component -> train_nbc.
#
# Parameters:
#   data_to_train, data_to_test: forwarded unchanged to data_prep_component.
#   feature_quantity: forwarded unchanged to infogain_feat_sel_component.
#
# The function has no return statement, so it declares no pipeline-level
# outputs of its own.
#
# NOTE(review): "data_perp" in the description looks like a typo for
# "data_prep"; it is a runtime string and is left untouched here.
@dsl.pipeline(
    name=NBC_BY_INFOGAIN,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_nbc_by_infogain_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
):

    # Step 1: data preparation. Both removal flags are off and the percentage
    # threshold is 0 -- presumably this leaves rows/columns unfiltered; confirm
    # against the component definition.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=False,
        flag_remove_values_by_percentage=False,
        percentage_to_remove_column=0,
    )

    # Step 2: feature selection on the prepared train/test outputs.
    feat_sel_job = infogain_feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training on the feature-selected outputs. The result is bound to
    # a name but not read again in this function.
    # NOTE(review): the azure.ai.ml DSL is believed to derive node names from
    # these local variable names -- confirm before renaming or removing them.
    train_nbc_job = train_nbc(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel,  
    )

In [8]:


# Submit the information-gain pipeline, then stream the job's logs into the
# cell output.
#
# The input assets are read from arr_data_to_train / arr_data_to_test (the
# lists filled by the data-asset lookup loop) instead of that loop's leftover
# iteration variables. [-1] selects the last fetched pair, which is exactly
# what the leaked loop variables held, but it cannot silently pick up a stale
# value from an earlier run of the loop.
pipeline_job = ml_client.jobs.create_or_update(
    train_nbc_by_infogain_pipeline(
        data_to_train=Input(type="uri_file", path=arr_data_to_train[-1].path),
        data_to_test=Input(type="uri_file", path=arr_data_to_test[-1].path),
        feature_quantity=20,
    ),
    experiment_name="train_nbc_by_infogain_pipeline",
)

ml_client.jobs.stream(pipeline_job.name)

RunId: jolly_whistle_yvp85y38y3
Web View: https://ml.azure.com/runs/jolly_whistle_yvp85y38y3?wsid=/subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests

Streaming logs/azureml/executionlogs.txt

[2024-06-16 19:42:50Z] Completing processing run id 729373b8-11df-44cb-82db-263aafc7d019.
[2024-06-16 19:42:51Z] Submitting 1 runs, first five are: c1f37c15:2b2e9565-19ca-40de-941b-dade6fe74464
[2024-06-16 19:47:20Z] Completing processing run id 2b2e9565-19ca-40de-941b-dade6fe74464.
[2024-06-16 19:47:21Z] Submitting 1 runs, first five are: c1050188:f65fc70b-d832-43f8-a4ae-5e03cd4fa0a9
[2024-06-16 19:48:38Z] Completing processing run id f65fc70b-d832-43f8-a4ae-5e03cd4fa0a9.

Execution Summary
RunId: jolly_whistle_yvp85y38y3
Web View: https://ml.azure.com/runs/jolly_whistle_yvp85y38y3?wsid=/subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests



In [12]:
# Pipeline chaining three components: data_prep_component ->
# pearson_feat_sel_component -> train_nbc.
#
# Parameters:
#   data_to_train, data_to_test: forwarded unchanged to data_prep_component.
#   feature_quantity: forwarded unchanged to pearson_feat_sel_component.
#
# The function has no return statement, so it declares no pipeline-level
# outputs of its own.
#
# NOTE(review): "data_perp" in the description looks like a typo for
# "data_prep"; it is a runtime string and is left untouched here.
@dsl.pipeline(
    name=NBC_BY_PEARSON,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_nbc_by_pearson_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
):

    # Step 1: data preparation. Both removal flags are off and the percentage
    # threshold is 0 -- presumably this leaves rows/columns unfiltered; confirm
    # against the component definition.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=False,
        flag_remove_values_by_percentage=False,
        percentage_to_remove_column=0,
    )

    # Step 2: feature selection on the prepared train/test outputs.
    feat_sel_job = pearson_feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training on the feature-selected outputs. The result is bound to
    # a name but not read again in this function.
    # NOTE(review): the azure.ai.ml DSL is believed to derive node names from
    # these local variable names -- confirm before renaming or removing them.
    train_nbc_job = train_nbc(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel,  
    )

In [13]:


# Submit the Pearson pipeline, then stream the job's logs into the cell output.
#
# The input assets are read from arr_data_to_train / arr_data_to_test (the
# lists filled by the data-asset lookup loop) instead of that loop's leftover
# iteration variables. [-1] selects the last fetched pair, which is exactly
# what the leaked loop variables held, but it cannot silently pick up a stale
# value from an earlier run of the loop.
pipeline_job = ml_client.jobs.create_or_update(
    train_nbc_by_pearson_pipeline(
        data_to_train=Input(type="uri_file", path=arr_data_to_train[-1].path),
        data_to_test=Input(type="uri_file", path=arr_data_to_test[-1].path),
        feature_quantity=20,
    ),
    experiment_name="train_nbc_by_pearson_pipeline",
)

ml_client.jobs.stream(pipeline_job.name)

RunId: busy_farm_941d7ttnld
Web View: https://ml.azure.com/runs/busy_farm_941d7ttnld?wsid=/subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests

Streaming logs/azureml/executionlogs.txt

[2024-06-16 20:04:12Z] Completing processing run id 612732f6-9fbf-499d-88d0-cdea01352d6c.
[2024-06-16 20:04:12Z] Submitting 1 runs, first five are: 76a7cf21:3355abaf-1115-44b8-964b-6596576d0c2a
[2024-06-16 20:08:55Z] Completing processing run id 3355abaf-1115-44b8-964b-6596576d0c2a.
[2024-06-16 20:08:56Z] Submitting 1 runs, first five are: 48db523f:7dca697b-118c-4c7a-8e03-a4cba66b8c74
[2024-06-16 20:13:24Z] Completing processing run id 7dca697b-118c-4c7a-8e03-a4cba66b8c74.

Execution Summary
RunId: busy_farm_941d7ttnld
Web View: https://ml.azure.com/runs/busy_farm_941d7ttnld?wsid=/subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests



In [14]:
# Pipeline chaining three components: data_prep_component ->
# spearman_feat_sel_component -> train_nbc.
#
# Parameters:
#   data_to_train, data_to_test: forwarded unchanged to data_prep_component.
#   feature_quantity: forwarded unchanged to spearman_feat_sel_component.
#
# The function has no return statement, so it declares no pipeline-level
# outputs of its own.
#
# NOTE(review): "data_perp" in the description looks like a typo for
# "data_prep"; it is a runtime string and is left untouched here.
@dsl.pipeline(
    name=NBC_BY_SPEARMAN,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_nbc_by_spearman_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
):

    # Step 1: data preparation. Both removal flags are off and the percentage
    # threshold is 0 -- presumably this leaves rows/columns unfiltered; confirm
    # against the component definition.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=False,
        flag_remove_values_by_percentage=False,
        percentage_to_remove_column=0,
    )

    # Step 2: feature selection on the prepared train/test outputs.
    feat_sel_job = spearman_feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: training on the feature-selected outputs. The result is bound to
    # a name but not read again in this function.
    # NOTE(review): the azure.ai.ml DSL is believed to derive node names from
    # these local variable names -- confirm before renaming or removing them.
    train_nbc_job = train_nbc(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel,  
    )

In [15]:


# Submit the Spearman pipeline, then stream the job's logs into the cell output.
#
# The input assets are read from arr_data_to_train / arr_data_to_test (the
# lists filled by the data-asset lookup loop) instead of that loop's leftover
# iteration variables. [-1] selects the last fetched pair, which is exactly
# what the leaked loop variables held, but it cannot silently pick up a stale
# value from an earlier run of the loop.
pipeline_job = ml_client.jobs.create_or_update(
    train_nbc_by_spearman_pipeline(
        data_to_train=Input(type="uri_file", path=arr_data_to_train[-1].path),
        data_to_test=Input(type="uri_file", path=arr_data_to_test[-1].path),
        feature_quantity=20,
    ),
    experiment_name="train_nbc_by_spearman_pipeline",
)

ml_client.jobs.stream(pipeline_job.name)

RunId: bold_hat_26xjtjltqj
Web View: https://ml.azure.com/runs/bold_hat_26xjtjltqj?wsid=/subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests

Streaming logs/azureml/executionlogs.txt

[2024-06-16 20:13:42Z] Completing processing run id 6e6a26dc-ea62-4342-b6ab-99972646999b.
[2024-06-16 20:13:43Z] Submitting 1 runs, first five are: eb5798b3:3b96d6b1-9a66-4048-aaa2-b9491141455f
[2024-06-16 20:14:43Z] Completing processing run id 3b96d6b1-9a66-4048-aaa2-b9491141455f.
[2024-06-16 20:14:43Z] Submitting 1 runs, first five are: fa37d698:262b83f8-1a7f-4310-bfc0-795f9a444255
[2024-06-16 20:15:54Z] Completing processing run id 262b83f8-1a7f-4310-bfc0-795f9a444255.

Execution Summary
RunId: bold_hat_26xjtjltqj
Web View: https://ml.azure.com/runs/bold_hat_26xjtjltqj?wsid=/subscriptions/1f51a68b-69d8-4818-bf63-7c5e10b81967/resourcegroups/luiz.victor.dev-rg/workspaces/tcc-tests

