In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Coordinates of the Azure ML workspace this notebook talks to.
SUBSCRIPTION = "2944a580-6c5f-4258-8c86-b9c5de957998"
RESOURCE_GROUP = "calebebraga08-rg"
WS_NAME = "tcc-experiments"

# Credential object handed to the workspace client below.
credential = DefaultAzureCredential()

# Keyword arguments that identify the workspace, kept together for readability.
workspace_scope = dict(
    subscription_id=SUBSCRIPTION,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WS_NAME,
)

# Handle used by every later cell to reach the workspace.
ml_client = MLClient(credential=credential, **workspace_scope)

In [2]:
# Sanity check: fetch the workspace through the handle created above.
# If you get an error here, fix SUBSCRIPTION, RESOURCE_GROUP, and WS_NAME in the previous cell.
ws = ml_client.workspaces.get(WS_NAME)
print(f"{ws.location} : {ws.resource_group}")

eastus2 : calebebraga08-rg


In [3]:
# File names of the training sets; TRAIN_DATAS[i] is paired with TEST_DATAS[i].
TRAIN_DATAS = [
    "vrex_2009_2010_2011_2012_2013_.csv",
    "vrex_2014_2015_2016_2017_2018_.csv",
]

# File names of the test sets, in the same order as TRAIN_DATAS.
TEST_DATAS = [
    "vrex_2014.csv",
    "vrex_2019.csv",
]

# Version label requested for every data asset looked up below.
version = "original"

arr_data_to_train = []
arr_data_to_test = []

for to_train, to_test in zip(TRAIN_DATAS, TEST_DATAS):
    # The asset name is the file name up to its first dot.
    train_asset_name = to_train.split(".")[0]
    test_asset_name = to_test.split(".")[0]

    data_to_train = ml_client.data.get(name=train_asset_name, version=version)
    arr_data_to_train.append(data_to_train)
    print(f"Data to train asset URI: {data_to_train.path} - name: {train_asset_name}")

    data_to_test = ml_client.data.get(name=test_asset_name, version=version)
    arr_data_to_test.append(data_to_test)
    print(f"Data to test asset URI: {data_to_test.path} - name: {test_asset_name}")

Data to train asset URI: azureml://subscriptions/2944a580-6c5f-4258-8c86-b9c5de957998/resourcegroups/calebebraga08-rg/workspaces/tcc-experiments/datastores/workspaceblobstore/paths/LocalUpload/20c8eaed3e0076d5623ceabb00067339/vrex_2009_2010_2011_2012_2013_.csv - name: vrex_2009_2010_2011_2012_2013_
Data to test asset URI: azureml://subscriptions/2944a580-6c5f-4258-8c86-b9c5de957998/resourcegroups/calebebraga08-rg/workspaces/tcc-experiments/datastores/workspaceblobstore/paths/LocalUpload/f4ecea294ba5aa10eadec0cdaf59ac5c/vrex_2014.csv - name: vrex_2014
Data to train asset URI: azureml://subscriptions/2944a580-6c5f-4258-8c86-b9c5de957998/resourcegroups/calebebraga08-rg/workspaces/tcc-experiments/datastores/workspaceblobstore/paths/LocalUpload/38bbb10032af817b2a7427008e191128/vrex_2014_2015_2016_2017_2018_.csv - name: vrex_2014_2015_2016_2017_2018_
Data to test asset URI: azureml://subscriptions/2944a580-6c5f-4258-8c86-b9c5de957998/resourcegroups/calebebraga08-rg/workspaces/tcc-experiments

# Load Components

In [4]:
# importing the Component Package
from azure.ai.ml import load_component

# YAML spec describing the data-preparation component.
data_prep_yaml_path = "/home/azureuser/cloudfiles/code/Users/calebebraga08/tcc_experiments_az_ml/src/components/data_prep.yaml"

# Build the component object from its YAML spec.
data_prep_component = load_component(source=data_prep_yaml_path)

# Create (register) the component in the workspace; the name is rebound to
# whatever create_or_update returns, which is what later cells use.
data_prep_component = ml_client.create_or_update(data_prep_component)

# Report the name and version of the returned component.
print(f"Component {data_prep_component.name} with Version {data_prep_component.version} is registered")

[32mUploading components (0.01 MBs): 100%|██████████| 12051/12051 [00:00<00:00, 87996.37it/s]
[39m



Component data_prep_vrex_defaults_model with Version 2024-04-06-22-37-50-7203382 is registered


In [5]:
# importing the Component Package
from azure.ai.ml import load_component

# YAML spec describing the training component.
train_yaml_path = "/home/azureuser/cloudfiles/code/Users/calebebraga08/tcc_experiments_az_ml/src/components/train.yaml"

# Build the component object from its YAML spec.
train_component = load_component(source=train_yaml_path)

# Create (register) the component in the workspace; the name is rebound to
# whatever create_or_update returns, which is what later cells use.
train_component = ml_client.create_or_update(train_component)

# Report the name and version of the returned component.
print(f"Component {train_component.name} with Version {train_component.version} is registered")

Component train_vrex_defaults_model with Version 2024-04-06-22-37-52-3483919 is registered


In [8]:
# the dsl decorator tells the sdk that we are defining an Azure Machine Learning pipeline
from azure.ai.ml import dsl, Input, Output


# Two-step pipeline definition: run data_prep_component, then feed its
# train_data/test_data outputs into train_component.
#
# Pipeline inputs:
#   data_to_train, data_to_test        - forwarded to the data-prep step.
#   flag_remove_null_values            - forwarded to the data-prep step.
#   flag_remove_values_by_percentage   - forwarded to the data-prep step.
#   percentage_to_remove_column        - forwarded to the data-prep step.
#   learning_rate_to_train             - forwarded to the train step as learning_rate.
#
# The function has no return statement, so no pipeline-level outputs are declared here.
# NOTE(review): explanation is kept in comments rather than a docstring or type
# annotations because the dsl decorator inspects this function — confirm before
# adding either. The local names data_prep_job/train_job may also be picked up
# by the SDK as step names — TODO confirm before renaming them.
@dsl.pipeline(
    compute="serverless",
    description="E2E data_perp-train pipeline",
)
def prep_data_and_train_defaults_pipeline(
    data_to_train,
    data_to_test,
    flag_remove_null_values,
    learning_rate_to_train,
    flag_remove_values_by_percentage,
    percentage_to_remove_column
):

    # Step 1: data preparation on the raw train/test inputs.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=flag_remove_null_values,
        flag_remove_values_by_percentage=flag_remove_values_by_percentage,
        percentage_to_remove_column=percentage_to_remove_column,
    )

    # Step 2: training, consuming the outputs of step 1.
    train_job = train_component(
        train_data=data_prep_job.outputs.train_data,  # note: using outputs from previous step
        test_data=data_prep_job.outputs.test_data,  # note: using outputs from previous step
        learning_rate=learning_rate_to_train,  # note: using a pipeline input as parameter
    )


In [9]:
# Values passed as percentage_to_remove_column, one pipeline per value and data pair.
percentages = [0.5, 0.6, 0.7, 0.8, 0.85, 0.90, 0.95, 0.97]

# Build one pipeline per (percentage, train/test pair). The percentage varies
# slowest, so consecutive entries share a percentage and differ in data pair.
pipelines = [
    prep_data_and_train_defaults_pipeline(
        data_to_train=Input(type="uri_file", path=train_asset.path),
        data_to_test=Input(type="uri_file", path=test_asset.path),
        flag_remove_null_values=True,
        learning_rate_to_train=0.05,
        flag_remove_values_by_percentage=True,
        percentage_to_remove_column=threshold,
    )
    for threshold in percentages
    for train_asset, test_asset in zip(arr_data_to_train, arr_data_to_test)
]

print("Quantidade de pipelines: ", len(pipelines))

Quantidade de pipelines:  16


In [13]:
import datetime as dt

def _get_experiment_names() -> [str]:
    experiment_names = []
    for percentage in percentages:
        for train_name, test_name in zip(TRAIN_DATAS, TEST_DATAS):
            current_time = dt.datetime.now()
            formatted_time = current_time.strftime("%Y_%m_%d_%H_%M_%S")  # Formata a data e hora atual
            train_name_base = train_name.split('.')[0]
            test_name_base = test_name.split('.')[0]
            name = f"{train_name_base}tested_{test_name_base}_executed_in_{formatted_time}_removed_null_values_by_percentage_{str(percentage)}"
            experiment_names.append(name)
            print(name)
    return experiment_names


In [14]:
# Preview only: prints the generated names and displays the returned list; nothing is submitted here.
_get_experiment_names()

vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.5
vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.5
vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.6
vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.6
vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.7
vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.7
vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.8
vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.8
vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_

['vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.5',
 'vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.5',
 'vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.6',
 'vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.6',
 'vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.7',
 'vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.7',
 'vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.8',
 'vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_40_52_removed_null_values_by_percentage_0.8',
 'vrex_2009_2010_2011_2012_2013_

In [15]:
# Fresh names for this submission batch (pipelines[i] goes with experiment_names[i]).
experiment_names = _get_experiment_names()

jobs_client = ml_client.jobs

for pipeline, experiment_name in zip(pipelines, experiment_names):
    # Submit the pipeline under its own experiment name ...
    pipeline_job = jobs_client.create_or_update(pipeline, experiment_name=experiment_name)

    # ... then stream that job's logs before handling the next one.
    jobs_client.stream(pipeline_job.name)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_42_26_removed_null_values_by_percentage_0.5
vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_42_26_removed_null_values_by_percentage_0.5
vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_42_26_removed_null_values_by_percentage_0.6
vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_42_26_removed_null_values_by_percentage_0.6
vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_42_26_removed_null_values_by_percentage_0.7
vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_42_26_removed_null_values_by_percentage_0.7
vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_04_06_22_42_26_removed_null_values_by_percentage_0.8
vrex_2014_2015_2016_2017_2018_tested_vrex_2019_executed_in_2024_04_06_22_42_26_removed_null_values_by_percentage_0.8
vrex_2009_2010_2011_2012_2013_tested_vrex_2014_executed_in_2024_

HttpResponseError: (BadRequest) Create job jolly_pasta_ggfqy7qxhq failed due to error: Response status code does not indicate success: 400 (Experiment name must be 1-256 characters, start with a letter or a number, and can only contain lett).
Microsoft.RelInfra.Common.Exceptions.ErrorResponseException: Experiment name must be 1-256 characters, start with a letter or a number, and can only contain letters, numbers, underscores, and dashes.
Code: BadRequest
Message: Create job jolly_pasta_ggfqy7qxhq failed due to error: Response status code does not indicate success: 400 (Experiment name must be 1-256 characters, start with a letter or a number, and can only contain lett).
Microsoft.RelInfra.Common.Exceptions.ErrorResponseException: Experiment name must be 1-256 characters, start with a letter or a number, and can only contain letters, numbers, underscores, and dashes.
Additional Information:Type: ComponentName
Info: {
    "value": "managementfrontend"
}Type: Correlation
Info: {
    "value": {
        "operation": "3bb60966d660c67506adcd0533a1c77e",
        "request": "3f4910dded9db243"
    }
}Type: Environment
Info: {
    "value": "eastus2"
}Type: Location
Info: {
    "value": "eastus2"
}Type: Time
Info: {
    "value": "2024-04-06T22:42:30.7872633+00:00"
}