In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from pipeline_utils import (
    COMPUTE, CONCURRENT_TRIALS, DELAY_EVALUATION, EVALUATION_INTERVAL, GOAL, METRIC, SAMPLING_ALGORITHM, TIMEOUT, TIMEOUT_PLUS, TOTAL_TRIALS, #SWEEP
    SUBSCRIPTION, RESOURCE_GROUP, WS_NAME,  #AUTHENTICATE
    GBC_PATH, NBC_PATH, RFC_PATH, SVC_PATH, XGB_PATH, MULT_NBC_PATH,    #COMPONENTS PATHS
    GINI_PATH,    #FEAT SEL PATHS
    PREP_DATA_PATH,  #PREP DATA PATH
    GBC_BY_GINI, NBC_BY_GINI, RFC_BY_GINI, SVC_BY_GINI, XGB_BY_GINI, MULT_NBC_BY_GINI,   #PIPELINES
    GINI,   #FEAT SEL
    N_FEATURES, TRAIN_DATA, TEST_DATA, DATA_VERSION, #PIPELINE VALUES
    get_experiment_names,   #FUNCTIONS
)

# Build the credential object used to authenticate against Azure.
credential = DefaultAzureCredential()

# Coordinates of the target workspace, taken from the pipeline_utils constants.
workspace_scope = {
    "subscription_id": SUBSCRIPTION,
    "resource_group_name": RESOURCE_GROUP,
    "workspace_name": WS_NAME,
}

# Client handle used by every later cell to talk to the workspace.
ml_client = MLClient(credential=credential, **workspace_scope)

ModuleNotFoundError: No module named 'pipeline_utils'

In [None]:
# The registered asset name is the file name up to its first dot.
train_asset_name = TRAIN_DATA.split('.')[0]
data_to_train = ml_client.data.get(name=train_asset_name, version=DATA_VERSION)
print(f"Data to train asset URI: {data_to_train.path} - name: {train_asset_name}")

# Same lookup for the test split, using the same DATA_VERSION.
test_asset_name = TEST_DATA.split('.')[0]
data_to_test = ml_client.data.get(name=test_asset_name, version=DATA_VERSION)
print(f"Data to test asset URI: {data_to_test.path} - name: {test_asset_name}")

In [None]:
# Component loader from the Azure ML SDK.
from azure.ai.ml import load_component


def _register_component(source_path):
    """Load the component definition at ``source_path`` and submit it to the workspace.

    Returns the object handed back by ``ml_client.create_or_update`` for the
    loaded component.
    """
    return ml_client.create_or_update(load_component(source=source_path))


# Registered in the same order as before: data prep, feature selection, XGBoost trainer.
data_prep_component = _register_component(PREP_DATA_PATH)
feat_sel_component = _register_component(GINI_PATH)
train_xgb = _register_component(XGB_PATH)


In [None]:

# the dsl decorator tells the sdk that we are defining an Azure Machine Learning pipeline
from azure.ai.ml import dsl, Input, Output
from azure.ai.ml.sweep import Choice, Uniform, MedianStoppingPolicy

In [None]:
# Pipeline definition: data preparation -> feature selection -> XGBoost
# hyperparameter sweep. The pipeline is named XGB_BY_GINI and runs on COMPUTE.
#
# Parameters (each is forwarded unchanged to a component below):
#   data_to_train, data_to_test        -> data_prep_component
#   flag_remove_null_values,
#   flag_remove_values_by_percentage,
#   percentage_to_remove_column        -> data_prep_component
#   feature_quantity                   -> feat_sel_component
#
# The function has no return statement.
# NOTE(review): presumably this means the pipeline exposes no pipeline-level
# outputs -- confirm that is intended.
# NOTE(review): documentation is kept in `#` comments (no docstring, no type
# annotations) because the azure-ai-ml DSL may read those, and the local
# variable names, when building the pipeline -- confirm before changing them.
# NOTE(review): "data_perp" in the description below looks like a typo for
# "data_prep".
@dsl.pipeline(
    name=XGB_BY_GINI,
    compute=COMPUTE,
    description="E2E data_perp-train pipeline",
)
def train_xgb_pipeline(
    data_to_train,
    data_to_test,
    feature_quantity,
    flag_remove_null_values,
    flag_remove_values_by_percentage,
    percentage_to_remove_column,
):

    # Step 1: data preparation on the raw train/test inputs.
    data_prep_job = data_prep_component(
        data_to_train=data_to_train,
        data_to_test=data_to_test,
        flag_remove_null_values=flag_remove_null_values,
        flag_remove_values_by_percentage=flag_remove_values_by_percentage,
        percentage_to_remove_column=percentage_to_remove_column,
    )

    # Step 2: feature selection, fed by the train/test outputs of step 1.
    feat_sel_job = feat_sel_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
        feature_quantity=feature_quantity,
    )

    # Step 3: XGBoost training on the feature-selected data. The two
    # hyperparameters are given as search-space expressions (a discrete Choice
    # and a continuous Uniform range) rather than fixed values, to be swept below.
    train_xgb_job = train_xgb(
        train_data=feat_sel_job.outputs.train_data_feat_sel,  
        test_data=feat_sel_job.outputs.test_data_feat_sel,   
        n_estimators_to_xgb=Choice(values=[100, 500, 1000]),
        learning_rate_to_xgb=Uniform(min_value=0.01, max_value=0.3),
    )

    # Turn the training step into a sweep step; sampling algorithm, metric and
    # goal all come from the pipeline_utils constants.
    sweep_step_to_xgb = train_xgb_job.sweep(
        compute=COMPUTE,
        sampling_algorithm=SAMPLING_ALGORITHM,
        primary_metric=METRIC,
        goal=GOAL,
    )

    # Sweep budget: twice TOTAL_TRIALS, with TIMEOUT_PLUS instead of TIMEOUT.
    # TODO: document why this pipeline gets the doubled trial count and the
    # longer timeout -- the reason is not visible in this notebook.
    sweep_step_to_xgb.set_limits(max_total_trials=2*TOTAL_TRIALS, max_concurrent_trials=CONCURRENT_TRIALS, timeout=TIMEOUT_PLUS)
    # Early termination uses a median stopping policy configured from
    # DELAY_EVALUATION and EVALUATION_INTERVAL.
    sweep_step_to_xgb.early_termination = MedianStoppingPolicy(delay_evaluation=DELAY_EVALUATION, evaluation_interval=EVALUATION_INTERVAL)

In [None]:

# Wrap the registered data assets as file inputs for the pipeline.
train_input = Input(type="uri_file", path=data_to_train.path)
test_input = Input(type="uri_file", path=data_to_test.path)

# Instantiate the pipeline with the settings for this run.
pipeline = train_xgb_pipeline(
    data_to_train=train_input,
    data_to_test=test_input,
    feature_quantity=200,
    flag_remove_null_values=False,
    flag_remove_values_by_percentage=False,
    percentage_to_remove_column=0,
)

# Submit the job under the "testing_xgb" experiment, then stream its logs by job name.
pipeline_job = ml_client.jobs.create_or_update(pipeline, experiment_name="testing_xgb")
ml_client.jobs.stream(pipeline_job.name)