##### LIBRERIAS NECESARIAS PARA LA ELABORACION DEL PIPELINE

In [1]:
from azure.ai.ml import load_component
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import MLClient, Input, command, Output
from azure.identity import DefaultAzureCredential, EnvironmentCredential
from azure.ai.ml.entities import AmlCompute
import os
import pandas as pd
from azure.ai.ml.entities import Environment

##### CREACION DEL CLUSTER PARA EJECUTAR EL PIPELINE

EL CLUSTER ELEGIDO (Standard_DS2_v2) ES UNO DE PROPOSITO GENERAL, SUFICIENTE PARA ARCHIVOS PEQUEÑOS.

In [2]:
def get_comput_target(ml_client, name="cpu-cluster", family='Standard_DS2_v2'):
    """Return the compute cluster called ``name``, creating it when the lookup fails.

    Args:
        ml_client: Workspace client exposing ``compute.get`` and
            ``compute.begin_create_or_update``.
        name: Name of the compute cluster to look up or create.
        family: VM size used only when a new cluster has to be provisioned.

    Returns:
        The object returned by ``ml_client.compute.get(name)`` or, when that call
        raises, the result of the create-or-update operation.
    """
    try:
        # Reuse the cluster when the workspace already has one with this name.
        return ml_client.compute.get(name)
    except Exception:
        # The lookup failed, so provision a new cluster that can scale down to
        # zero nodes when idle.
        new_cluster = AmlCompute(
            name=name,
            type="amlcompute",
            size=family,
            min_instances=0,
            max_instances=4,
            idle_time_before_scale_down=180,
            tier="Dedicated",
        )
        # Block until provisioning finishes and hand the cluster to the caller.
        return ml_client.compute.begin_create_or_update(new_cluster).result()

#### CARGA DE CREDENCIALES DE AZURE

In [3]:
# Authenticate with the default Azure credential chain.
credential = DefaultAzureCredential()
# Build the workspace client from the local config file.
ml_client = MLClient.from_config(credential=credential)

Found the config file in: /config.json


In [4]:
# Look up (or create) the default "cpu-cluster" compute that the pipeline runs on.
compute_target = get_comput_target(ml_client)

##### DESCARGA DEL DATASET PREVIAMENTE CARGADO A DROPBOX, PARA LA UTILIZACION EN EL PIPELINE

In [9]:
# Remote archive that holds the dataset
download_url = "https://dl.dropboxusercontent.com/scl/fi/y322utp4yfc1n168ykqi8/water_potability_ds.zip?rlkey=x411mytkj6smn4yfjhr8ypk42"

# Local folder for the data (change it if you prefer another location);
# it is created when it does not exist yet.
dataset_parent_dir = "./data"
os.makedirs(dataset_parent_dir, exist_ok=True)

# Dataset name = last URL segment up to its first dot
dataset_name = os.path.basename(download_url).partition(".")[0]
# Dataset path kept for later use
dataset_dir = os.path.join(dataset_parent_dir, dataset_name)

INICIO DE LA DESCARGA DEL ARCHIVO

In [10]:
# Import the submodule explicitly: a bare ``import urllib`` does not guarantee
# that ``urllib.request`` is loaded.
import urllib.request

# Local path where the downloaded archive is stored
data_file = os.path.join(dataset_parent_dir, f"{dataset_name}.zip")

# Download the dataset archive
urllib.request.urlretrieve(download_url, filename=data_file)

('./data/water_potability_ds.zip', <http.client.HTTPMessage at 0x7f1d582b7c70>)

DESCOMPRIMIR EL ARCHIVO DESCARGADO

In [11]:
from zipfile import ZipFile

# Extract the archive into the data folder. The handle is not called ``zip`` so
# the builtin ``zip`` stays usable in later cells.
with ZipFile(data_file, "r") as archive:
    print("extracting files...")
    archive.extractall(path=dataset_parent_dir)
    print("done")

extracting files...
done


ELIMINACION DEL ARCHIVO COMPRIMIDO DESCARGADO

In [12]:
# The archive is no longer needed once its contents are extracted
os.unlink(data_file)

CREACION DEL MLTABLE PARA EL DATASET DESCARGADO

In [1]:
import mltable
from mltable import MLTableHeaders, MLTableFileEncoding, DataType

# Folder that receives the MLTable definition, and the CSV it describes
mltable_folder = "./data/"
paths = [{"file": "./data/water_potability_ds.csv"}]

# Describe the comma-delimited CSV as an MLTable, letting column types be inferred
tbl = mltable.from_delimited_files(
    paths=paths,
    header=MLTableHeaders.all_files_same_headers,
    delimiter=",",
    encoding=MLTableFileEncoding.utf8,
    include_path_column=False,
    infer_column_types=True,
)

# Write the MLTable definition to the data folder
tbl.save(mltable_folder)

##### PARA SUBIR EL DATASET AL DATASTORE DE AZURE

In [29]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml import Input

# Local CSV that backs the data asset
my_path = "./data/water_potability_ds.csv"
# Version label of the data asset
v1 = "initial"

my_data = Data(
    name="water_potability_ds",
    version=v1,
    description="1potable 0nopotable",
    path=my_path,
    type=AssetTypes.URI_FILE,
)

# Create the data asset only when this name/version cannot be fetched.
try:
    data_asset = ml_client.data.get(name=my_data.name, version=my_data.version)
except Exception:
    # Any lookup failure is treated as "asset not registered yet". Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    # Keep the returned asset so ``data_asset`` is bound on this branch as well.
    data_asset = ml_client.data.create_or_update(my_data)
    print(f"Data asset created. Name: {my_data.name}, version: {my_data.version}")
else:
    print(
        f"Data asset already exists. Name: {my_data.name}, version: {my_data.version}"
    )

Data asset created. Name: water_potability_ds, version: initial


##### OBTENER LA DIRECCION URI DEL DATASET A UTILIZAR

In [30]:
import pandas as pd

# Fetch the registered data asset and keep its URI in ``path``; the pipeline
# input built later in the notebook reads this variable.
data_asset = ml_client.data.get(name="water_potability_ds", version=v1)
path = data_asset.path
print(f"Data asset URI: {path}")

Data asset URI: azureml://subscriptions/fc14670d-c5e7-4c9f-aaa4-11ab881abb5f/resourcegroups/proyecto-2-modulo-4/workspaces/proyecto_2/datastores/workspaceblobstore/paths/LocalUpload/fb076fecbb1f1d2741801ec605e41e88/water_potability_ds.csv


##### CARGA DE COMPONENTES NECESARIOS PARA EL PROYECTO DE LA MANERA DE .YML

In [31]:
# Load every YAML-defined component in one pass; the order of the YAML paths
# matches the order of the names being bound.
(
    split_component,
    train_LR_component,
    train_DT_component,
    score_component,
    eval_component,
) = (
    load_component(source=component_yml)
    for component_yml in (
        "./split-component/split.yml",
        "./train-LR-component/train_LR.yml",
        "./train-DT-component/train_DT.yml",
        "./score-component/score.yml",
        "./eval-component/eval.yml",
    )
)

##### CREACION DE LA DIRECCION DONDE SE GUARDARA EL CONDA.YAML

In [32]:
import os

# Folder that will hold the conda.yaml written by the next cell; created if missing.
dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

##### CREACION DEL CONDA.YAML, DONDE ESTARAN LAS LIBRERIAS NECESARIAS PARA CORRER EL PIPELINE, PARA CARGARLO AL ENVIRONMENT.

In [33]:
%%writefile {dependencies_dir}/conda.yaml
# Conda specification for the custom Azure ML environment registered from this file.
name: model-env
channels:
  - conda-forge
dependencies:
  # Pinned interpreter and scientific stack
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  # Packages installed through pip
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - xlrd==2.0.1
    - azureml-mlflow==1.42.0
    # NOTE(review): seaborn is the only unpinned package; consider pinning it.
    - seaborn

Overwriting ./dependencies/conda.yaml


##### CARGA DEL ENVIRONMENT A AZURE.

In [34]:
from azure.ai.ml.entities import Environment

custom_env_name = "aml-scikit-learn"

# Describe the environment: base Docker image plus the conda.yaml written above.
# The description now names this project; it previously mentioned an unrelated
# "Credit Card Defaults" pipeline.
pipeline_job_env = Environment(
    name=custom_env_name,
    description="Custom environment for the water potability pipeline",
    tags={"scikit-learn": "0.24.2"},
    conda_file=os.path.join(dependencies_dir, "conda.yaml"),
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    version="0.1.1",
)
# Register the environment in the workspace and keep the returned entity, whose
# name and version are referenced when the processing component is defined.
pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
)



Environment with name aml-scikit-learn is registered to workspace, the environment version is 0.1.1


##### LUGAR DONDE SE CREARA EL COMPONENTE DE PROCESAMIENTO DEL DATASET

In [35]:
import os

# Source folder of the data-processing component; created if missing.
proce_data_src_dir = "./proce-data-component/proce-data_src"
os.makedirs(proce_data_src_dir, exist_ok=True)

CREACION DEL COMPONENTE DE PROCESAMIENTO DEL DATASET, POR MEDIO DE "PROGRAMMATIC DEFINITION".

SE CREO DE ESTA MANERA YA QUE ASI SE VIO LA FORMA DE UTILIZAR EL ENVIRONMENT CARGADO ANTERIORMENTE.

In [36]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Programmatic definition of the data-processing step: it takes one CSV file and
# exposes two folder outputs (the filtered data and the correlation image).
proce_data_component = command(
    name="proce_data",
    display_name="Data preparation for training",
    description="reads a .csv input, filter the input to train and test",
    # Runs inside the custom environment registered earlier in the notebook
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    # Folder that contains the component's source script
    code=proce_data_src_dir,
    inputs=dict(
        data=Input(type="uri_file"),
    ),
    outputs={
        "data_filtrado": Output(type="uri_folder", mode="rw_mount"),
        "imagen_correlacion": Output(type="uri_folder", mode="rw_mount"),
    },
    # Command line executed by the job; ${{...}} placeholders are bound to the
    # inputs and outputs declared above.
    command="""python proce-data.py \
            --data ${{inputs.data}} \
            --data_filtrado ${{outputs.data_filtrado}} --imagen_correlacion ${{outputs.imagen_correlacion}} \
            """,
)

##### CARGA DEL COMPONENTE PROCE_DATA AL ML_CLIENT.

In [37]:
# Register the component in the workspace. On the first run the name holds the
# command builder, whose component is exposed as ``.component``. On a re-run it
# already holds the registered component, so fall back to the object itself
# instead of failing with AttributeError.
proce_data_component = ml_client.create_or_update(
    getattr(proce_data_component, "component", proce_data_component)
)

###### COMANDO PARA ENLAZAR LOS COMPONENTES Y OBTENER LAS SALIDAS NECESARIAS, COMO SER: LA IMAGEN DE LA CORRELACION DEL COMPONENTE "PROCE_DATA_COMPONENT", EL MODELO ENTRENADO DE CADA UNO DE LOS DOS MODELOS Y LAS METRICAS DE CADA MODELO.

In [38]:
# Define a pipeline with 8 nodes: data preparation, train/test split, two training
# nodes (train_LR_component and train_DT_component) and one score + eval pair per
# trained model. Every node runs on the 'cpu-cluster' compute by default.
# NOTE(review): documentation is kept in comments rather than a docstring because
# the DSL may use the function docstring as the pipeline description - confirm.
@pipeline(
    default_compute='cpu-cluster',
)

def water_potability_decision_tree_dummy(pipeline_input_data):

    # Filter the raw input; this node also produces the correlation image output
    data_filter = proce_data_component(
        data = pipeline_input_data
    )

    # Split the filtered data, passing 0.8 as the train ratio
    split_node = split_component(
        data_filtrado = data_filter.outputs.data_filtrado,
        split_ratio_train = 0.8
    )

    
    # First model: trained by train_LR_component on the training split
    train_lr_node = train_LR_component(
        X_train=split_node.outputs.X_train,
        y_train=split_node.outputs.y_train
    )

    # Second model: trained by train_DT_component with explicit hyper-parameters
    train_dt_node = train_DT_component(
        X_train=split_node.outputs.X_train,
        y_train=split_node.outputs.y_train,
        criterion = "entropy",
        min_samples_split = 3,
        max_depth = 4

    )
    
    # NOTE(review): a per-node compute override (gpu_compute_target) was left
    # disabled here; no such target is defined in the visible notebook.

    # Score and evaluate the first model on the test split
    score_node = score_component(
        model_input=train_lr_node.outputs.model_output,
        X_test_data=split_node.outputs.X_test
    )

    eval_node = eval_component(
        scoring_result=score_node.outputs.score_output,
        y_test=split_node.outputs.y_test
    )

    # Score and evaluate the second model on the same test split
    score_node_2 = score_component(
        model_input=train_dt_node.outputs.model_output,
        X_test_data=split_node.outputs.X_test
    )

    eval_node_2 = eval_component(
        scoring_result=score_node_2.outputs.score_output,
        y_test=split_node.outputs.y_test
    )

    # Pipeline-level outputs: correlation image, both model_pkl outputs and both
    # evaluation outputs
    return {        
        "pipeline_correlacion": data_filter.outputs.imagen_correlacion,
        "pipeline_modelo_lr": train_lr_node.outputs.model_pkl,
        "pipeline_modelo_dt": train_dt_node.outputs.model_pkl,
        "pipeline_metrica_lr": eval_node.outputs.eval_output,
        "pipeline_metrica_dt": eval_node_2.outputs.eval_output       
        
    }


# Wrap the data asset URI as the pipeline's file input, then instantiate the
# pipeline job with it.
water_potability_ds = Input(type="uri_file", path=path)
pipeline_job = water_potability_decision_tree_dummy(
    pipeline_input_data=water_potability_ds
)



###### CARGA DEL PIPELINE CREADO ANTERIORMENTE AL ML_CLIENT DE AZURE PARA SU EJECUCION.

In [39]:
# Submit the pipeline under its experiment and keep the returned job
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job,
    experiment_name="pipeline_water_potability_dummy2",
)
# Display the submitted job as the cell output
pipeline_job

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Experiment,Name,Type,Status,Details Page
pipeline_water_potability_dummy2,neat_stamp_3xnfzkzhx9,pipeline,Preparing,Link to Azure Machine Learning studio


###### COMANDO PARA VER EN QUE ESTADO ESTA LA EJECUCION DEL PIPELINE.

In [40]:
# Stream the submitted job's logs into the cell output, waiting until the job completes
ml_client.jobs.stream(pipeline_job.name)


RunId: neat_stamp_3xnfzkzhx9
Web View: https://ml.azure.com/runs/neat_stamp_3xnfzkzhx9?wsid=/subscriptions/fc14670d-c5e7-4c9f-aaa4-11ab881abb5f/resourcegroups/proyecto-2-modulo-4/workspaces/proyecto_2

Streaming logs/azureml/executionlogs.txt

[2023-11-07 23:44:33Z] Submitting 1 runs, first five are: b256cf04:5a4d6b62-cd5e-4e95-8003-bf57f3e571f9
[2023-11-07 23:50:59Z] Completing processing run id 5a4d6b62-cd5e-4e95-8003-bf57f3e571f9.
[2023-11-07 23:51:00Z] Submitting 1 runs, first five are: 6adf1673:1f9740ab-765c-4d4f-aee2-759bb9ace98d
[2023-11-07 23:52:58Z] Completing processing run id 1f9740ab-765c-4d4f-aee2-759bb9ace98d.
[2023-11-07 23:52:58Z] Submitting 2 runs, first five are: 49f96cc6:b02ceca4-0dc2-4b05-9426-6475ec2a7f9c,7e94df33:1d32993f-d15c-4e40-a627-95044b026201
[2023-11-07 23:53:39Z] Completing processing run id 1d32993f-d15c-4e40-a627-95044b026201.
[2023-11-07 23:53:40Z] Submitting 1 runs, first five are: c3abfc5b:19038e17-e11c-4ad6-9a11-f39695980080
[2023-11-07 23:54:09Z] C

###### COMANDO PARA DESCARGAR LAS SALIDAS DEL PIPELINE EN UNA CARPETA LLAMADA "PIPELINE_OUTPUT", LA CUAL TENDRA DOS SUBCARPETAS; LA QUE NOS INTERESA ES "NAMED-OUTPUTS", QUE ES DONDE ESTAN LAS SALIDAS SOLICITADAS PARA EL PROYECTO.

In [41]:
# Fetch every output of the finished job into ./pipeline_output
output = ml_client.jobs.download(
    name=pipeline_job.name,
    download_path="./pipeline_output",
    all=True,
)

Downloading artifact azureml://subscriptions/fc14670d-c5e7-4c9f-aaa4-11ab881abb5f/resourcegroups/proyecto-2-modulo-4/workspaces/proyecto_2/datastores/workspaceblobstore/paths/azureml/5a4d6b62-cd5e-4e95-8003-bf57f3e571f9/imagen_correlacion/ to pipeline_output/named-outputs/pipeline_correlacion
Downloading artifact azureml://subscriptions/fc14670d-c5e7-4c9f-aaa4-11ab881abb5f/resourcegroups/proyecto-2-modulo-4/workspaces/proyecto_2/datastores/workspaceblobstore/paths/azureml/1d32993f-d15c-4e40-a627-95044b026201/model_pkl/ to pipeline_output/named-outputs/pipeline_modelo_dt
Downloading artifact azureml://subscriptions/fc14670d-c5e7-4c9f-aaa4-11ab881abb5f/resourcegroups/proyecto-2-modulo-4/workspaces/proyecto_2/datastores/workspaceblobstore/paths/azureml/b02ceca4-0dc2-4b05-9426-6475ec2a7f9c/model_pkl/ to pipeline_output/named-outputs/pipeline_modelo_lr
Downloading artifact azureml://subscriptions/fc14670d-c5e7-4c9f-aaa4-11ab881abb5f/resourcegroups/proyecto-2-modulo-4/workspaces/proyecto_2/d