In [None]:
# Installing the the required library
!pip install azure-ai-ml
!pip show azure-ai-ml

In [1]:
# Connecting to the workspace
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()

In [2]:
# Get a handle to azure services (worksopace)
ml_client = MLClient.from_config(credential=credential)

Found the config file in: /config.json


In [3]:
# Create a data asset
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

my_path = './data/diabetes.csv'

my_data = Data(
    path=my_path,
    type=AssetTypes.URI_FILE,
    description="Data asset pointing to a local file, automatically uploaded to the default datastore",
    name="my-diabetes-local"
)

ml_client.data.create_or_update(my_data)

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'name': 'my-diabetes-local', 'description': 'Data asset pointing to a local file, automatically uploaded to the default datastore', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/2a21ade8-9d70-4d5a-a619-083b264d1d56/resourceGroups/mlcertificate/providers/Microsoft.MachineLearningServices/workspaces/ft_ml/data/my-diabetes-local/versions/2', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/farbodtaymouri1/code/Users/farbodtaymouri/my-azure-ml-projects', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7fc3bdc5b6d0>, 'serialize': <msrest.serialization.Serializer object at 0x7fc3bdc5b130>, 'version': '2', 'latest_version': None, 'path': 'azureml://subscriptions/2a21ade8-9d70-4d5a-a619-083b264d1d56/resourcegroups/mlcertificate/worksp

In [8]:
# Create a source directory
import os

# create a folder for the script files
script_folder = 'src'
os.makedirs(script_folder, exist_ok=True)
print(script_folder, 'folder created')

src folder created


In [16]:
%%writefile $script_folder/train-model-mlflow.py
# import libraries
import mlflow
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from mlflow.models import infer_signature



from mlflow.pyfunc import PythonModel, PythonModelContext
# https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-mlflow-models?view=azureml-api-2&tabs=wrapper#logging-custom-models
class ModelWrapper(PythonModel):
    def __init__(self, model):
        self._model = model

    def predict(self, context: PythonModelContext, data):
        # You don't have to keep the semantic meaning of `predict`. You can use here model.recommend(), model.forecast(), etc
        return self._model.predict_proba(data)

    # You can even add extra functions if you need to. Since the model is serialized,
    # all of them will be available when you load your model back.
    def predict_batch(self, data):
        pass





def main(args):
    # read data
    df = get_data(args.training_data)

    # split data
    X_train, X_test, y_train, y_test = split_data(df)

    # train model
    model = train_model(args.n_estimators, args.max_depth, X_train, X_test, y_train, y_test)

    # evaluate model
    eval_model(model, X_test, y_test)

# function that reads the data
def get_data(path):
    print("Reading data...")
    df = pd.read_csv(path)
    
    return df

# function that splits the data
def split_data(df):
    print("Splitting data...")
    X, y = df[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness',
    'SerumInsulin','BMI','DiabetesPedigree','Age']].values, df['Diabetic'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

    return X_train, X_test, y_train, y_test

# function that trains the model
def train_model(n_estimators, max_depth, X_train, X_test, y_train, y_test):

    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)


    clf = RandomForestClassifier(max_depth = max_depth, n_estimators = n_estimators, random_state=0)
    model = clf.fit(X_train, y_train)


    # mlflow.log_param("Regularization rate", reg_rate)
    # print("Training model...")
    # model = LogisticRegression(C=1/reg_rate, solver="liblinear").fit(X_train, y_train)

    y_probs = model.predict_proba(X_test)

    #Logging the model artifact
    signature = infer_signature(X_test, y_probs)
    mlflow.pyfunc.log_model("classifier", 
                        python_model=ModelWrapper(model),
                        signature=signature)
    

    return model

# function that evaluates the model
def eval_model(model, X_test, y_test):
    # calculate accuracy
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)
    print('Accuracy:', acc)
    mlflow.log_metric("Accuracy", acc)

    # calculate AUC
    y_scores = model.predict_proba(X_test)
    auc = roc_auc_score(y_test,y_scores[:,1])
    print('AUC: ' + str(auc))
    mlflow.log_metric("AUC", auc)

    # plot ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
    fig = plt.figure(figsize=(6, 4))
    # Plot the diagonal 50% line
    plt.plot([0, 1], [0, 1], 'k--')
    # Plot the FPR and TPR achieved by our model
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.savefig("ROC-Curve.png")
    mlflow.log_artifact("ROC-Curve.png")    

def parse_args():
    # setup arg parser
    parser = argparse.ArgumentParser()

    # add arguments
    parser.add_argument("--training_data", dest='training_data',
                        type=str)
    parser.add_argument("--n_estimators", dest='n_estimators',
                        type=int, default=10)
    parser.add_argument("--max_depth", dest='max_depth',
                        type=int, default=3)

    # parse args
    args = parser.parse_args()

    # return args
    return args

# run script
if __name__ == "__main__":
    # add space in logs
    print("\n\n")
    print("*" * 60)

    # parse args
    args = parse_args()

    # run main function
    main(args)

    # add space in logs
    print("*" * 60)
    print("\n\n")


Overwriting src/train-model-mlflow.py


## Run the script as a commnd job with taking inputs as arga


In [9]:
# Run the script as command
from azure.ai.ml import command
from azure.ai.ml import command, Input
from azure.ai.ml.constants import AssetTypes

# configure job

job = command(
    code="./src",
    command="python train-model-mlflow.py --training_data ${{inputs.diabetes_data}} --n_estimators ${{inputs.n_estimators}} --max_depth ${{inputs.max_depth}}",
    inputs={
    "diabetes_data": Input(
        type=AssetTypes.URI_FILE, 
        path="azureml:my-diabetes-local:1"
        ),
    "n_estimators": 5,
    "max_depth":2,
    },
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest",
    compute="farbodtaymouri1",
    display_name="diabetes-train-mlflow",
    experiment_name="diabetes-training2", 
    tags={"model_type": "RandomForest"}
    )

# submit job if you want to run it
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)

## Hyperparamter tuning

In [10]:
# Defining the variables and values for selection
from azure.ai.ml.sweep import Choice

# Note that such values can be initiated during creating the initial job in the above cell as well
command_job_for_sweep = job(
    n_estimators=Choice(values=[5, 10, 20]),
    max_depth = Choice(values=[2, 3, 5]),
)

In [31]:
# apply the sweep parameter to obtain the sweep_job
sweep_job = command_job_for_sweep.sweep(
    compute="farbodtaymouri1",
    sampling_algorithm="bayesian",
    primary_metric="training_accuracy_score",
    goal="Maximize",
)

# set the name of the sweep job experiment
sweep_job.experiment_name="RF-sweep-diabetes"

# define the limits for this sweep
sweep_job.set_limits(max_total_trials=4, max_concurrent_trials=2, timeout=7200)

In [13]:
help(command_job_for_sweep.sweep )

Help on method sweep in module azure.ai.ml.entities._builders.command:

sweep(*, primary_metric: str, goal: str, sampling_algorithm: str = 'random', compute: Optional[str] = None, max_concurrent_trials: Optional[int] = None, max_total_trials: Optional[int] = None, timeout: Optional[int] = None, trial_timeout: Optional[int] = None, early_termination_policy: Union[azure.ai.ml.entities._job.sweep.early_termination_policy.EarlyTerminationPolicy, str, NoneType] = None, search_space: Optional[Dict[str, Union[azure.ai.ml.entities._job.sweep.search_space.Choice, azure.ai.ml.entities._job.sweep.search_space.LogNormal, azure.ai.ml.entities._job.sweep.search_space.LogUniform, azure.ai.ml.entities._job.sweep.search_space.Normal, azure.ai.ml.entities._job.sweep.search_space.QLogNormal, azure.ai.ml.entities._job.sweep.search_space.QLogUniform, azure.ai.ml.entities._job.sweep.search_space.QNormal, azure.ai.ml.entities._job.sweep.search_space.QUniform, azure.ai.ml.entities._job.sweep.search_space.Rand

In [15]:
# returned_sweep_job = ml_client.create_or_update(sweep_job)
# aml_url = returned_sweep_job.studio_url
# print("Monitor your job at", aml_url)

Monitor your job at https://ml.azure.com/runs/modest_basket_gc827bgs23?wsid=/subscriptions/2a21ade8-9d70-4d5a-a619-083b264d1d56/resourcegroups/mlcertificate/workspaces/ft_ml&tid=71f8feea-4caa-4230-a785-dca61147bceb


In [32]:
# https://learn.microsoft.com/en-us/azure/machine-learning/how-to-train-tensorflow?view=azureml-api-2
returned_sweep_job = ml_client.create_or_update(sweep_job)

# stream the output and wait until the job is finished
ml_client.jobs.stream(returned_sweep_job.name)

# refresh the latest status of the job after streaming
returned_sweep_job = ml_client.jobs.get(name=returned_sweep_job.name)

RunId: willing_owl_dxsmksgy9v
Web View: https://ml.azure.com/runs/willing_owl_dxsmksgy9v?wsid=/subscriptions/2a21ade8-9d70-4d5a-a619-083b264d1d56/resourcegroups/mlcertificate/workspaces/ft_ml

Execution Summary
RunId: willing_owl_dxsmksgy9v
Web View: https://ml.azure.com/runs/willing_owl_dxsmksgy9v?wsid=/subscriptions/2a21ade8-9d70-4d5a-a619-083b264d1d56/resourcegroups/mlcertificate/workspaces/ft_ml



In [35]:
returned_sweep_job.properties["best_child_run_id"]

KeyError: 'best_child_run_id'