In [None]:
# Uncomment the following line to install the required packages
# restart the kernel after running this cell
#!pip install pyyaml logger azure-ai-ml pandas --upgrade --user --no-cache-dir

In [21]:
# Uncomment the following line to log in to Azure
#!az login

In [29]:
# Import Libraries
import logging

import yaml
from azure.ai.ml import MLClient, Input, Output, command
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Environment, Model
from azure.core.exceptions import ResourceNotFoundError
from azure.identity import DefaultAzureCredential

In [23]:
# Root logger: only warnings and above are emitted by default
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)

# Azure-related loggers are restricted further, to errors only, in this notebook
for azure_logger_name in (
    "azure",
    "azure.identity",
    "azure.core.pipeline.policies.http_logging_policy",
):
    logging.getLogger(azure_logger_name).setLevel(logging.ERROR)

In [27]:
# Read the Azure ML workspace configuration from config.yml.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of falling back to the locale's default encoding.
with open("config.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Fail early with one clear message if the file is empty / not a mapping, or if
# any setting read below is missing, rather than on the first failing lookup.
required_config_keys = (
    "subscription_id",
    "resource_group",
    "workspace_name",
    "training_env_name",
    "training_env_version",
    "training_env_description",
    "training_env_requirements",
    "training_env_base_image",
    "training_env_conda_file",
    "training_compute_cluster_name",
    "training_compute_cluster_size",
    "data_dir",
    "train_dir",
    "output_dir",
    "model_dir",
    "finetuned_model_name",
    "finetuned_model_version",
)
if not isinstance(config, dict):
    raise TypeError("config.yml must contain a YAML mapping of configuration keys")
missing_config_keys = [key for key in required_config_keys if key not in config]
if missing_config_keys:
    raise KeyError(f"config.yml is missing required keys: {', '.join(missing_config_keys)}")

# Azure ML workspace configuration
subscription_id = config["subscription_id"]
resource_group = config["resource_group"]
workspace_name = config["workspace_name"]

# Training environment configuration
training_env_name = config["training_env_name"]
training_env_version = config["training_env_version"]
training_env_description = config["training_env_description"]

training_env_requirements = config["training_env_requirements"]
training_env_base_image = config["training_env_base_image"]
training_env_conda_file = config["training_env_conda_file"]

# Training Compute configuration
training_compute_cluster_name = config["training_compute_cluster_name"]
training_compute_cluster_size = config["training_compute_cluster_size"]

# Directory configuration
data_dir = config["data_dir"]
train_dir = config["train_dir"]
output_dir = config["output_dir"]
model_dir = config["model_dir"]

# Finetuned model configuration
finetuned_model_name = config["finetuned_model_name"]
finetuned_model_version = config["finetuned_model_version"]


In [None]:
# Initialize ML Client
# Authenticate via DefaultAzureCredential and bind the client to the workspace
# identified by the values read from config.yml.
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


In [None]:
# Create the data asset - not needed as a separate step: the data is uploaded directly as part of the training job

In [None]:
# Create the compute - I have done this via the Azure ML Studio; the cluster name must match training_compute_cluster_name in config.yml

In [None]:
# Create the training environment - the environment reuses the Azure ML base image and then installs the packages using the conda yml file.
# This takes around 20 minutes to build the environment - check the Azure ML Studio > Environments to see the progress of the environment creation.

try:
    # Check if the environment already exists
    existing_env = ml_client.environments.get(name=training_env_name, version=training_env_version)
    print(f"Environment '{training_env_name}:{training_env_version}' already exists.")
except ResourceNotFoundError:
    # Only a "not found" response means the environment has to be created.
    # Any other failure (authentication, network, permissions) is left to
    # propagate instead of being mistaken for a missing environment.
    print(f"Environment '{training_env_name}:{training_env_version}' does not exist. Creating a new one.")
    # Define the custom environment
    custom_env = Environment(
        name=training_env_name,
        version=training_env_version,
        description=training_env_description,
        image=training_env_base_image,
        conda_file=training_env_conda_file
    )

    # Register the environment (this will build it)
    print("Creating and registering environment..")
    registered_env = ml_client.environments.create_or_update(custom_env)
    print(f"Environment '{training_env_name}:{training_env_version}' registered successfully")

Creating and registering environment..
Environment 'ft-env-phi-4-mini-instruct-lora:1' registered successfully


In [26]:
# Create the training job

# Input dataset: the local data folder, uploaded as part of the training job
job_inputs = {
    "dataset_path": Input(type="uri_folder", path=data_dir),
}

# Output folder that receives the job output, including the checkpoints and model
job_outputs = {
    "output": Output(type="uri_folder"),
}

# Command line executed on the compute; the ${{...}} placeholders refer to the
# input and output declared above.
train_command = "python train.py --dataset_path ${{inputs.dataset_path}} --output_dir ${{outputs.output}}"

job = command(
    code=train_dir,
    command=train_command,
    inputs=job_inputs,
    outputs=job_outputs,
    environment=f"{training_env_name}:{training_env_version}",
    compute=training_compute_cluster_name,
    experiment_name="finetune-phi-4-mini-instruct",
    display_name="ft-e3-tb8-eb8-ga4-lr2e-05-r64-a128-d0.1",  # update this to reflect your job training parameters
    description="phi 4 mini instruct LoRA fine-tuning",
)

# Submit the training job and report the name assigned to it
returned_job = ml_client.jobs.create_or_update(job)
print(f"Submitted job: {returned_job.name}")

pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


Submitted job: blue_frog_cnlg9dp1rf


In [38]:
# Register the final model

# Registering before the training job has finished would point the model at a
# missing or partial output folder, so stop unless the job has completed.
job_status = ml_client.jobs.get(returned_job.name).status
if job_status != "Completed":
    raise RuntimeError(
        f"Job '{returned_job.name}' has status '{job_status}'; "
        "wait for it to complete before registering the model."
    )

# Datastore URI of the "model" folder inside the job's "output" output
path = f"azureml://datastores/workspaceblobstore/paths/azureml/{returned_job.name}/output/model/" # datastore uri

model_asset = Model(
    path=path,
    name=finetuned_model_name,
    version=finetuned_model_version,
    description="finetuned phi 4 mini instruct model",
    type=AssetTypes.CUSTOM_MODEL,
    properties={
        # Record which training job produced this model
        "azureml.job_name": f"{returned_job.name}"
    },
)


registered_model = ml_client.models.create_or_update(model_asset)

In [None]:
# Show the ID of the model registered above; the deployment notebook can use it
registered_model_id = registered_model.id
print("Model ID:", registered_model_id)

In [None]:
# Download the final model to local directory
# Look the registered model up by the configured name and version, then download
# it into the local "model" folder.
model = ml_client.models.get(
    name=finetuned_model_name,
    version=finetuned_model_version,
)
ml_client.models.download(
    name=model.name,
    version=model.version,
    download_path="model",
)

In [37]:
# If the model was not downloaded by the cell above, you can use azcopy to download it instead. The warnings printed by the cell above explain how to do it; the command might look like this:
# !azcopy copy 'https://<storage>.blob.core.windows.net/<azureml_blobstore>/azureml/<job_name>/output/model/' 'model\<finetuned_model_name>\model'