In [1]:
# Import Libraries
import json
import logging

import yaml
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment, ManagedOnlineDeployment, ManagedOnlineEndpoint, CodeConfiguration
from azure.identity import DefaultAzureCredential


In [3]:
# Uncomment the following line to log in to Azure
#!az login

In [2]:
# Root logger: only warnings and above are emitted by default
logging.getLogger().setLevel(logging.WARNING)

# Azure-related loggers are restricted further, to errors only
for azure_logger_name in (
    "azure",
    "azure.identity",
    "azure.core.pipeline.policies.http_logging_policy",
):
    logging.getLogger(azure_logger_name).setLevel(logging.ERROR)

In [13]:
# Load every setting used by this notebook from config.yml
with open("config.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Azure ML workspace coordinates
subscription_id, resource_group, workspace_name = (
    config[key] for key in ("subscription_id", "resource_group", "workspace_name")
)

# Registered fine-tuned model to deploy
finetuned_model_name, finetuned_model_version = (
    config[key] for key in ("finetuned_model_name", "finetuned_model_version")
)

# Inference environment settings
(
    inference_env_name,
    inference_env_version,
    inference_env_description,
    inference_env_conda_file,
    inference_env_base_image,
) = (
    config[key]
    for key in (
        "inference_env_name",
        "inference_env_version",
        "inference_env_description",
        "inference_env_conda_file",
        "inference_env_base_image",
    )
)

# Real-time endpoint and deployment settings
(
    endpoint_name,
    deployment_name,
    deployment_instance_type,
    deployment_instance_count,
) = (
    config[key]
    for key in (
        "endpoint_name",
        "deployment_name",
        "deployment_instance_type",
        "deployment_instance_count",
    )
)

In [4]:
# Build the Azure ML client for the workspace named in the config
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

In [6]:
# Look up the registered fine-tuned model by the name and version from the config;
# the deployment cell below passes `registered_model.id` as its model reference.
registered_model = ml_client.models.get(
    name=finetuned_model_name,
    version=finetuned_model_version,
)

In [7]:
# Create or get inference environment
try:
    # Reuse the environment when this exact name/version is already registered
    env_asset = ml_client.environments.get(name=inference_env_name, version=inference_env_version)
    print(f"Using existing environment: {inference_env_name}:{inference_env_version}")
except Exception:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
    # (e.g. interrupting the kernel) are not swallowed and mistaken for "not found".
    # NOTE(review): this still treats any lookup failure as "environment missing";
    # consider narrowing to the SDK's not-found exception once confirmed.
    print(f"Creating new environment: {inference_env_name}")
    env_asset = Environment(
        name=inference_env_name,
        # Register under the configured version so the `get` above finds it on the
        # next run instead of falling through to creation again.
        # Assumes the config always provides a version -- TODO confirm.
        version=str(inference_env_version),
        conda_file=inference_env_conda_file,
        image=inference_env_base_image,
        # Use the description from config.yml (it was loaded but previously unused)
        description=inference_env_description,
    )
    env_asset = ml_client.environments.create_or_update(env_asset)
    print(f"Environment created: {env_asset.id}")

Using existing environment: finetuned-phi4-model-env:4


In [8]:
# Create or get the managed online endpoint
try:
    # Reuse the endpoint if one with this name already exists
    endpoint = ml_client.online_endpoints.get(endpoint_name)
    print(f"Using existing endpoint: {endpoint_name}")
except Exception:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
    # (e.g. interrupting the kernel) are not swallowed and mistaken for "not found".
    # NOTE(review): this still treats any lookup failure as "endpoint missing";
    # consider narrowing to the SDK's not-found exception once confirmed.
    print(f"Creating new endpoint: {endpoint_name}")

    # Define the endpoint with key-based authentication
    endpoint = ManagedOnlineEndpoint(
        name=endpoint_name,
        auth_mode="key",
        description="Endpoint for fine-tuned model inference"
    )
    # Same operations-group call the traffic cell uses; `.result()` blocks until
    # the long-running create operation has finished.
    endpoint = ml_client.online_endpoints.begin_create_or_update(endpoint).result()
    print(f"Endpoint '{endpoint_name}' created successfully")

Creating new endpoint: ft-phi4-mini-instruct-endpoint
Endpoint 'ft-phi4-mini-instruct-endpoint' created successfully


In [6]:
# Delete the endpoint in case of any issues and retry creating it
# ml_client.online_endpoints.begin_delete(name=endpoint_name).result()

.....

In [None]:
# Create or get the deployment behind the endpoint
try:
    # Reuse the deployment if it already exists on this endpoint
    deployment = ml_client.online_deployments.get(deployment_name, endpoint_name)
    print(f"Using existing deployment: {deployment_name}")
except Exception:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
    # (e.g. interrupting the kernel) are not swallowed and mistaken for "not found".
    # NOTE(review): this still treats any lookup failure as "deployment missing";
    # consider narrowing to the SDK's not-found exception once confirmed.
    print(f"Creating new deployment: {deployment_name}")
    deployment = ManagedOnlineDeployment(
        name=deployment_name,
        endpoint_name=endpoint_name,
        model=registered_model.id,
        environment=env_asset.id,
        # Scoring code is taken from ./serve, with score_real_time.py as the entry script
        code_configuration=CodeConfiguration(code="./serve", scoring_script="score_real_time.py"),
        instance_type=deployment_instance_type,
        instance_count=deployment_instance_count
    )

    print("Creating deployment... This may take several minutes.")
    # `.result()` blocks until the long-running create operation has finished
    deployment = ml_client.online_deployments.begin_create_or_update(deployment).result()
    print(f"Deployment '{deployment_name}' created successfully")

Check: endpoint ft-phi4-mini-instruct-endpoint exists


Creating new deployment: blue
Creating deployment... This may take several minutes.
.......................................................................................................Deployment 'blue' created successfully


In [None]:
# Route 100% of the endpoint's traffic to the deployment created above.
# The key is the configured `deployment_name` rather than a hard-coded "blue",
# so this cell keeps working when config.yml names the deployment differently.
endpoint.traffic = {deployment_name: 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

## Testing the real-time endpoint

In [24]:
# Multiple-choice question sent as the user turn of the test request
user_content = """
Question:
A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the following was given?
            
Options:
A. Chloramphenicol
B. Gentamicin
C. Ciprofloxacin
D. Ceftriaxone
E. Trimethoprim"""

# Chat turns: the system instruction first, then the question
system_message = {
    "role": "system",
    "content": "You are a medical expert. Read the following USMLE question and choose the best answer. Just give the option.",
}
user_message = {"role": "user", "content": user_content}

# Request payload: the two messages plus the temperature / max_new_tokens / do_sample values
sample = dict(
    messages=[system_message, user_message],
    temperature=0.0,
    max_new_tokens=10,
    do_sample=False,
)

In [25]:
# Persist the request payload to request.json; the invoke call reads it from this file.
# (`json` is imported in the notebook's top import cell rather than here, so all
# dependencies are declared in one place.)
with open("request.json", "w") as f:
    json.dump(sample, f, indent=4)

In [26]:
# Send the saved request file to the endpoint and print whatever it returns
response = ml_client.online_endpoints.invoke(endpoint_name=endpoint_name, request_file="request.json")
print(response)

"C"


### Make sure to delete the dedicated clusters and the endpoints after evaluation. Low-priority clusters scale to 0 automatically, so they don't incur costs when idle.