# Introduction

# Motivation

In [46]:
# Import Libraries

import azureml.core
from azureml.core import Workspace, Dataset, Datastore
from azureml.data.datapath import DataPath
from azureml.core.authentication import ServicePrincipalAuthentication


import os

## Connect to Workspace

In [112]:

# Connect to the Azure ML workspace described by the saved config file
ws = Workspace.from_config()
ready_message = 'Ready to use Azure ML {} to work with {}'
print(ready_message.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.48.0 to work with azmachinelearning


In [48]:


# Default datastore attached to the workspace; the raw CSV is uploaded here
default_ds = ws.get_default_datastore()

if 'titanic dataset' in ws.datasets:
    # A dataset with this name is already in the workspace: nothing to do
    print('Dataset already registered.')
else:
    # Upload everything under ./data to the 'titanic-data/' folder of the datastore
    upload_target = DataPath(default_ds, 'titanic-data/')
    Dataset.File.upload_directory(src_dir='data', target=upload_target)

    # Create a tabular dataset from the uploaded CSV (this may take a short while)
    csv_location = (default_ds, 'titanic-data/titanic.csv')
    tab_data_set = Dataset.Tabular.from_delimited_files(path=csv_location)

    # Register the tabular dataset; a failure is printed and the notebook carries on
    try:
        tab_data_set = tab_data_set.register(
            workspace=ws,
            name='titanic dataset',
            description='titanic data',
            tags={'format': 'CSV'},
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as ex:
        print(ex)

Dataset already registered.


In [49]:

# Folder that holds the pipeline step scripts and the conda specification
experiment_folder = 'titanic_pipeline'
os.makedirs(name=experiment_folder, exist_ok=True)  # no error if it already exists

print(experiment_folder)

titanic_pipeline


In [50]:
%%writefile $experiment_folder/prep_titanic.py
# Import libraries
import os
import argparse
import pandas as pd
import joblib
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from azureml.core import Workspace, Dataset, Datastore
from azureml.data.datapath import DataPath
from azureml.core.authentication import ServicePrincipalAuthentication

svc_pr = ServicePrincipalAuthentication(
    tenant_id="aaf80b90-a7b0-4d67-a45e-9f667ca00d2a",
    service_principal_id="f42dabcf-89aa-49b4-934e-151f64127f09",
    service_principal_password="rJ68Q~3LC1_dBnuNYmJ0UDCu7tWFH0QXnEBHicA0") ## BAD Practice, but testing right now. use environment varibale instead or azure secret vault


# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Need a better way to define workspace as opposed to hard code
subscription_id = '0fc728de-7dd8-42de-97fb-5ff957b4f4f1'
resource_group = 'azmachinelearning'
workspace_name = 'azmachinelearning'


# Authenticate via service principal
ws= Workspace(subscription_id, resource_group, workspace_name, auth = svc_pr)

default_ds = ws.get_default_datastore()


# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
df = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
row_count = (len(df))
run.log('raw_rows', row_count)

# remove nulls
df = df.dropna()

# Normalize the numeric columns
# Scaling isn't necessary for decision trees
scaler = MinMaxScaler()
num_cols = ['Age','Fare']
df[num_cols] = scaler.fit_transform(df[num_cols])

df['Sex'] = df['Sex'].replace({'male':1,'female':0})

# Log processed rows
row_count = (len(df))
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
df.to_csv(save_path, index=False, header=True)

# Saving Scalar File
os.makedirs('outputs', exist_ok=True)
scaler_file = os.path.join('outputs', 'titanic_scaler.pkl')
joblib.dump(value=scaler, filename=scaler_file)


default_ds = ws.get_default_datastore()
folder_data = Dataset.File.upload_directory(src_dir='outputs',
                              target=DataPath(default_ds, 'titanic-data/')
                              )

try:
    folder_data.register(workspace=ws, name='titanic_scaler.pkl')
except:
    print('file already there')

# End the run
run.complete()

Overwriting titanic_pipeline/prep_titanic.py


In [51]:
%%writefile $experiment_folder/train_titanic.py
# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training data')
args = parser.parse_args()
training_data = args.training_data

# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_data,'data.csv')
titanic = pd.read_csv(file_path)

# Separate features and labels
X, y = titanic[['Age','Sex','Fare']].values, titanic['Survived'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train adecision tree model
print('Training a decision tree model...')
model = LogisticRegression(random_state = 42).fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()

# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'titanic_model.pkl')
joblib.dump(value=model, filename=model_file)


# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'titanic_model',
               tags={'Training context':'Pipeline'},
               properties={'AUC': np.float(auc), 'Accuracy': np.float(acc)})


run.complete()

Overwriting titanic_pipeline/train_titanic.py


In [52]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster that executes the pipeline steps
cluster_name = "sweetdreams"

try:
    # Reuse the cluster when it already exists in the workspace
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Not found: provision a new cluster and wait until it is ready.
    # Any provisioning error is printed rather than raised.
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2,
        )
        pipeline_cluster = ComputeTarget.create(
            workspace=ws,
            name=cluster_name,
            provisioning_configuration=compute_config,
        )
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


In [53]:
%%writefile $experiment_folder/experiment_env.yml
# Conda specification for the environment the pipeline run configuration uses.
# NOTE(review): the step scripts import joblib, which is not listed here -
# presumably it is installed as a scikit-learn dependency; verify.
name: experiment_env
dependencies:
# NOTE(review): no package below is version-pinned, only the interpreter.
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed with pip rather than conda
- pip:
  - azureml-defaults
  - pyarrow

Overwriting titanic_pipeline/experiment_env.yml


In [54]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Build the experiment environment from the conda specification file
conda_spec_path = experiment_folder + "/experiment_env.yml"
experiment_env = Environment.from_conda_specification(
    name="experiment_env",
    file_path=conda_spec_path,
)

# Register it in the workspace, then read the registered definition back
experiment_env.register(workspace=ws)
registered_env = Environment.get(workspace=ws, name='experiment_env')

# Run configuration for the pipeline: which compute to use and which environment
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = pipeline_cluster
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


In [55]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep


# Registered dataset that feeds the first step
titanic_ds = ws.datasets.get("titanic dataset")

# Intermediate output used to hand data from step 1 to step 2
prepped_data = OutputFileDatasetConfig("prepped_data")

# Settings that are the same for both steps
shared_step_settings = dict(
    source_directory=experiment_folder,
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Step 1: data preparation script
prep_step = PythonScriptStep(
    name="Prepare Data",
    script_name="prep_titanic.py",
    arguments=[
        '--input-data', titanic_ds.as_named_input('raw_data'),
        '--prepped-data', prepped_data,
    ],
    **shared_step_settings,
)

# Step 2: training script, consuming the output of step 1
train_step = PythonScriptStep(
    name="Train and Register Model",
    script_name="train_titanic.py",
    arguments=['--training-data', prepped_data.as_input()],
    **shared_step_settings,
)

print("Pipeline steps defined")

Pipeline steps defined


In [56]:

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the two steps into a pipeline
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline under the 'titanic-pipeline' experiment
experiment = Experiment(workspace=ws, name='titanic-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Show the progress widget, then block until the run finishes
run_widget = RunDetails(pipeline_run)
run_widget.show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [f78366d3][9920a7f8-86fb-4fae-ba38-e173a4ea7c76], (This step will run and generate new outputs)
Created step Train and Register Model [25596e72][c2ac3002-b92b-43bf-a8ee-c8e8c07b6fe9], (This step will run and generate new outputs)
Submitted PipelineRun a96c351b-6760-43f7-acbb-b64c1c91f22a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/a96c351b-6760-43f7-acbb-b64c1c91f22a?wsid=/subscriptions/0fc728de-7dd8-42de-97fb-5ff957b4f4f1/resourcegroups/azmachinelearning/workspaces/azmachinelearning&tid=aaf80b90-a7b0-4d67-a45e-9f667ca00d2a
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', '…

PipelineRunId: a96c351b-6760-43f7-acbb-b64c1c91f22a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/a96c351b-6760-43f7-acbb-b64c1c91f22a?wsid=/subscriptions/0fc728de-7dd8-42de-97fb-5ff957b4f4f1/resourcegroups/azmachinelearning/workspaces/azmachinelearning&tid=aaf80b90-a7b0-4d67-a45e-9f667ca00d2a
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 1ce37a68-f6b8-481f-b109-34e45888940e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1ce37a68-f6b8-481f-b109-34e45888940e?wsid=/subscriptions/0fc728de-7dd8-42de-97fb-5ff957b4f4f1/resourcegroups/azmachinelearning/workspaces/azmachinelearning&tid=aaf80b90-a7b0-4d67-a45e-9f667ca00d2a
StepRun( Prepare Data ) Status: Queued
StepRun( Prepare Data ) Status: Running

StepRun(Prepare Data) Execution Summary
StepRun( Prepare Data ) Status: Finished


ExperimentExecutionException: ExperimentExecutionException:
	Message: The output streaming for the run interrupted.
But the run is still executing on the compute target. 
Details for canceling the run can be found here: https://aka.ms/aml-docs-cancel-run
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "The output streaming for the run interrupted.\nBut the run is still executing on the compute target. \nDetails for canceling the run can be found here: https://aka.ms/aml-docs-cancel-run"
    }
}

In [57]:
# Print the metrics logged by each child run of the pipeline
for step_run in pipeline_run.get_children():
    print(step_run.name, ':')
    for metric_name, metric_value in step_run.get_metrics().items():
        print('\t', metric_name, ":", metric_value)

Train and Register Model :
	 Accuracy : 0.7837837837837838
	 AUC : 0.8788819875776397
	 ROC : aml://artifactId/ExperimentRun/dcid.cfaaba1f-3085-4a3c-960b-b6e60dffc923/ROC_1678464663.png
Prepare Data :
	 raw_rows : 891
	 processed_rows : 183


In [58]:
import os

# Local folder holding the files for the web service deployment
deployment_folder = './titanic_service'
os.makedirs(name=deployment_folder, exist_ok=True)  # no error if it already exists
print(deployment_folder, 'folder created.')

# Name and full path of the scoring script inside that folder
script_file = 'score_titanic.py'
script_path = os.path.join(deployment_folder, script_file)
     

./titanic_service folder created.


In [109]:
%%writefile $script_path
import json
import joblib
import numpy as np
import os
import azureml.core
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core import Workspace, Dataset, Datastore
from azureml.data.datapath import DataPath


import os
# Called when the service is loaded
def init():
    global model
    # Get the path to the deployed model file and load it
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'titanic_model.pkl')
    model = joblib.load(model_path)
    
    svc_pr = ServicePrincipalAuthentication(
        tenant_id="aaf80b90-a7b0-4d67-a45e-9f667ca00d2a",
        service_principal_id="f42dabcf-89aa-49b4-934e-151f64127f09",
        service_principal_password="rJ68Q~3LC1_dBnuNYmJ0UDCu7tWFH0QXnEBHicA0") ## BAD Practice, but testing right now. use environment varibale instead or azure secret vault
    
    # Need a better way to define workspace as opposed to hard code
    subscription_id = '0fc728de-7dd8-42de-97fb-5ff957b4f4f1'
    resource_group = 'azmachinelearning'
    workspace_name = 'azmachinelearning'


    # Authenticate via service principal
    ws= Workspace(subscription_id, resource_group, workspace_name, auth = svc_pr)
    
    # Download scalar from datastore
    #path = 'titanic-data/titanic_scaler.pkl'
    #datastore = Datastore.get(ws, "workspaceblobstore")
    #scalar_file = Dataset.File.from_files(path=(datastore, path))

    #scalar_file.download(target_path='./titanic_service')
    #scalar = joblib.load('titanic_service/titanic_scaler.pkl')


# Called when a request is received
def run(raw_data):
    # Get the input data as a numpy array
    data = np.array(json.loads(raw_data)['data'])
    
    ## Perform Scalar Transformation
    x_transformed = scalar.transform(np.array(data)[:,[0,2]]).tolist() # transforming age and fare    
    for n in range(data.shape[0]):
        data[n][0] = x_transformed[n][0] # Age 
        data[n][2] = x_transformed[n][1] # Fare


    # Get a prediction from the model
    predictions = model.predict(data)
    # Get the corresponding classname for each prediction (0 or 1)
    classnames = ['Non-Survived', 'Survived']
    predicted_classes = []
    for prediction in predictions:
        predicted_classes.append(classnames[prediction])
    # Return the predictions as JSON
    return json.dumps(predicted_classes)

Overwriting ./titanic_service/score_titanic.py


In [102]:
# Look up the registered model by name and report its version
model = ws.models['titanic_model']
name_and_version = (model.name, 'version', model.version)
print(*name_and_version)

titanic_model version 1


In [111]:
from azureml.core import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core import Model

# Environment for the scoring container
service_env = Environment.get(
    workspace=ws,
    name="AzureML-sklearn-0.24.1-ubuntu18.04-py37-cpu-inference",
)
service_env.inferencing_stack_version = "latest"

# Entry script and source folder that are packaged with the service
inference_config = InferenceConfig(
    source_directory=deployment_folder,
    entry_script=script_file,
    environment=service_env,
)

# Resources for the ACI web service container
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

# Deploy the model as a service, replacing any service with the same name
print('Deploying model...')
service_name = "titanic-service3"
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)
print(service.state)
     

Deploying model...


azureml.core.model:
To leverage new model deployment capabilities, AzureML recommends using CLI/SDK v2 to deploy models as online endpoint, 
please refer to respective documentations 
https://docs.microsoft.com/azure/machine-learning/how-to-deploy-managed-online-endpoints /
https://docs.microsoft.com/azure/machine-learning/how-to-attach-kubernetes-anywhere 
For more information on migration, see https://aka.ms/acimoemigration. 


Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2023-03-10 17:12:48+00:00 Creating Container Registry if not exists.
2023-03-10 17:12:49+00:00 Use the existing image.
2023-03-10 17:12:50+00:00 Generating deployment configuration.
2023-03-10 17:12:51+00:00 Submitting deployment to compute..
2023-03-10 17:13:03+00:00 Checking the status of deployment titanic-service3..
2023-03-10 17:14:07+00:00 Checking the status of inference endpoint titanic-service3.

KeyboardInterrupt: 

In [108]:
# Retrieve the logs of the deployed service to debug a stalled or failed deployment
service.get_logs()

In [None]:
# Print every entry of the workspace's web service collection, one per line
for deployed_name in ws.webservices:
    print(deployed_name)

In [86]:
# Location of the pickled scaler and the datastore expected to hold it
path = 'titanic-data/titanic_scaler.pkl'
datastore = Datastore.get(workspace=ws, datastore_name="workspaceblobstore")

# Disabled experiments for fetching the scaler from the datastore:
#scalar_file = Dataset.File.from_files(path=(datastore, path))
#mounted_path = scalar_file.mount()

#scalar_file.download(target_path='./titanic_service')

In [87]:
# Reference the scaler pickle on the datastore as a file dataset
scalar_file = Dataset.File.from_files(
    path=(datastore, path),
)

KeyboardInterrupt: 

In [None]:
# Reference the scaler pickle on the default datastore as a file dataset.
# The file is saved and uploaded as 'titanic_scaler.pkl' (scaler, not scalar),
# so the path has to use that spelling to resolve.
scalar_file = Dataset.File.from_files(path=(default_ds, 'titanic-data/titanic_scaler.pkl'))


In [None]:
# Register the file dataset in the workspace, creating a new version if the name exists
registration_options = dict(
    name='titanic scalar',
    description='titanic scalar',
    tags={'format': 'pkl'},
    create_new_version=True,
)
scalar_file = scalar_file.register(workspace=ws, **registration_options)

In [68]:
import joblib

# Load the pickled scaler from the local deployment folder
scaler = joblib.load(filename='titanic_service/titanic_scaler.pkl')

Trying to unpickle estimator MinMaxScaler from version 0.24.2 when using version 0.22.1. This might lead to breaking code or invalid results. Use at your own risk.


In [90]:
# numpy is not imported by any earlier cell, so import it here; without this
# the cell fails with a NameError on Restart & Run All.
import numpy as np

# Two sample passengers as [Age, Sex, Fare]
x_new = [[22,0,7.25],[54,1,52]]
# Scale only Age (column 0) and Fare (column 2) with the loaded scaler
x_transformed = scaler.transform(np.array(x_new)[:,[0,2]]).tolist() # transforming age and fare

In [84]:
# Write the scaled Age and Fare back into each sample row (Sex is left as is)
for n, sample_row in enumerate(x_new):
    scaled_age, scaled_fare = x_transformed[n][0], x_transformed[n][1]
    sample_row[0] = scaled_age
    sample_row[2] = scaled_fare
    
   

In [91]:
# Convert the sample rows to a NumPy array
dyrus = np.array(x_new)

In [96]:
# Display the array
dyrus

array([[0.2665655 , 0.        , 0.01415106],
       [0.67121902, 1.        , 0.10149724]])

In [95]:
# Copy the scaled Age and Fare into the array; each row is a view, so the
# assignments write through to dyrus
for n, array_row in enumerate(dyrus):
    array_row[0] = x_transformed[n][0]
    array_row[2] = x_transformed[n][1]
    

In [85]:
# Display the sample rows
x_new 

[[0.2665655032878098, 0, 0.014151057562208049],
 [0.6712190187152252, 1, 0.10149724044618187]]

In [None]:
## Invoke Endpoint
import pickle
import json
import numpy as np

# Feature order sent to the service: Age, Sex, Fare
# After scaling, the author's expected input is:
# [[0.2711736617240512,0, 0.014151057562208049], [0.6732847449107816, 1,0.10149724044618187]]
x_new = [[22,0,7.25],[54,1,52]]

# Wrap the rows in a JSON document under the "data" key
request_payload = {"data": x_new}
input_json = json.dumps(request_payload)

# Send the request to the deployed web service
predictions = service.run(input_data=input_json)

# Decode the JSON response; it is indexed once per input row below
predicted_classes = json.loads(predictions)

for i, passenger in enumerate(x_new):
    print ("Passenger {}".format(passenger), predicted_classes[i] )
     

In [None]:
# The delete call is disabled, so nothing is removed by this cell. The old
# message claimed 'Service deleted.' regardless, which could leave a billable
# endpoint running unnoticed. Uncomment the call to actually delete it.
#service.delete()
print ('Service NOT deleted: service.delete() is commented out.')

Research/References:

- https://machinelearningmastery.com/how-to-save-and-load-models-and-data-preparation-in-scikit-learn-for-later-use/