# Introduction

# Motivation

In [1]:
# Import Libraries

import azureml.core
from azureml.core import Workspace

from azureml.core import Dataset
from azureml.data.datapath import DataPath

import os

## Connect to Workspace

In [2]:

# Connect to the workspace described by the saved config file
ws = Workspace.from_config()

# Report the SDK version and the workspace this session is bound to
sdk_version = azureml.core.VERSION
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, ws.name))

Ready to use Azure ML 1.48.0 to work with testerinos


In [3]:


# Default datastore of the workspace; the raw CSV is uploaded here on first run
default_ds = ws.get_default_datastore()

if 'titanic dataset' in ws.datasets:
    # A dataset with this name is already in the workspace, so nothing is uploaded
    print('Dataset already registered.')
else:
    # Push the local 'data' folder to the 'titanic-data/' path on the datastore
    upload_target = DataPath(default_ds, 'titanic-data/')
    Dataset.File.upload_directory(src_dir='data', target=upload_target)

    # Build a tabular dataset on top of the uploaded CSV (this may take a short while)
    csv_location = (default_ds, 'titanic-data/titanic.csv')
    tab_data_set = Dataset.Tabular.from_delimited_files(path=csv_location)

    # Register the tabular dataset; a failure is printed and the cell carries on
    try:
        tab_data_set = tab_data_set.register(
            workspace=ws,
            name='titanic dataset',
            description='titanic data',
            tags={'format': 'CSV'},
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as ex:
        print(ex)

Validating arguments.
Arguments validated.
Uploading file to titanic-data/
Uploading an estimated of 3 files
Uploading data/.amlignore
Uploaded data/.amlignore, 1 files out of an estimated total of 3
Uploading data/.amlignore.amltmp
Uploaded data/.amlignore.amltmp, 2 files out of an estimated total of 3
Uploading data/titanic.csv
Uploaded data/titanic.csv, 3 files out of an estimated total of 3
Uploaded 3 files
Creating new dataset
Dataset registered.


In [4]:

# Directory that holds the scripts and environment file for the pipeline steps
experiment_folder = 'titanic_pipeline'

# exist_ok=True keeps this cell safe to re-run
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder)

titanic_pipeline


In [5]:
%%writefile $experiment_folder/prep_titanic.py
# Data-preparation step of the titanic pipeline.
#
# Reads the raw dataset passed to the step as the named input 'raw_data',
# drops rows with nulls, encodes 'Sex' as 0/1, writes the result to
# <prepped-data>/data.csv for the training step, and uploads the contents of
# ./outputs (the pickled scaler object) to the workspace default datastore.

# Import libraries
import os
import argparse
import pandas as pd
import joblib
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from azureml.core import Workspace, Dataset, Datastore
# DataPath is required by the datastore upload at the end of this script
from azureml.data.datapath import DataPath

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# Reuse the workspace this run belongs to instead of rebuilding it from
# hard-coded subscription / resource-group identifiers: that avoids the
# interactive authentication prompt on the compute node and keeps
# account-specific IDs out of the script.
ws = run.experiment.workspace

# load the data (passed as an input dataset)
print("Loading Data...")
df = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
run.log('raw_rows', len(df))

# remove nulls
df = df.dropna()

# Scaling of the numeric columns is currently disabled, so the data handed to
# the training step is unscaled.
# NOTE(review): because fit_transform is commented out, the scaler persisted
# below is never fitted; any consumer calling .transform() on it will fail.
scaler = MinMaxScaler()
num_cols = ['Age','Fare']
#df[num_cols] = scaler.fit_transform(df[num_cols])

# Encode the 'Sex' column numerically
df['Sex'] = df['Sex'].replace({'male':1,'female':0})

# Log processed rows
run.log('processed_rows', len(df))

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
df.to_csv(save_path, index=False, header=True)

# Saving Scaler File
os.makedirs('outputs', exist_ok=True)
scaler_file = os.path.join('outputs', 'titanic_scaler.pkl')
joblib.dump(value=scaler, filename=scaler_file)

# Upload everything in ./outputs to 'titanic-data/' on the default datastore.
# overwrite=True so that re-running the pipeline replaces the previous upload.
default_ds = ws.get_default_datastore()
Dataset.File.upload_directory(src_dir='outputs',
                              target=DataPath(default_ds, 'titanic-data/'),
                              overwrite=True
                              )

# End the run
run.complete()

Overwriting titanic_pipeline/prep_titanic.py


In [6]:
%%writefile $experiment_folder/train_titanic.py
# Training step of the titanic pipeline.
#
# Reads <training-data>/data.csv produced by the prep step, trains a logistic
# regression on Age / Sex / Fare, logs accuracy, AUC and a ROC plot to the
# run, then saves and registers the model as 'titanic_model'.

# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training data')
args = parser.parse_args()
training_data = args.training_data

# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_data,'data.csv')
titanic = pd.read_csv(file_path)

# Separate features and labels
X, y = titanic[['Age','Sex','Fare']].values, titanic['Survived'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
print('Training a logistic regression model...')
model = LogisticRegression(random_state = 42).fit(X_train, y_train)

# calculate accuracy
# (builtin float() replaces the np.float alias, which newer NumPy releases removed)
y_hat = model.predict(X_test)
acc = float(np.average(y_hat == y_test))
print('Accuracy:', acc)
run.log('Accuracy', acc)

# calculate AUC from the predicted probability of the positive class
y_scores = model.predict_proba(X_test)
auc = float(roc_auc_score(y_test,y_scores[:,1]))
print('AUC: ' + str(auc))
run.log('AUC', auc)

# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig, ax = plt.subplots(figsize=(6, 4))
# Plot the diagonal 50% line
ax.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()

# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'titanic_model.pkl')
joblib.dump(value=model, filename=model_file)


# Register the model in the workspace the run belongs to
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'titanic_model',
               tags={'Training context':'Pipeline'},
               properties={'AUC': auc, 'Accuracy': acc})


run.complete()

Overwriting titanic_pipeline/train_titanic.py


In [7]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster that runs the pipeline steps
cluster_name = "sweetdreams"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Report the provisioning failure and stop here: later cells need
        # `pipeline_cluster`, and swallowing the error would only surface it
        # afterwards as an unrelated NameError.
        print(ex)
        raise

Found existing cluster, use it.


In [8]:
%%writefile $experiment_folder/experiment_env.yml
# Conda specification for the environment assigned to the pipeline run
# configuration (loaded by the notebook under the name "experiment_env").
name: experiment_env
dependencies:
# NOTE(review): Python 3.6 is end-of-life — confirm the Azure ML packages
# installed below still support this interpreter version.
- python=3.6.2
# NOTE(review): the step scripts import joblib, which is not listed here;
# presumably it arrives as a scikit-learn dependency — verify.
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed with pip rather than conda
- pip:
  - azureml-defaults
  - pyarrow

Overwriting titanic_pipeline/experiment_env.yml


In [9]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Build the environment definition from the conda .yml file in the experiment folder
env_spec_path = experiment_folder + "/experiment_env.yml"
experiment_env = Environment.from_conda_specification("experiment_env", env_spec_path)

# Register the environment in the workspace, then fetch the registered copy
experiment_env.register(workspace=ws)
registered_env = Environment.get(ws, 'experiment_env')

# Run configuration for the pipeline: the compute cluster defined above
# plus the registered environment
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = pipeline_cluster
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


In [10]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep


# Registered dataset that feeds the first step
titanic_ds = ws.datasets.get("titanic dataset")

# Intermediate output (temporary data reference) handed from step 1 to step 2
prepped_data = OutputFileDatasetConfig("prepped_data")

# Settings that are identical for both steps
shared_step_settings = dict(
    source_directory=experiment_folder,
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Step 1: data preparation script, fed the raw dataset as named input 'raw_data'
prep_arguments = ['--input-data', titanic_ds.as_named_input('raw_data'),
                  '--prepped-data', prepped_data]
prep_step = PythonScriptStep(name="Prepare Data",
                             script_name="prep_titanic.py",
                             arguments=prep_arguments,
                             **shared_step_settings)

# Step 2: training script, consuming the output of step 1
train_arguments = ['--training-data', prepped_data.as_input()]
train_step = PythonScriptStep(name="Train and Register Model",
                              script_name="train_titanic.py",
                              arguments=train_arguments,
                              **shared_step_settings)

print("Pipeline steps defined")

Pipeline steps defined


In [11]:

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the prep and train steps into a pipeline
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(steps=pipeline_steps, workspace=ws)
print("Pipeline is built.")

# Submit the pipeline under the 'titanic-pipeline' experiment,
# asking for outputs to be regenerated
experiment = Experiment(name='titanic-pipeline', workspace=ws)
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Show the run widget, then block until the pipeline run finishes
run_details_widget = RunDetails(pipeline_run)
run_details_widget.show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [60ef3f3c][c63f3356-953f-42e7-a381-ff605c8876f4], (This step will run and generate new outputs)
Created step Train and Register Model [af5f8645][8cf96612-cd6c-4e71-9e5c-fcf1d0def609], (This step will run and generate new outputs)
Submitted PipelineRun e772390f-c3db-4b31-a1ee-181dd16cd33e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/e772390f-c3db-4b31-a1ee-181dd16cd33e?wsid=/subscriptions/71fa0172-ce90-403c-94a9-14ce1e88f56a/resourcegroups/rg_eastus_44930_1_1677358905717/workspaces/testerinos&tid=82676786-5bc7-43c6-b8f8-b3ee02b0b5f3
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: e772390f-c3db-4b31-a1ee-181dd16cd33e
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/e772390f-c3db-4b31-a1ee-181dd16cd33e?wsid=/subscriptions/71fa0172-ce90-403c-94a9-14ce1e88f56a/resourcegroups/rg_eastus_44930_1_1677358905717/workspaces/testerinos&tid=82676786-5bc7-43c6-b8f8-b3ee02b0b5f3
PipelineRun Status: Running


StepRunId: 26c3eb41-ba15-49c9-ab2a-a8e544a633d4
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/26c3eb41-ba15-49c9-ab2a-a8e544a633d4?wsid=/subscriptions/71fa0172-ce90-403c-94a9-14ce1e88f56a/resourcegroups/rg_eastus_44930_1_1677358905717/workspaces/testerinos&tid=82676786-5bc7-43c6-b8f8-b3ee02b0b5f3
StepRun( Prepare Data ) Status: Running

Streaming azureml-logs/20_image_build_log.txt
2023/02/25 21:28:50 Downloading source code...
2023/02/25 21:28:51 Finished downloading source code
2023/02/25 21:28:51 Creating Docker network: acb_default_network, driver: 'bridge'
2023/02/25 21:28:52 Successfully set up Docker network: acb_de

In [None]:
# Print every metric logged by each child (step) run of the pipeline
for child_run in pipeline_run.get_children():
    print(child_run.name, ':')
    for metric_name, metric_value in child_run.get_metrics().items():
        print('\t',metric_name, ":", metric_value)

In [None]:
import os

# Folder that holds the files shipped with the web service
deployment_folder = './titanic_service'
os.makedirs(deployment_folder, exist_ok=True)  # safe to re-run
print(deployment_folder, 'folder created.')

# File name and full path of the scoring (entry) script
script_file = 'score_titanic.py'
script_path = os.path.join(deployment_folder, script_file)
     

In [None]:
%%writefile $script_path
# Scoring (entry) script for the titanic web service.
import json
import joblib
import numpy as np
import os
# NOTE(review): these names are no longer used below; the import is retained
# only to leave the script's dependency list untouched.
from azureml.core import Workspace, Dataset, Datastore


# Called when the service is loaded
def init():
    """Load the deployed model once, when the service container starts.

    The model file is read from the directory named by the AZUREML_MODEL_DIR
    environment variable and stored in the module-level ``model``.
    """
    global model
    # Get the path to the deployed model file and load it
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'titanic_model.pkl')
    model = joblib.load(model_path)

    # No scaler is downloaded or applied here. The previous version tried to
    # fetch one from the datastore but referenced an undefined `workspace`,
    # looked for 'scaler.pkl' although the prep script writes
    # 'titanic_scaler.pkl', and then called undefined `load` / `scalar`.
    # The persisted scaler is never fitted and the model is trained on
    # unscaled features, so requests are scored on the raw values.


# Called when a request is received
def run(raw_data):
    """Score a request.

    Args:
        raw_data: JSON document (string) with a 'data' key holding a list of
            feature rows.

    Returns:
        A JSON-encoded list with one class name ('Non-Survived' or
        'Survived') per input row.
    """
    # Get the input data as a numpy array
    data = np.array(json.loads(raw_data)['data'])

    # Get a prediction from the model
    predictions = model.predict(data)

    # Map each predicted label (0 or 1) to its class name
    classnames = ['Non-Survived', 'Survived']
    predicted_classes = [classnames[int(prediction)] for prediction in predictions]

    # Return the predictions as JSON
    return json.dumps(predicted_classes)

In [None]:
# Look up the registered model by name in the workspace
registered_models = ws.models
model = registered_models['titanic_model']
print(model.name, 'version', model.version)

In [None]:
from azureml.core import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core import Model

# Scoring environment: a named environment fetched from the workspace,
# with the inferencing stack version set to "latest"
service_env = Environment.get(name="AzureML-sklearn-0.24.1-ubuntu18.04-py37-cpu-inference", workspace=ws)
service_env.inferencing_stack_version="latest"

# Entry script + environment used by the service
inference_config = InferenceConfig(environment=service_env,
                                   source_directory=deployment_folder,
                                   entry_script=script_file)

# Container sizing for the ACI web service
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

# Deploy the model as a service (replacing any existing service of the same name)
print('Deploying model...')
service_name = "titanic-service"
service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config,
                       overwrite=True)
service.wait_for_deployment(show_output=True)
print(service.state)
     

In [None]:
# List the names of the web services in the workspace, one per line
for deployed_service_name in ws.webservices:
    print(deployed_service_name)

In [None]:
## Invoke Endpoint

import json

# Two sample passengers, each a row of three feature values
x_new = [[22,0,7.25],[54,1,52]]

# Wrap the rows in the JSON document sent to the service
request_payload = {"data": x_new}
input_json = json.dumps(request_payload)

# Call the web service with the JSON document
predictions = service.run(input_data = input_json)

# Decode the response and print it next to the corresponding input row
predicted_classes = json.loads(predictions)

for idx, passenger in enumerate(x_new):
    print ("Passenger {}".format(passenger), predicted_classes[idx] )
     

In [None]:
# Remove the deployed web service once it is no longer needed
service.delete()
print ('Service deleted.')

Research/References:

- https://machinelearningmastery.com/how-to-save-and-load-models-and-data-preparation-in-scikit-learn-for-later-use/