# Titanic Survival Prediction Project using Azure AutoML

In [None]:
#Import libraries needed
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from azureml.data.dataset_factory import TabularDatasetFactory
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Print the installed Azure ML core SDK version (handy when reproducing or debugging this notebook)
print("SDK version:", azureml.core.VERSION)

In [None]:
# Smoke-test authentication by loading the workspace via Workspace.from_config().
# The returned object is discarded here; the next cell loads it again and keeps it.
Workspace.from_config()

In [None]:
# Load the existing workspace from the local config file and keep a handle to it.
# (Nothing is created here; every later cell uses `ws`.)
ws = Workspace.from_config()

In [None]:
# Set up the experiment object that the AutoML run is submitted to

# Name of the experiment inside the workspace
experiment_name = 'Titanic-AutoML-Experiment'
# Folder that is later passed to AutoMLConfig as `path`
project_folder = './Titanic-project'

experiment = Experiment(ws, experiment_name)
# Bare expression so the notebook displays the experiment
experiment

## Dataset Overview:

As you have probably guessed from the project title, we will be working with the "Titanic dataset", a classic dataset for learning machine learning.

The main task of this project is to build a predictive model that answers the question: “what sorts of people were more likely to survive?” To answer this question, we give the model different input variables such as age, the type of cabin the passenger had, etc.

In [None]:
# Name and description under which the dataset is registered in the workspace
# (it then shows up under Datasets -> Registered datasets).
key = "titanic_dataset"
description_text = "Titanic Dataset for Model Deployment"

# Read the remote CSV as a TabularDataset, register it, and finally
# materialise the registered dataset as a pandas DataFrame.
dataset = (
    TabularDatasetFactory
    .from_delimited_files('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    .register(workspace=ws, name=key, description=description_text)
    .to_pandas_dataframe()
)

# Summary statistics of the numeric columns
dataset.describe()

In [None]:
### Make sure the compute target is set up
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse or create
amlcompute_cluster_name = "compute-cluster"

try:
    # Look the cluster up by name; a ComputeTargetException is treated as
    # "cluster does not exist yet".
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    # No such cluster: provision a new one with 1 to 6 nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',  # for GPU, use "STANDARD_NC6"
        # vm_priority='lowpriority',  # optional
        min_nodes=1,
        max_nodes=6,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait for the cluster (existing or newly created) and show its status output
compute_target.wait_for_completion(show_output=True)


### Clean Data:

In [None]:
#Import your clean data function from the train.py file
from train import clean_data

In [None]:
# Apply the cleaning function from train.py to the raw DataFrame.
# It returns two objects: `x` (a DataFrame, judging by the later use of
# x.columns) and `y` (presumably the 'survived' target; verify in train.py).
x, y = clean_data(dataset)

In [None]:
#Scale the features
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature column with a scaler fitted on the full feature set.
# `variables` keeps the column names so the next cell can rebuild a DataFrame
# from the transformed values.
variables = x.columns.tolist()

scaler = StandardScaler()
x = scaler.fit(x[variables]).transform(x[variables])

In [None]:
# Wrap the scaled values back into a DataFrame with the original column names.
# Reusing y's index keeps features and labels aligned by label, so the later
# pd.concat([x_train, y_train], axis=1) cannot misalign rows (and introduce
# NaNs) when the cleaned data does not carry a default 0..n-1 index.
x = pd.DataFrame(x, columns=variables, index=y.index)
x

In [None]:
# Split the data: 30% is held out as test data. A fixed random_state makes
# the split, and therefore the exported training CSV and every downstream
# result, reproducible across notebook runs.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [None]:
# Bring the training features and training labels back together column-wise.
# Note: this rebinds `dataset`, replacing the raw DataFrame loaded earlier.
dataset = pd.concat([x_train,y_train],axis=1)

In [None]:
# Preview the first 10 rows of the combined training DataFrame
dataset.head(10)

In [None]:
# To train the model we need a TabularDataset and not a DataFrame, so the
# current DataFrame is first written to a CSV file and converted further below.

# Local file name for the prepared training data
local_path = 'prepared.csv'

# Save it locally (index=None is intended to leave the row index out of the file)
dataset.to_csv(local_path,index=None)

# Get the workspace's default datastore
datastore = ws.get_default_datastore()

In [None]:
# Upload only the prepared CSV into the 'data' folder of the datastore.
# Uploading the whole working directory (src_dir='.') would also push
# unrelated files such as notebooks, logs or config.json, and without
# overwrite=True a stale prepared.csv from an earlier run would be kept.
datastore.upload_files(files=[local_path], target_path='data', overwrite=True)

In [None]:
# Sanity check: display what datastore.path() returns for the default datastore
datastore.path()

In [None]:
# Build the TabularDataset from the CSV that was uploaded to the default
# datastore (under 'data/'), so AutoML trains on the data prepared in this
# notebook rather than on a fixed copy hosted elsewhere.
training_dataset = Dataset.Tabular.from_delimited_files(path=[(datastore, 'data/prepared.csv')])

In [None]:
# Preview the first rows of the training dataset as a pandas DataFrame
training_dataset.to_pandas_dataframe().head()

## AutoML Configuration

Below we choose the AutoML settings and configuration.

In [None]:
# Settings passed through to AutoMLConfig as keyword arguments:
# time budget, parallelism and the metric to optimise.
automl_settings = dict(
    experiment_timeout_minutes=20,
    max_concurrent_iterations=5,
    primary_metric='accuracy',
)

# Classification task on the prepared training data, predicting 'survived'
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target,
    training_data=training_dataset,
    label_column_name="survived",
    path=project_folder,
    featurization='auto',
    enable_early_stopping=True,
    debug_log="automl_errors.log",
    **automl_settings,
)

In [None]:
# Submit the AutoML configuration to the experiment (show_output=True prints progress in the notebook)
automl_run = experiment.submit(automl_config,show_output=True)

In [None]:
# Show the run-details widget for the AutoML run
from azureml.widgets import RunDetails

RunDetails(automl_run).show()

# Block until the run has completed
automl_run.wait_for_completion()

In [None]:
# Display the raw result of get_output(); the next cell unpacks it into the
# best run and its fitted model
automl_run.get_output()

In [None]:
# Unpack the best run and its fitted model from the AutoML run
best_automl_run, best_model = automl_run.get_output()


# Show the best run and the fitted model
print('Best AutoML run: ', best_automl_run)
print('Best AutoML model :', best_model)

# Read the model name from the best run's properties and display it
model_name = best_automl_run.properties['model_name']
print('Best_model name: ', model_name)

# Display all the properties of the best run
best_automl_run.get_properties()

In [None]:
# Print the id of the best AutoML run
print(best_automl_run.id)

# Save the best AutoML model

In [None]:
import joblib

# Serialise the best model to a local file; this is the file that is
# registered as the model further below
joblib.dump(best_model, 'best_automl_model.pkl')

# Azure ML Studio

I will be using the Azure ML SDK for Python for deployment.
Ref: https://docs.microsoft.com/en-us/python/api/overview/azure/ml/?view=azure-ml-py

# Install Required Libraries

In [4]:
#pip install azureml , azureml-core

Note: you may need to restart the kernel to use updated packages.


# Import the libraries

In [None]:
# #Prepare deploying of the model as a web service
# from azureml.core import Workspace
# from azureml.core import Workspace
# from azureml.core.model import Model

from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
#from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Create Workspace

In [6]:
# In my case the workspace has already been created at the top of the notebook.
# Otherwise, create it using the command below.

# ws = Workspace.create(
#                name='myworkspace',            
#                subscription_id='<azure-subscription-id>',           
#                resource_group='myresourcegroup',                 
#                create_resource_group=True,                 
#                location='eastus2'                
#                )

#Once it is created, save the details of the workspace in a config.json 
#file to use the workspace later.
# ws.write_config()

# Create a folder named ‘.azureml’ and save the json file inside it. 
# Make sure you name the folder and json file correctly.
# Python will look for the config.json file inside this folder 
# while loading up your workspace

# You won’t need to create a workspace every time, in the future, 
# you can use the following command to load the workspace
# #ws = Workspace.from_config()

# Register the Model
1. workspace: The workspace object we created
2. model_path: the path to the pickle file
3. model_name: Name of the model on Azure MLS
4. tags: Although not necessary, you can add tags to your model
5. description: A description of your model

In [None]:
# Register the pickled best AutoML model in the workspace for later deployment
from azureml.core.model import Model

description = 'AutoML Model trained on the titanic dataset'
tags = {'area': 'data science beginners', 'type': 'classification'}

automl_model = Model.register(
    workspace=ws,
    model_path='best_automl_model.pkl',
    model_name='best-titanicMLmodel',
    tags=tags,
    description=description,
)

# Record which AutoML run the registered model came from
print('AutoML RunID: ', automl_run.id, sep='\t')

# Create an Environment
The environment was already created; otherwise, create it using the command below.

In [9]:
## to install required packages
# env = Environment('env')
# cd = CondaDependencies.create(pip_packages=['pandas==1.1.5', 'azureml-defaults','joblib==0.17.0'], conda_packages = ['scikit-learn==0.23.2'])
# env.python.conda_dependencies = cd
## Register environment to re-use later
# env.register(workspace = ws)
# print("Registered Environment")

In [None]:
# Fetch the environment named 'AzureML-AutoML' from the workspace
# (either the one just created above or one that already existed)
env = Environment.get(workspace=ws, name='AzureML-AutoML')

In [8]:
#Confirm that all the required libraries have been installed by creating a .yml 
#based on the created environment. Ensure all the libraries are mentioned inside the .yml file

In [None]:
# Save the environment definition into a folder called 'environ'
# (a .yml and a .json file); overwrite=True replaces files from an earlier run
env.save_to_directory('./environ', overwrite=True)

In [None]:
# Check the environment's dependencies by printing them in serialised form
print("packages", env.python.conda_dependencies.serialize_to_string())

# Config Objects
Create a container instance and set the number of cpu_cores and memory_gb based on your requirements.

In [None]:
# ACI deployment settings: 1 CPU core and 4 GB of memory, with authentication
# and Application Insights switched on
aci_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=4,
    auth_enabled=True,
    enable_app_insights=True,
    description='titanic classification Model',
    tags={"data": "titanic classifier"},
)

In [None]:
# Create an InferenceConfig instance to link the environment and the entry script.
inference_config = InferenceConfig(entry_script='score.py', environment=env)

# In my case the entry script (score.py) already exists at that path;
# otherwise you can create it along the lines of the sketch below.
# The entry script needs two functions, an init function and a run function.
#     Create a global variable called model
#     Load the model using the name of the model you registered earlier
#     Load the model from the path
# def init():
#     global model
#     model_path = Model.get_model_path("best-titanicMLmodel")
#     print("Model Path is  ", model_path)
#     model = joblib.load(model_path)

# # Save this file.
# NOTE(review): run() presumably receives the raw request body, so `data` may
# need json.loads() before indexing - confirm against the real score.py.
# def run(data):
#     try:
#        print(data)
#        result = model.predict(data['data']) 
#        return {'data' : result.tolist() , 'message' : "Successfully  classified Titanic"}
#     except Exception as e:
#            error = str(e)
#            return {'data' : error , 'message' : 'Failed to classify Titanic'}

# Deploy the Model

In [None]:
# Deploy the model by combining the config objects, the workspace and the model.
service_name = 'my-ml-service'

# Look up the registered model by name
model = Model(ws,name='best-titanicMLmodel')
service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=aci_config,
                       overwrite=True)

# Block until the deployment has finished, printing its output along the way
service.wait_for_deployment(show_output=True)

In [None]:
# Fetch the deployment logs. If the deployment above did not succeed,
# these logs are the place to look for the cause.
service.get_logs()

In [None]:
# print service state
print(service.state)
# print scoring URI
print('scoring URI: ' + service.scoring_uri)
# print Swagger URI
print('Swagger URI: ' + service.swagger_uri)
# retrieve authentication keys (returned as a primary/secondary pair)
primary, secondary = service.get_keys()
# print primary authentication key
# NOTE(review): this writes a credential into the notebook output - clear the
# cell output before sharing or committing the notebook.
print('Primary Authentication Key: ' + primary)

In [None]:
# Take the scoring URI and the authentication key straight from the deployed
# service instead of hard-coding them: a literal URI goes stale on every
# redeployment, and a literal key is a credential committed to source control.
# (Any key that was ever pasted into this file should be regenerated.)
scoring_uri = service.scoring_uri

# get_keys() returns the (primary, secondary) pair; the primary key is used
# as the bearer token for requests to the endpoint.
key = service.get_keys()[0]

# Consume the Endpoint

In [None]:
# Send two sample passengers to the deployed endpoint and print the predictions.
# `scoring_uri` and `key` come from the previous cell.
import json
import requests

# JSON payload, authenticated with the service key as a bearer token
headers = {'Content-Type':'application/json'}
headers['Authorization'] = f'Bearer {key}'


# Sample 1: feature values are already standardised, i.e. on the same scale
# as the training data produced by the StandardScaler earlier in the notebook.
test_data = json.dumps({'data':[{
    'pclass': 0.8419164182590155,
    'age': -0.34907541344456255,
    'sibsp': -0.47908676070718687,
    'parch': -0.444999501816175,
    'fare': -0.4902404567566683,
    'age_NA': -0.5014319838391105,
    'fare_NA': -0.027650063180466557,
    'sex_male': 0.743496915331831,
    'cabin_Missing': 0.5393765119990418,
    'cabin_Rare': -0.42592011250734235,
    'embarked_Q': -0.32204029159373954,
    'embarked_Rare': -0.03911805059269843,
    'embarked_S': 0.6573935670276714,
    'title_Mr': 0.8525918887485938,
    'title_Mrs': -0.42592011250734235,
    'title_Rare': -0.27494677157229536
    }
    ]
        })

# Sample 2: seven of these values had lost their decimal point
# (e.g. -15460978645168200 instead of -1.54...), which put them far outside
# the range of standardised features; the decimal points are restored here.
test_data2 = json.dumps({'data':[{
    'pclass': -1.5460978645168200,
    'age': 0.8912042887450313,
    'sibsp': -0.47908676070718687,
    'parch': -0.444999501816175,
    'fare': 1.9569900306355100,
    'age_NA': -0.5014319838391105,
    'fare_NA': -0.027650063180466557,
    'sex_male': -1.3449954927569300,
    'cabin_Missing': -1.8539924853119600,
    'cabin_Rare': 2.3478581326275300,
    'embarked_Q': -0.32204029159373954,
    'embarked_Rare': -0.03911805059269843,
    'embarked_S': -1.5211587854766800,
    'title_Mr': -1.1728941046668400,
    'title_Mrs': -0.42592011250734235,
    'title_Rare': -0.27494677157229536

    }
    ]
        })


response1 = requests.post(scoring_uri, data=test_data, headers=headers)
response2 = requests.post(scoring_uri, data=test_data2, headers=headers)

print("Classification Prediction:",response1.text)
print("Classification Prediction:",response2.text)

In [None]:
# Serialise the environment's conda dependencies once, store them in env.yml
# and display them.
conda_yaml = env.python.conda_dependencies.serialize_to_string()

# The context manager closes the file even if the write fails
with open("env.yml", "w", encoding="utf-8") as f:
    f.write(conda_yaml)

print("packages", conda_yaml)

In [None]:
# Clean up: delete the deployed web service
service.delete()