# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [None]:
import logging
import os
import csv
import shutil


from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

#Compute target
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException


#Visualizing run
from azureml.widgets import RunDetails

#library for Saving model
import joblib

#import clean data function defined in train.py script
from train import clean_data 

#ONNX libraries
import sys
import json
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants

# Report which azureml-core SDK version this notebook is running against.
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

Initialize a workspace object from persisted configuration. Make sure the config file is present at .\config.json

In [None]:
# Build the workspace handle from the persisted configuration file.
ws = Workspace.from_config()

# Echo the identifying details of the workspace, one per line.
for workspace_detail in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_detail)

Here we will be creating an experiment named "liver-disease-automl".

In [None]:
# Name under which the AutoML runs of this notebook are grouped.
experiment_name = 'liver-disease-automl'

# Experiment handle bound to the workspace loaded above.
experiment = Experiment(workspace=ws, name=experiment_name)

Create or Attach an AmlCompute cluster
You will need to create a compute target for your AutoML run. In this demo, you get the default AmlCompute as your training compute resource.

In [None]:
# Create the compute cluster used for training, or attach to it if it already exists.
# The cluster is provisioned with STANDARD_DS3_V2 nodes; max_nodes should be no greater than 4.

# Name of the CPU cluster to look up / create.
cpu_cluster_name = "notebook135081"

try:
    # Look the cluster up by name; a ComputeTargetException means it was not found.
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # No such cluster yet: provision a new one.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS3_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait on the compute target (found or newly created), showing progress output.
compute_target.wait_for_completion(show_output=True)

## Dataset

TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.
TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

### Overview
The number of patients with liver disease has been increasing in recent times due to lifestyle and living habits such as excessive alcohol consumption, inhalation of harmful gases, excessive weight gain, intake of contaminated food, and drug abuse. This dataset is aimed at helping doctors during the clinical diagnosis of liver disease, to alleviate the burden and stress involved in analyzing every single patient's information. Therefore, the goal is to create a classifier that predicts whether a subject is healthy (non-liver patient) or ill (liver patient) based on clinical and demographic features: age, gender, total Bilirubin, direct Bilirubin, total proteins, albumin, A/G ratio, SGPT, SGOT and Alkphos.

In [None]:
# Location of the raw liver patient training CSV (external, hosted on GitHub).
data_path = "https://raw.githubusercontent.com/chollette/nd00333-capstone/master/Liver%20Patient%20Dataset%20(LPD)_train.csv"

# Read the CSV straight from the URL into a pandas DataFrame.
dataset = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Clean the raw liver dataset with clean_data (imported from train.py).
# presumably x holds the feature columns and y the label column -- verify against train.py
x, y = clean_data(dataset)

# Join the two returned pieces column-wise into a single training frame.
train_data = pd.concat([x, y], axis=1, sort=False)

# Upload the cleaned liver data to the default datastore (blob) of the workspace.

# First write the frame to a local CSV file. index=False keeps the DataFrame's
# row index out of the file; without it the index is written as an extra
# unnamed column, which the tabular dataset built from this file would expose
# to AutoML as a feature column.
train_data.to_csv('train_data.csv', header=True, index=False)

# Then upload the file to the root of the workspace's default datastore,
# replacing any previous copy.
datastore = ws.get_default_datastore()
datastore.upload_files(['train_data.csv'], target_path='', overwrite=True)

In [None]:
# Name of the target column AutoML should predict.
label = "Result"

# Re-read the uploaded CSV from the datastore as a tabular dataset, the form
# in which the training data is passed to AutoML.
uploaded_csv = (datastore, 'train_data.csv')
train_data = Dataset.Tabular.from_delimited_files(path=[uploaded_csv])

## AutoML Configuration

TODO: Explain why you chose the automl settings and configuration you used below.

In [None]:
# AutoML settings, passed as keyword arguments to AutoMLConfig below.
automl_settings = {
    # Overall time budget for the experiment.
    "experiment_timeout_hours": 1,
    # Allow the experiment to stop early instead of always using the full budget.
    "enable_early_stopping": True,
    "model_explainability": True,
    # Per-iteration time limit.
    "iteration_timeout_minutes": 5,
    # The training cluster is provisioned with max_nodes=4 and an AmlCompute
    # cluster runs one iteration per node, so more than 4 concurrent
    # iterations could only queue.
    "max_concurrent_iterations": 4,
    # -1: use all available cores per iteration (per the AutoMLConfig docs).
    "max_cores_per_iteration": -1,
    "n_cross_validations": 10,
    # Metric the experiment optimises and ranks models by.
    "primary_metric": 'accuracy',
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

# AutoML configuration: a classification task on the remote compute cluster,
# trained on the tabular dataset with `label` as the target column.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=compute_target,
    training_data=train_data,
    label_column_name=label,
    debug_log="automl_errors.log",
    **automl_settings,
)

In [None]:
# Submit the AutoML configuration to the experiment; the returned run handle
# is used by the following cells to monitor and query the run.
remote_run = experiment.submit(config=automl_config)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [None]:
# Show the RunDetails widget for the submitted AutoML run.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()


In [None]:
# Wait for the submitted AutoML run to complete (called with default arguments)
# before querying its results in the cells below.
remote_run.wait_for_completion()

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [None]:
# Fetch the metrics recorded on the submitted AutoML run and print them.
# NOTE(review): these are read from `remote_run` itself, not from the
# `best_run` returned by remote_run.get_output() -- confirm they are the
# best model's metrics.
best_automl_run_metrics = remote_run.get_metrics()
print(best_automl_run_metrics)

In [None]:
# Pull the 'accuracy' entry out of the metrics fetched above and report it.
best_accuracy = best_automl_run_metrics['accuracy']
print("Best AutoML model Accuracy: ", best_accuracy)

In [None]:
#TODO: Save the best model
best_run, fitted_model = remote_run.get_output()
print(best_run)

In [None]:
# Serialize the fitted AutoML model to a local joblib file.
joblib.dump(value=fitted_model, filename='automl-votingEnsemble_model.joblib')

In [None]:
# Delete the compute cluster that was created/attached for training,
# now that the run has completed and the model has been saved.
compute_target.delete()