# AutoML Classification experiment using Remote-AML-Compute and AML-Datasets
## Data: IBM Employee Attrition dataset loaded from Azure ML Dataset

##  Get Azure ML Workspace to use

In [1]:
# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset

# Get Workspace defined in by default config.json file
ws = Workspace.from_config()

## Load data from Azure ML Datasets 
Pandas DataFrame only used to check out the data

In [2]:
# Load Data
aml_dataset = ws.datasets['IBM-Employee-Attrition']

# Use Pandas DataFrame just to sneak peak some data and schema
full_df = aml_dataset.to_pandas_dataframe()
# .to_pandas_dataframe().dropna()
full_df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Use Pandas DataFrame just to investigate the dataset's schema and info
full_df.describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,0.161224,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,0.367863,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,0.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,0.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,0.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


## Clean up the initial dataset 
#### (Using AML Tabular Dataset .drop_columns() method )

In [4]:
# Dropping Employee count as all values are 1 and hence attrition is independent of this feature
# Dropping Employee Number since it is merely an identifier
# Dropping Over18 since all employees are over 18
# Dropping StandardHours Since all values are 80
  
aml_dataset = aml_dataset.drop_columns(['EmployeeCount','EmployeeNumber','Over18','StandardHours'])

full_df = aml_dataset.to_pandas_dataframe()
full_df.describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,0.161224,802.485714,9.192517,2.912925,2.721769,65.891156,2.729932,2.063946,2.728571,...,3.153741,2.712245,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,0.367863,403.5091,8.106864,1.024165,1.093082,20.329428,0.711561,1.10694,1.102846,...,0.360824,1.081209,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,0.0,102.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,...,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,465.0,2.0,2.0,2.0,48.0,2.0,1.0,2.0,...,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,0.0,802.0,7.0,3.0,3.0,66.0,3.0,2.0,3.0,...,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,0.0,1157.0,14.0,4.0,4.0,83.75,3.0,3.0,4.0,...,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1.0,1499.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,...,4.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


## Split original AML Tabular Dataset in two test/train AML Tabular Datasets (using AML DS function)

In [5]:
# Split using Azure Tabular Datasets (Better for Remote Compute)
# https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py#random-split-percentage--seed-none-

train_dataset, test_dataset = aml_dataset.random_split(0.9, seed=1)

# Use Pandas DF only to check the data
train_dataset_df = train_dataset.to_pandas_dataframe()
test_dataset_df = test_dataset.to_pandas_dataframe()

print(train_dataset_df.describe())
# print(test_dataset_df.describe())

               Age    Attrition    DailyRate  DistanceFromHome    Education  \
count  1318.000000  1318.000000  1318.000000       1318.000000  1318.000000   
mean     36.967375     0.162367   800.254173          9.224583     2.890744   
std       9.228459     0.368927   405.399515          8.162060     1.032600   
min      18.000000     0.000000   102.000000          1.000000     1.000000   
25%      30.000000     0.000000   459.500000          2.000000     2.000000   
50%      36.000000     0.000000   798.500000          7.000000     3.000000   
75%      43.000000     0.000000  1157.000000         14.000000     4.000000   
max      60.000000     1.000000  1499.000000         29.000000     5.000000   

       EnvironmentSatisfaction   HourlyRate  JobInvolvement     JobLevel  \
count              1318.000000  1318.000000     1318.000000  1318.000000   
mean                  2.724583    65.786798        2.722307     2.081184   
std                   1.094274    20.496071        0.715574 

## List remote AML compute targets available

In [6]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget

ComputeTarget.list(ws)

[AmlCompute(workspace=Workspace.create(name='cesardl-automl-northcentralus-ws', subscription_id='102a16c3-37d3-48a8-9237-4c9b1e8e80e0', resource_group='automlpmdemo'), name=cesardl-cpu-clus, id=/subscriptions/102a16c3-37d3-48a8-9237-4c9b1e8e80e0/resourceGroups/automlpmdemo/providers/Microsoft.MachineLearningServices/workspaces/cesardl-automl-northcentralus-ws/computes/cesardl-cpu-clus, type=AmlCompute, provisioning_state=Succeeded, location=northcentralus, tags=None),
 AmlCompute(workspace=Workspace.create(name='cesardl-automl-northcentralus-ws', subscription_id='102a16c3-37d3-48a8-9237-4c9b1e8e80e0', resource_group='automlpmdemo'), name=cesardl-gpu-clus, id=/subscriptions/102a16c3-37d3-48a8-9237-4c9b1e8e80e0/resourceGroups/automlpmdemo/providers/Microsoft.MachineLearningServices/workspaces/cesardl-automl-northcentralus-ws/computes/cesardl-gpu-clus, type=AmlCompute, provisioning_state=Succeeded, location=northcentralus, tags=None),
 AmlCompute(workspace=Workspace.create(name='cesardl-a

## Connect to Remote AML Compute (Existing AML cluster)

In [7]:
# Define remote compute target to use
# Further docs on Remote Compute Target: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-remote

# Choose a name for your cluster.
amlcompute_cluster_name = "cpu-cluster"

found = False
# Check if this compute target already exists in the workspace.
cts = ws.compute_targets

if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':
     found = True
     print('Found existing training cluster.')
     # Get existing cluster
     # Method 1:
     aml_remote_compute = cts[amlcompute_cluster_name]
     # Method 2:
     # aml_remote_compute = ComputeTarget(ws, amlcompute_cluster_name)
    
if not found:
     print('Creating a new training cluster...')
     provisioning_config = AmlCompute.provisioning_configuration(vm_size = "STANDARD_D13_V2", # for GPU, use "STANDARD_NC12"
                                                                 #vm_priority = 'lowpriority', # optional
                                                                 max_nodes = 20)
     # Create the cluster.
     aml_remote_compute = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)
    
print('Checking cluster status...')
# Can poll for a minimum number of nodes and for a specific timeout.
# If no min_node_count is provided, it will use the scale settings for the cluster.
aml_remote_compute.wait_for_completion(show_output = True, min_node_count = 0, timeout_in_minutes = 20)
    


Found existing training cluster.
Checking cluster status...
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [8]:
# For additional details of current AmlCompute status:
aml_remote_compute.get_status()

<azureml.core.compute.amlcompute.AmlComputeStatus at 0x7fe8247759e8>

## List and select primary metric to drive the AutoML classification problem

In [9]:
from azureml.train import automl

# List of possible primary metrics is here:
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#primary-metric
    
# Get a list of valid metrics for your given task
automl.utilities.get_primary_metrics('classification')

# I'll use 'accuracy' as primary metric (Closer to 1.00 is better)

['accuracy',
 'precision_score_weighted',
 'norm_macro_recall',
 'average_precision_score_weighted',
 'AUC_weighted']

## Define AutoML Experiment settings (With AML Remote Compute)

In [10]:
import logging

# You can provide additional settings as a **kwargs parameter for the AutoMLConfig object
# automl_settings = {
#     "whitelist_models": 'XGBoostClassifier'
# }

from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(compute_target=aml_remote_compute,
                             task='classification',
                             primary_metric='accuracy',
                             experiment_timeout_minutes= 25,                            
                             training_data=train_dataset,
                             label_column_name="Attrition",
                             # X=x_train.values,             # X parameter is deprecated 
                             # y=y_train.values.flatten(),   # y parameter is deprecated 
                             n_cross_validations= 5,
                             # blacklist_models='XGBoostClassifier', 
                             # iteration_timeout_minutes= 5,                                                    
                             enable_early_stopping= True,
                             featurization= 'auto',
                             debug_log='automated_ml_errors.log',
                             verbosity= logging.INFO,
                             # **automl_settings
                             )

# WARNING: If using X and y parameters (deprecated) you get the following warning
# WARNING - The AutoMLConfig inputs you have specified will soon be deprecated. Please use the AutoMLConfig shown in our documentation: https://aka.ms/AutoMLConfig

# PaperCut?: Why is drop_column_names only supported by Time Series Forecast? - If used for classification, you get:
# drop_column_names= ['EmployeeCount','EmployeeNumber','Over18','StandardHours'], # Clean up dataset by dropping not needed columns
# WARNING - Received unrecognized parameter: drop_column_names ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
# In documentation it doesn't state that it is only supported for Forecast...:
# https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py

# Explanation of Settings: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#configure-your-experiment-settings

# AutoMLConfig info on: 
# https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig


## Run Experiment (on AML Remote Compute) with multiple child runs under the covers

In [None]:
from azureml.core import Experiment
from datetime import datetime

now = datetime.now()
time_string = now.strftime("%m-%d-%Y-%H")
#time_string = now.strftime("%m-%d-%Y-%H-%M")
print(time_string)
experiment_name = "classif-automl-remote-{0}".format(time_string)
print(experiment_name)

experiment = Experiment(workspace=ws, 
                        name=experiment_name)
import time
start_time = time.time()
            
run = experiment.submit(automl_config, show_output=True)

print('Manual run timing: --- %s seconds needed for running the whole Remote AutoML Experiment ---' % (time.time() - start_time))

# (Issue/Bug) Why is 'DATA GUARDRAILS SUMMARY' not shown when training in remote AML Compute???


01-23-2020-18
classif-automl-remote-01-23-2020-18
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_d09f154d-402f-4d2a-8d28-66d8004119b0

Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION      METRIC      BEST
         0   MaxAbsScaler SGD                               0:01:52       0.8399    0.8399
         1   MaxAbsScaler SGD                               0:01:37       0.7906    0.8399
         2   MaxAbsScaler ExtremeRandomTrees              

## Explore results with Widget

In [None]:
# Explore the results of automatic training with a Jupyter widget: https://docs.microsoft.com/en-us/python/api/azureml-widgets/azureml.widgets?view=azure-ml-py
from azureml.widgets import RunDetails

RunDetails(run).show()

### Measure Parent Run Time needed for the whole AutoML process 

In [62]:
import time
import datetime as dt

run_details = run.get_details()

# Like: 2020-01-12T23:11:56.292703Z
end_time_utc_str = run_details['endTimeUtc'].split(".")[0]
start_time_utc_str = run_details['startTimeUtc'].split(".")[0]
timestamp_end = time.mktime(datetime.strptime(end_time_utc_str, "%Y-%m-%dT%H:%M:%S").timetuple())
timestamp_start = time.mktime(datetime.strptime(start_time_utc_str, "%Y-%m-%dT%H:%M:%S").timetuple())

parent_run_time = timestamp_end - timestamp_start
print('Run Timing: --- %s seconds needed for running the whole Remote AutoML Experiment ---' % (parent_run_time))

Run Timing: --- 4031.0 seconds needed for running the whole Remote AutoML Experiment ---


## Retrieve the 'Best Model' (Scikit-Learn model)

In [63]:
best_run, fitted_model = run.get_output()
print(best_run)
print(fitted_model)

Run(Experiment: classif-automl-remote-01-12-2020-22,
Id: AutoML_5b795b31-d600-41d3-8c68-e67b83113772_34,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
        feature_sweeping_config=None, feature_sweeping_timeout=None,
        featurization_config=None, is_cross_validation=None,
        is_onnx_compatible=None, logger=None, observer=None, task=None)), ('pref...666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667]))])


## Make Predictions

### Extract X values (feature columns) from test dataset and convert to NumPi array for predicting 

In [65]:
import pandas as pd

#Remove Label/y column
if 'Attrition' in test_dataset_df.columns:
    y_test_df = test_dataset_df.pop('Attrition')

x_test_df = test_dataset_df

### Make the actual Predictions

In [66]:
# Try the best model
y_predictions = fitted_model.predict(x_test_df)

print('10 predictions: ')
print(y_predictions[:10])

10 predictions: 
[0 0 0 0 0 0 1 0 1 0]


In [67]:
y_predictions.shape

(152,)

### Calculate the Accuracy with Test Dataset (Not used for training)

In [68]:
from sklearn.metrics import accuracy_score

print('Accuracy:')
accuracy_score(y_test_df, y_predictions)

Accuracy:


0.8881578947368421