# AutoML Classification experiment using Remote-AML-Compute and AML-Datasets
## Data: IBM Employee Attrition dataset loaded from Azure ML Dataset

##  Get Azure ML Workspace to use

In [24]:
# Requires azureml-core version 1.0.72 or higher
from azureml.core import Workspace, Dataset

# Get the Workspace described by the default config.json file
ws = Workspace.from_config()

## Load data from Azure ML Datasets 
The pandas DataFrame is only used to inspect the data.

In [25]:
# Load the 'IBM-Employee-Attrition' dataset from the workspace's datasets
aml_dataset = ws.datasets['IBM-Employee-Attrition']

# Use a pandas DataFrame just to take a sneak peek at some data and the schema
full_df = aml_dataset.to_pandas_dataframe()
# NOTE: chain .dropna() onto the call above if rows with missing values should be discarded
full_df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns;
# the pandas DataFrame is used here only to inspect the dataset
full_df.describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92,0.16,802.49,9.19,2.91,1.0,1024.87,2.72,65.89,2.73,...,2.71,80.0,0.79,11.28,2.8,2.76,7.01,4.23,2.19,4.12
std,9.14,0.37,403.51,8.11,1.02,0.0,602.02,1.09,20.33,0.71,...,1.08,0.0,0.85,7.78,1.29,0.71,6.13,3.62,3.22,3.57
min,18.0,0.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,0.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,0.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


## Clean up the initial dataset 
#### (Using AML Tabular Dataset .drop_columns() method )

In [27]:
# Columns removed because they carry no signal for predicting attrition:
#   EmployeeCount  - every value is 1
#   EmployeeNumber - merely an identifier
#   Over18         - all employees are over 18
#   StandardHours  - every value is 80
uninformative_columns = [
    'EmployeeCount',
    'EmployeeNumber',
    'Over18',
    'StandardHours',
]
aml_dataset = aml_dataset.drop_columns(uninformative_columns)

# Refresh the pandas copy so the summary reflects the reduced column set
full_df = aml_dataset.to_pandas_dataframe()
full_df.describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92,0.16,802.49,9.19,2.91,2.72,65.89,2.73,2.06,2.73,...,3.15,2.71,0.79,11.28,2.8,2.76,7.01,4.23,2.19,4.12
std,9.14,0.37,403.51,8.11,1.02,1.09,20.33,0.71,1.11,1.1,...,0.36,1.08,0.85,7.78,1.29,0.71,6.13,3.62,3.22,3.57
min,18.0,0.0,102.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,...,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,465.0,2.0,2.0,2.0,48.0,2.0,1.0,2.0,...,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,0.0,802.0,7.0,3.0,3.0,66.0,3.0,2.0,3.0,...,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,0.0,1157.0,14.0,4.0,4.0,83.75,3.0,3.0,4.0,...,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1.0,1499.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,...,4.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


## Split the original AML Tabular Dataset into two train/test AML Tabular Datasets (using the AML Dataset function)

In [28]:
# Split with the Tabular Dataset API itself (better for remote compute):
# https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py#random-split-percentage--seed-none-
# Roughly 90% of the rows go to the first (train) dataset, the rest to the second (test);
# the explicit seed keeps the split repeatable.
train_dataset, test_dataset = aml_dataset.random_split(percentage=0.9, seed=1)

# pandas copies exist only so the two splits can be inspected locally
train_dataset_df = train_dataset.to_pandas_dataframe()
test_dataset_df = test_dataset.to_pandas_dataframe()

train_summary = train_dataset_df.describe()
print(train_summary)
# print(test_dataset_df.describe())

          Age  Attrition  DailyRate  DistanceFromHome  Education  \
count 1318.00    1318.00    1318.00           1318.00    1318.00   
mean    36.97       0.16     800.25              9.22       2.89   
std      9.23       0.37     405.40              8.16       1.03   
min     18.00       0.00     102.00              1.00       1.00   
25%     30.00       0.00     459.50              2.00       2.00   
50%     36.00       0.00     798.50              7.00       3.00   
75%     43.00       0.00    1157.00             14.00       4.00   
max     60.00       1.00    1499.00             29.00       5.00   

       EnvironmentSatisfaction  HourlyRate  JobInvolvement  JobLevel  \
count                  1318.00     1318.00         1318.00   1318.00   
mean                      2.72       65.79            2.72      2.08   
std                       1.09       20.50            0.72      1.12   
min                       1.00       30.00            1.00      1.00   
25%                       2

## List remote AML compute targets available

In [29]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget

# List the compute targets available in the workspace
# NOTE(review): AmlCompute is imported but not referenced in the visible cells - confirm it is still needed
ComputeTarget.list(ws)

[AmlCompute(workspace=Workspace.create(name='cesardl-automl-northcentralus-ws', subscription_id='102a16c3-37d3-48a8-9237-4c9b1e8e80e0', resource_group='automlpmdemo'), name=cesardl-cpu-clus, id=/subscriptions/102a16c3-37d3-48a8-9237-4c9b1e8e80e0/resourceGroups/automlpmdemo/providers/Microsoft.MachineLearningServices/workspaces/cesardl-automl-northcentralus-ws/computes/cesardl-cpu-clus, type=AmlCompute, provisioning_state=Succeeded, location=northcentralus, tags=None),
 AmlCompute(workspace=Workspace.create(name='cesardl-automl-northcentralus-ws', subscription_id='102a16c3-37d3-48a8-9237-4c9b1e8e80e0', resource_group='automlpmdemo'), name=cesardl-gpu-clus, id=/subscriptions/102a16c3-37d3-48a8-9237-4c9b1e8e80e0/resourceGroups/automlpmdemo/providers/Microsoft.MachineLearningServices/workspaces/cesardl-automl-northcentralus-ws/computes/cesardl-gpu-clus, type=AmlCompute, provisioning_state=Succeeded, location=northcentralus, tags=None)]

## Connect to Remote AML Compute (Existing AML cluster)

In [30]:
# Name of the remote compute target to use (one of the clusters listed above)
# Further docs on Remote Compute Target: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-remote
compute_target_name = 'cesardl-cpu-clus'
# Get a handle to the existing compute target with that name in the workspace
aml_remote_compute = ComputeTarget(ws, compute_target_name)

## List and select primary metric to drive the AutoML classification problem

In [31]:
from azureml.train import automl

# The list of possible primary metrics is documented here:
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#primary-metric

# Get the list of valid primary metrics for the given task type
automl.utilities.get_primary_metrics('classification')

# 'accuracy' is used as the primary metric in this notebook (closer to 1.00 is better)

['accuracy',
 'average_precision_score_weighted',
 'precision_score_weighted',
 'AUC_weighted',
 'norm_macro_recall']

## Define AutoML Experiment settings (With AML Remote Compute)

In [32]:
import logging

# Additional settings can be passed to the AutoMLConfig object as **kwargs, for example:
# automl_settings = {
#     "whitelist_models": 'XGBoostClassifier'
# }

from azureml.train.automl import AutoMLConfig

# AutoML classification configuration: trains on the remote compute target using the
# train split, with "Attrition" as the label column and 'accuracy' as the primary metric.
automl_config = AutoMLConfig(compute_target=aml_remote_compute,
                             task='classification',
                             primary_metric='accuracy',
                             # experiment_timeout_minutes= 20,   # optional setting, not used in this run
                             training_data=train_dataset,
                             label_column_name="Attrition",
                             # X=x_train.values,             # X parameter is deprecated
                             # y=y_train.values.flatten(),   # y parameter is deprecated
                             n_cross_validations= 5,
                             # blacklist_models='XGBoostClassifier',   # optional setting, not used in this run
                             # iteration_timeout_minutes= 5,           # optional setting, not used in this run
                             enable_early_stopping= True,
                             featurization= 'auto',
                             debug_log='automated_ml_errors.log',
                             verbosity= logging.INFO,
                             # **automl_settings
                             )

# WARNING: If the (deprecated) X and y parameters are used, the following warning is emitted:
# WARNING - The AutoMLConfig inputs you have specified will soon be deprecated. Please use the AutoMLConfig shown in our documentation: https://aka.ms/AutoMLConfig

# PaperCut?: Why is drop_column_names only supported by Time Series Forecast? - If used for classification, you get:
# drop_column_names= ['EmployeeCount','EmployeeNumber','Over18','StandardHours'], # Clean up dataset by dropping not needed columns
# WARNING - Received unrecognized parameter: drop_column_names ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
# The documentation does not state that it is only supported for Forecast:
# https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py

# Explanation of the settings: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#configure-your-experiment-settings

# AutoMLConfig reference:
# https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig


## Run Experiment (on AML Remote Compute) with multiple child runs under the covers

In [33]:
import time
from datetime import datetime

from azureml.core import Experiment

# Name the experiment after the current date and hour (MM-DD-YYYY-HH)
now = datetime.now()
time_string = now.strftime("%m-%d-%Y-%H")
#time_string = now.strftime("%m-%d-%Y-%H-%M")
experiment_name = "classif-automl-remote-{0}".format(time_string)
print(time_string, experiment_name, sep="\n")

experiment = Experiment(workspace=ws, name=experiment_name)

# Wall-clock time around the submission.
# NOTE(review): the measurement assumes submit(..., show_output=True) blocks until the run finishes - confirm
start_time = time.time()
run = experiment.submit(automl_config, show_output=True)
elapsed_seconds = time.time() - start_time

print('Manual run timing: --- %s seconds needed for running the whole Remote AutoML Experiment ---' % elapsed_seconds)

# TODO (CDLTLL): Why is 'DATA GUARDRAILS SUMMARY' not shown when training in remote AML Compute?


01-12-2020-22
classif-automl-remote-01-12-2020-22
Running on remote compute: cesardl-cpu-clus
Parent Run ID: AutoML_5b795b31-d600-41d3-8c68-e67b83113772

Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION      METRIC      BEST
         0   MaxAbsScaler SGD                               0:02:21       0.8422    0.8422
         1   MaxAbsScaler SGD                               0:01:47       0.7815    0.8422
         2   MaxAbsScaler ExtremeRandomTrees         

## Explore results with Widget

In [34]:
# Explore the results of automatic training with a Jupyter widget: https://docs.microsoft.com/en-us/python/api/azureml-widgets/azureml.widgets?view=azure-ml-py
from azureml.widgets import RunDetails

# Show the widget for the run object returned by experiment.submit()
RunDetails(run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'NOTSET', …

### Measure Parent Run Time needed for the whole AutoML process 

In [62]:
import time
import datetime as dt


def _parse_run_timestamp_utc(timestamp_str):
    """Parse a run timestamp string into a timezone-aware UTC datetime.

    Accepts values such as '2020-01-12T23:11:56.292703Z'. Fractional seconds
    are discarded, and a trailing 'Z' is stripped so that timestamps without a
    fractional part parse as well.
    """
    whole_seconds = timestamp_str.split(".")[0].rstrip("Z")
    # Use dt.datetime (imported in this cell) rather than a bare `datetime`
    # name that only exists if an earlier cell happened to import it.
    parsed = dt.datetime.strptime(whole_seconds, "%Y-%m-%dT%H:%M:%S")
    return parsed.replace(tzinfo=dt.timezone.utc)


run_details = run.get_details()

# Like: 2020-01-12T23:11:56.292703Z
end_time_utc_str = run_details['endTimeUtc'].split(".")[0]
start_time_utc_str = run_details['startTimeUtc'].split(".")[0]

# Epoch seconds computed in UTC. Interpreting these UTC strings as local time
# (time.mktime) would skew the difference whenever the run spans a DST change.
timestamp_end = _parse_run_timestamp_utc(end_time_utc_str).timestamp()
timestamp_start = _parse_run_timestamp_utc(start_time_utc_str).timestamp()

parent_run_time = timestamp_end - timestamp_start
print('Run Timing: --- %s seconds needed for running the whole Remote AutoML Experiment ---' % (parent_run_time))

Run Timing: --- 4031.0 seconds needed for running the whole Remote AutoML Experiment ---


## Retrieve the 'Best Model'

In [63]:
# get_output() returns the best child run together with its fitted model pipeline
best_run, fitted_model = run.get_output()
print(best_run, fitted_model, sep='\n')

Run(Experiment: classif-automl-remote-01-12-2020-22,
Id: AutoML_5b795b31-d600-41d3-8c68-e67b83113772_34,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
        feature_sweeping_config=None, feature_sweeping_timeout=None,
        featurization_config=None, is_cross_validation=None,
        is_onnx_compatible=None, logger=None, observer=None, task=None)), ('pref...666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667]))])


## Make Predictions

### Extract X values (feature columns) from the test dataset for predicting (kept as a pandas DataFrame; no NumPy conversion is performed)

In [65]:
import pandas as pd

# Separate the label column from the feature columns WITHOUT mutating
# test_dataset_df: the cell can then be re-run safely, and a missing
# 'Attrition' column fails here with a KeyError instead of silently leaving
# y_test_df undefined (or stale from a previous execution).
y_test_df = test_dataset_df['Attrition']
x_test_df = test_dataset_df.drop(columns=['Attrition'])

### Make the actual Predictions

In [66]:
# Score the test features with the best model
y_predictions = fitted_model.predict(x_test_df)

# Show the first ten predicted labels
print('10 predictions: ', y_predictions[:10], sep='\n')

10 predictions: 
[0 0 0 0 0 0 1 0 1 0]


In [67]:
# Sanity check: number of predictions produced for the test split
y_predictions.shape

(152,)

### Calculate the Accuracy with Test Dataset (Not used for training)

In [68]:
from sklearn.metrics import accuracy_score

print('Accuracy:')
# Fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test_df, y_pred=y_predictions)

Accuracy:


0.8881578947368421