# AutoML Classification experiment using Local Compute and Pandas DataFrames
## Data: IBM Employee Attrition dataset loaded from Azure ML Dataset

##  Get Azure ML Workspace to use

In [31]:
# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset

# Get the Workspace described by the default config.json file.
# `ws` is the handle the rest of the notebook uses to reach datasets and experiments.
ws = Workspace.from_config()

## Load data from Azure ML Datasets into Pandas DataFrame

In [32]:
# Load Data
# Fetch the registered dataset directly by name. Going through `ws.datasets`
# would first build a dictionary of every dataset registered in the workspace
# just to pick a single entry out of it.
aml_dataset = Dataset.get_by_name(ws, name='IBM-Employee-Attrition')

# Materialize the dataset as a Pandas DataFrame and take a sneak peek at
# the first rows and the schema.
full_df = aml_dataset.to_pandas_dataframe()
full_df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the full dataset
full_df.describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92,0.16,802.49,9.19,2.91,1.0,1024.87,2.72,65.89,2.73,...,2.71,80.0,0.79,11.28,2.8,2.76,7.01,4.23,2.19,4.12
std,9.14,0.37,403.51,8.11,1.02,0.0,602.02,1.09,20.33,0.71,...,1.08,0.0,0.85,7.78,1.29,0.71,6.13,3.62,3.22,3.57
min,18.0,0.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,0.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,0.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


## Split original dataset in test/train sets using Scikit-Learn train_test_split function

In [64]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, splitting the DataFrame directly.
# The test frame is not handed to AutoML: it is only used at the end of the
# notebook to try out predictions with the trained model.
#
# Alternatives that are NOT used here:
#   * Pop the "Attrition" column and split into x/y train/test pieces
#     (x_train, x_test, y_train, y_test) with test_size=0.1.
#   * Split with the Azure ML Datasets API (better for Remote Compute):
#     https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py#random-split-percentage--seed-none-
train_df, test_df = train_test_split(
    full_df,
    test_size=0.2,
    random_state=1,  # fixed seed so the split is the same on every run
)

# Summary statistics of the training split
train_df.describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,...,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0
mean,37.1,0.15,810.27,9.22,2.92,1.0,1025.95,2.72,65.66,2.73,...,2.75,80.0,0.79,11.31,2.81,2.75,6.88,4.22,2.13,4.11
std,9.12,0.36,404.54,8.12,1.02,0.0,607.2,1.09,20.31,0.73,...,1.07,0.0,0.85,7.78,1.28,0.7,5.96,3.63,3.2,3.5
min,18.0,0.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,473.5,2.0,2.0,1.0,485.75,2.0,48.0,2.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,0.0,809.0,7.0,3.0,1.0,1017.5,3.0,65.0,3.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,0.0,1159.0,14.0,4.0,1.0,1569.75,4.0,83.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,2.0,7.0
max,60.0,1.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [51]:
# Summary statistics of the held-out test split, for comparison with the training split
test_df.describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,...,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0
mean,36.23,0.2,771.34,9.08,2.87,1.0,1020.51,2.71,66.81,2.75,...,2.57,80.0,0.82,11.17,2.74,2.81,7.51,4.26,2.4,4.18
std,9.17,0.4,398.54,8.07,1.05,0.0,581.86,1.1,20.42,0.65,...,1.13,0.0,0.87,7.81,1.31,0.72,6.75,3.62,3.3,3.85
min,18.0,0.0,115.0,1.0,1.0,1.0,5.0,1.0,30.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,427.5,2.0,2.0,1.0,519.0,2.0,50.0,2.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,1.25
50%,35.0,0.0,726.5,7.0,3.0,1.0,1027.0,3.0,68.5,3.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,42.0,0.0,1146.0,14.0,4.0,1.0,1515.25,4.0,84.75,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,10.0,7.0,4.0,7.0
max,59.0,1.0,1498.0,29.0,5.0,1.0,2064.0,4.0,100.0,4.0,...,4.0,80.0,3.0,37.0,6.0,4.0,36.0,18.0,15.0,17.0


## List and select primary metric to drive the AutoML classification problem

In [52]:
from azureml.train import automl

# The list of possible primary metrics is documented here:
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#primary-metric

# Get the list of valid primary metrics for the given task type
automl.utilities.get_primary_metrics('classification')

# 'accuracy' is the primary metric chosen for this experiment (closer to 1.00 is better)

['accuracy',
 'norm_macro_recall',
 'average_precision_score_weighted',
 'precision_score_weighted',
 'AUC_weighted']

## Define AutoML Experiment settings

In [53]:
import logging

from azureml.train.automl import AutoMLConfig

# Explanation of the settings:
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#configure-your-experiment-settings
# AutoMLConfig API reference:
# https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig
#
# Optional settings that are currently switched off:
#   experiment_timeout_minutes=20
#   iteration_timeout_minutes=5
#   whitelist_models='XGBoostClassifier' / blacklist_models='XGBoostClassifier'
# Extra settings can also be collected in a dict and passed as **kwargs.
#
# The X= / y= parameters are deprecated; using them produces this warning:
#   WARNING - The AutoMLConfig inputs you have specified will soon be deprecated.
#   Please use the AutoMLConfig shown in our documentation: https://aka.ms/AutoMLConfig
# so the training DataFrame and the label column name are passed instead.
automl_config = AutoMLConfig(
    # What to solve and how candidate models are ranked
    task='classification',
    primary_metric='accuracy',
    # Training data: the full training frame plus the name of its label column
    training_data=train_df,
    label_column_name="Attrition",
    # Validation and training behaviour
    n_cross_validations=5,
    enable_early_stopping=True,
    featurization='auto',
    # Logging
    debug_log='automated_ml_errors.log',
    verbosity=logging.INFO,
)


## Run Experiment with multiple child runs under the covers

In [54]:
from datetime import datetime

from azureml.core import Experiment

# The experiment name carries an hour-resolution timestamp, so submissions made
# within the same hour produce the same experiment name.
# (Use "%m-%d-%Y-%H-%M" instead for minute resolution.)
now = datetime.now()
time_string = now.strftime("%m-%d-%Y-%H")
experiment_name = "classif-automl-local-" + time_string
print(time_string, experiment_name, sep="\n")

experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the AutoML configuration; with show_output=True the progress of the
# run is printed in the cell output.
run = experiment.submit(automl_config, show_output=True)

01-11-2020-23
classif-automl-local-01-11-2020-23
Running on local machine
Parent Run ID: AutoML_e48502c8-6e4c-4ffe-b370-ceef36111e0c

Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS SUMMARY:
For more details, use API: run.get_guardrails()

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Classes are balanced in the training data.

TYPE:         Missing values imputation
STATUS:       PASSED
DESCRIPTION:  There were no missing values found in the training data.

TYPE:         High cardinality feature detection
STAT

## Explore results with Widget

In [55]:
# Explore the results of automatic training with a Jupyter widget:
# https://docs.microsoft.com/en-us/python/api/azureml-widgets/azureml.widgets?view=azure-ml-py
from azureml.widgets import RunDetails
RunDetails(run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'NOTSET', …

## Retrieve the 'Best Model'

In [56]:
# get_output() hands back a pair: the best child run and the fitted
# scikit-learn style pipeline produced by that run.
best_run, fitted_model = run.get_output()

# Print the run summary followed by the pipeline description
print(best_run, fitted_model, sep="\n")

Run(Experiment: classif-automl-local-01-11-2020-23,
Id: AutoML_e48502c8-6e4c-4ffe-b370-ceef36111e0c_30,
Type: None,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
        feature_sweeping_config=None, feature_sweeping_timeout=None,
        featurization_config=None, is_cross_validation=None,
        is_onnx_compatible=None, logger=None, observer=None, task=None)), ('pref...  flatten_transform=None,
               weights=[0.25, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]))])


## Make Predictions

### Extract X values (feature columns) from the test dataset as a Pandas DataFrame for predicting

In [65]:
import pandas as pd

# Separate the label (y) from the feature columns (X) of the test split
# WITHOUT modifying test_df itself.
#
# The previous version used test_df.pop('Attrition'), which removed the column
# from test_df in place and made x_test_df a mere alias of the mutated frame.
# That made the cell non-idempotent (a second run no longer found the label
# column) and silently changed what earlier cells such as test_df.describe()
# show when re-run. Selecting / dropping produces new objects instead, so the
# cell can be re-run safely and test_df keeps all of its columns.
y_test_df = test_df['Attrition']                 # label Series, same index as test_df
x_test_df = test_df.drop(columns=['Attrition'])  # new frame holding only the feature columns

In [66]:
# Summary statistics of the feature-only test frame (the 'Attrition' label column is no longer part of it)
x_test_df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,...,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0
mean,36.23,771.34,9.08,2.87,1.0,1020.51,2.71,66.81,2.75,2.04,...,2.57,80.0,0.82,11.17,2.74,2.81,7.51,4.26,2.4,4.18
std,9.17,398.54,8.07,1.05,0.0,581.86,1.1,20.42,0.65,1.1,...,1.13,0.0,0.87,7.81,1.31,0.72,6.75,3.62,3.3,3.85
min,18.0,115.0,1.0,1.0,1.0,5.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,427.5,2.0,2.0,1.0,519.0,2.0,50.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,1.25
50%,35.0,726.5,7.0,3.0,1.0,1027.0,3.0,68.5,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,42.0,1146.0,14.0,4.0,1.0,1515.25,4.0,84.75,3.0,2.0,...,4.0,80.0,1.0,15.0,3.0,3.0,10.0,7.0,4.0,7.0
max,59.0,1498.0,29.0,5.0,1.0,2064.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,37.0,6.0,4.0,36.0,18.0,15.0,17.0


### Make the actual Predictions

In [70]:
# Score the held-out feature rows with the best model returned by AutoML
y_predictions = fitted_model.predict(x_test_df)

# Show only the first ten predicted labels as a quick preview
print('10 predictions: ', y_predictions[:10], sep='\n')

10 predictions: 
[0 1 1 1 0 0 0 0 0 0]


In [71]:
# Sanity check: there should be one prediction per row of x_test_df
y_predictions.shape

(294,)

### Calculate the Accuracy with Test Dataset (Not used for training)

In [72]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
# The test rows were held out of training, so this estimates how the model
# performs on unseen data.
print('Accuracy:')
accuracy_score(y_true=y_test_df, y_pred=y_predictions)

Accuracy:


0.8571428571428571