# Train the Machine Learning Model
Christoph Windheuser, ThoughtWorks, June 25, 2020    
Train an ML model on SageMaker and track the runs with SageMaker Experiments

---
## Install dependencies

In [1]:
import sys
!{sys.executable} -m pip install sagemaker -U
!{sys.executable} -m pip install sagemaker-experiments


Processing /root/.cache/pip/wheels/06/2c/e6/8f994b953a230c29b25cbe6a37be5a8fef4851ae8a4548980c/sagemaker-1.66.0-py2.py3-none-any.whl
Collecting boto3>=1.13.24
  Using cached boto3-1.14.12-py2.py3-none-any.whl (128 kB)
Collecting smdebug-rulesconfig==0.1.4
  Using cached smdebug_rulesconfig-0.1.4-py2.py3-none-any.whl (10 kB)
Collecting botocore<1.18.0,>=1.17.12
  Using cached botocore-1.17.12-py2.py3-none-any.whl (6.3 MB)
[31mERROR: awscli 1.18.66 has requirement botocore==1.16.16, but you'll have botocore 1.17.12 which is incompatible.[0m
Installing collected packages: botocore, boto3, smdebug-rulesconfig, sagemaker
  Attempting uninstall: botocore
    Found existing installation: botocore 1.16.16
    Uninstalling botocore-1.16.16:
      Successfully uninstalled botocore-1.16.16
  Attempting uninstall: boto3
    Found existing installation: boto3 1.13.16
    Uninstalling boto3-1.13.16:
      Successfully uninstalled boto3-1.13.16
  Attempting uninstall: smdebug-rulesconfig
    Found 

---
## Import Libraries

In [2]:
import pandas as pd
import numpy as np
from   time import strftime, gmtime
import sys, os, json

import joblib
# from sklearn.externals import joblib

from   sklearn.tree import DecisionTreeRegressor
from   sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from   sklearn import metrics

import boto3
import sagemaker
from   sagemaker import get_execution_role
from   sagemaker.s3 import S3Uploader, S3Downloader
from   smexperiments.experiment import Experiment
from   smexperiments.trial import Trial
from   sagemaker.sklearn.estimator import SKLearn


---
## Definitions

In [3]:
# CSV file names and the sub-directories they live in; the same sub-directory
# is used both as the S3 key segment and below `local_data_dir`.
train_filename, test_filename = 'final_train.csv', 'final_validate.csv'
train_dir, test_dir = 'train/final', 'test/final'

# Local download/output root and the key prefix inside the S3 bucket.
local_data_dir = '/home/sagemaker-user/CD4ML-AWS-Serverless/data'
s3_prefix = 'demandforecast'

# Key into `model_parameters` / `get_model_class`, and the value passed as
# the estimator's `random_state`.
model_name = 'decision_tree'
seed = 8675309


---
## Define the Machine Learning Model and Parameters

In [4]:
# Constructor keyword arguments for each supported regressor, keyed by the
# same model names that `get_model_class` accepts.
model_parameters = dict(
    random_forest=dict(n_estimators=10, max_features=0.5),
    adaboost=dict(n_estimators=100),
    gradient_boosting=dict(n_estimators=200, max_depth=4),
    # NOTE(review): newer scikit-learn releases spell this criterion
    # 'squared_error' -- confirm against the installed version.
    decision_tree=dict(criterion='mse'),
)


def get_model_class(model_name):
    """Return the regressor class registered under ``model_name``.

    Args:
        model_name: one of 'random_forest', 'adaboost', 'gradient_boosting'
            or 'decision_tree'.

    Returns:
        The (uninstantiated) scikit-learn regressor class.

    Raises:
        KeyError: if ``model_name`` is not one of the names above.
    """
    return {
        'random_forest': RandomForestRegressor,
        'adaboost': AdaBoostRegressor,
        'gradient_boosting': GradientBoostingRegressor,
        'decision_tree': DecisionTreeRegressor,
    }[model_name]


---
## Open a boto3 session and define the S3 bucket

In [5]:
# Default boto3 session; its region is reused for the STS client and the bucket name.
sess = boto3.Session()
# Ask STS which AWS account the current credentials belong to.
account_id = sess.client('sts', region_name=sess.region_name).get_caller_identity()["Account"]
# Bucket name is derived from region and account id; nothing here creates it.
# NOTE(review): presumably the bucket already exists -- confirm.
bucket = 'sagemaker-studio-{}-{}'.format(sess.region_name, account_id)


---
## Load training and validation data from S3 and store it locally

In [6]:
def load_data():
    """Download the training and validation CSVs from S3 and load them.

    Uses the module-level settings ``bucket``, ``s3_prefix``,
    ``local_data_dir``, ``train_dir``/``train_filename`` and
    ``test_dir``/``test_filename``. Both files are downloaded first, then
    both are parsed with pandas.

    Returns:
        tuple: ``(train, validate)`` DataFrames.
    """
    sources = ((train_dir, train_filename), (test_dir, test_filename))

    # Mirror each S3 object into the matching sub-directory of local_data_dir.
    for sub_dir, filename in sources:
        s3_uri = 's3://{}/{}/{}/{}'.format(bucket, s3_prefix, sub_dir, filename)
        S3Downloader.download(s3_uri, '{}/{}'.format(local_data_dir, sub_dir))

    # Parse the local copies, training set first.
    train, validate = (
        pd.read_csv('{}/{}/{}'.format(local_data_dir, sub_dir, filename), engine='python')
        for sub_dir, filename in sources
    )
    return train, validate


In [7]:
# Download and load both datasets, then preview the first training rows.
train, validate = load_data()
train.head()


Unnamed: 0,id,item_nbr,unit_sales,family,class,perishable,transactions,year,month,day,dayofweek,days_til_end_of_data,dayoff
0,88219279,103520,10.0,11,1028,0,3570,2016,8,16,1,364,0
1,88219280,103665,4.0,4,2712,1,3570,2016,8,16,1,364,0
2,88219281,105574,9.0,11,1045,0,3570,2016,8,16,1,364,0
3,88219282,105575,45.0,11,1045,0,3570,2016,8,16,1,364,0
4,88219283,105577,8.0,11,1045,0,3570,2016,8,16,1,364,0


In [8]:
# Preview the first validation rows.
validate.head()

Unnamed: 0,id,item_nbr,unit_sales,family,class,perishable,transactions,year,month,day,dayofweek,days_til_end_of_data,dayoff
0,124124002,96995,2.0,11,1093,0,3936,2017,8,2,2,13,0
1,124124003,99197,1.0,11,1067,0,3936,2017,8,2,2,13,0
2,124124004,103520,5.0,11,1028,0,3936,2017,8,2,2,13,0
3,124124005,103665,1.0,4,2712,1,3936,2017,8,2,2,13,0
4,124124006,105574,17.0,11,1045,0,3936,2017,8,2,2,13,0


---
## Train the model

In [9]:
def train_model(train, model_name='decision_tree', seed=None):
    """Fit the regressor selected by ``model_name`` on the training frame.

    Every column except 'unit_sales' is used as a feature; 'unit_sales' is
    the target.

    Args:
        train: DataFrame containing a 'unit_sales' column.
        model_name: key into ``model_parameters`` / ``get_model_class``.
        seed: forwarded to the estimator as ``random_state``.

    Returns:
        tuple: ``(trained_model, params)`` where ``params`` is the entry of
        ``model_parameters`` that was used (the same dict object, not a copy).
    """
    features = train.drop(columns='unit_sales')
    labels = train['unit_sales']

    estimator_cls = get_model_class(model_name)
    params = model_parameters[model_name]

    print("Training %s model" % model_name)

    estimator = estimator_cls(random_state=seed, **params)
    return estimator.fit(features, labels), params


In [10]:
# Fit the model selected in the Definitions cell, using the fixed seed.
model, params = train_model(train, model_name, seed)


Training decision_tree model


---
## Validate the trained model

In [10]:
def make_predictions(model, validate):
    """Predict with ``model`` on every column of ``validate`` except 'unit_sales'.

    Missing feature values are replaced by -1 before prediction. ``validate``
    itself is not modified.

    Args:
        model: object exposing ``predict(features)``.
        validate: DataFrame containing a 'unit_sales' column.

    Returns:
        Whatever ``model.predict`` returns for the prepared features.
    """
    features = validate.drop(columns=['unit_sales'])
    features = features.fillna(-1)
    return model.predict(features)


def write_predictions_and_score(model_name, evaluation_metrics, model):
    """Persist the fitted model and its evaluation metrics under ``local_data_dir``.

    Writes ``<local_data_dir>/models/<model_name>/model.pkl`` (via joblib) and
    ``<local_data_dir>/results/<model_name>/metrics.json``. Despite the name,
    no predictions are written.

    Args:
        model_name: used as the sub-directory name for both outputs.
        evaluation_metrics: JSON-serialisable mapping of metric name to value.
        model: fitted model object to pickle.
    """
    model_dir = '{}/models/{}/'.format(local_data_dir, model_name)
    model_file = model_dir + 'model.pkl'
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(model_dir, exist_ok=True)
    print("Writing to {}".format(model_file))
    joblib.dump(model, model_file)

    results_dir = '{}/results/{}/'.format(local_data_dir, model_name)
    metrics_file = results_dir + 'metrics.json'
    print("Writing to {}".format(metrics_file))
    os.makedirs(results_dir, exist_ok=True)
    # The file is only written, never read back, so plain 'w' is sufficient.
    with open(metrics_file, 'w') as score_file:
        json.dump(evaluation_metrics, score_file)


def eval_nwrmsle(predictions, targets, weights):
    """Normalized weighted root mean squared logarithmic error.

    Computes ``sqrt(sum(w * (ln(p + 1) - ln(t + 1))**2) / sum(w))`` with
    ``w = 1 + 0.25 * weights``.

    Args:
        predictions: list, pandas Series or numpy array of predicted values.
            For lists and Series, negative entries are treated as NaN.
            Other types (e.g. numpy arrays) are used as given.
        targets: numpy array or pandas Series of true values; negative
            entries are treated as NaN.
        weights: numpy array or pandas Series of per-row weights.

    Returns:
        The error as a float scalar.
    """
    if isinstance(predictions, list):
        predictions = np.array([np.nan if x < 0 else x for x in predictions])
    elif isinstance(predictions, pd.Series):
        # Mask negatives on a new Series so the caller's data is not
        # overwritten (the previous in-place assignment mutated the argument).
        predictions = predictions.where(~(predictions < 0))
    targetsf = targets.astype(float)  # astype returns a copy, safe to modify
    targetsf[targets < 0] = np.nan
    weights = 1 + 0.25 * weights
    log_square_errors = (np.log(predictions + 1) - np.log(targetsf + 1)) ** 2
    return np.sqrt(np.sum(weights * log_square_errors) / np.sum(weights))


In [12]:
print("Making prediction on validation data")
validation_predictions = make_predictions(model, validate)

# Score the predictions against the true 'unit_sales'; the 'perishable'
# column supplies the per-row weights for the NWRMSLE.
print("Calculating metrics")
evaluation_metrics = {
    'nwrmsle' : eval_nwrmsle(validation_predictions, validate['unit_sales'].values, validate['perishable'].values),
    'r2_score': metrics.r2_score(y_true=validate['unit_sales'].values, y_pred=validation_predictions)
}

#        track.log_metrics(evaluation_metrics)

write_predictions_and_score(model_name, evaluation_metrics, model)

print("Evaluation done with metrics {}.".format(json.dumps(evaluation_metrics)))

# Print the predictions computed above. The previous name
# `validate_predictions` is not defined at this point in the notebook and
# raised a NameError.
print(validation_predictions)


Making prediction on validation data
Calculating metrics
Writing to /home/sagemaker-user/CD4ML-AWS-Serverless/data/models/decision_tree/model.pkl
Writing to /home/sagemaker-user/CD4ML-AWS-Serverless/data/results/decision_tree/metrics.json
Evaluation done with metrics {"nwrmsle": 0.8480084859255792, "r2_score": -2.0782116252217895}.


---
# Run the model training on a separate training instance

In [11]:
# SageMaker SDK session, later passed to the SKLearn estimator.
sm_session = sagemaker.Session()

# Execution role of the environment this notebook runs in; later passed to
# the estimator as `role`.
role = get_execution_role()


### Create Experiment

In [12]:
# The experiment name contains only the date plus a hard-coded "-04" suffix.
# NOTE(review): presumably the suffix is bumped by hand to get a new name when
# the cell is re-run on the same day -- confirm. The commented-out line below
# is the alternative with a full timestamp.
# create_date = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
create_date = strftime("%Y-%m-%d", gmtime())
DemForExp = Experiment.create(experiment_name="DemandFrcst-{}-04".format(create_date), 
                                              description="Predict customer demand", 
                                              sagemaker_boto_client=boto3.client('sagemaker'))
print (DemForExp)


Experiment(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7f94f30db2d0>,experiment_name='DemandFrcst-2020-06-29-04',description='Predict customer demand',experiment_arn='arn:aws:sagemaker:us-east-1:261586618408:experiment/demandfrcst-2020-06-29-04',response_metadata={'RequestId': 'b85e159f-bb6a-43ae-84ac-6c301b27ff3f', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'b85e159f-bb6a-43ae-84ac-6c301b27ff3f', 'content-type': 'application/x-amz-json-1.1', 'content-length': '97', 'date': 'Mon, 29 Jun 2020 13:37:57 GMT'}, 'RetryAttempts': 0})


### Create Trial

In [13]:
# One trial per model type, both attached to the experiment created above.
# Trial names carry a to-the-second timestamp.
# NOTE(review): only `trial_rf` is referenced by the training cell in this
# notebook; `trial_dt` is created but not used in the visible cells.
trial_dt = Trial.create(trial_name="decision-tree-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())), 
                     experiment_name=DemForExp.experiment_name,
                     sagemaker_boto_client=boto3.client('sagemaker'))

trial_rf = Trial.create(trial_name="random-forest-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())), 
                     experiment_name=DemForExp.experiment_name,
                     sagemaker_boto_client=boto3.client('sagemaker'))


### Create Hyperparameters

In [15]:
# Hyperparameters handed to the SKLearn estimator (`hyperparameters=`).
# NOTE(review): this combines keys from several entries of `model_parameters`
# (max_depth, criterion) with model_name='random_forest'; which keys the
# training script actually reads is not visible here -- confirm.
hyperparams = dict(
    model_name='random_forest',
    n_estimators=10,
    max_features=0.5,
    max_depth=4,
    criterion='mse',
)


### Define S3 input channels for training and validation

In [16]:
# Training channel: points at the S3 prefix (directory), not at a single file.
s3_data_train='s3://{}/{}/{}'.format(bucket, s3_prefix, train_dir)
s3_input_train = sagemaker.s3_input(s3_data_train, content_type='csv')
print ("s3 data for training:   " + s3_data_train)

# Validation channel, built the same way from the test directory.
s3_data_validate='s3://{}/{}/{}'.format(bucket, s3_prefix, test_dir)
s3_input_validate = sagemaker.s3_input(s3_data_validate, content_type='csv')
print ("s3 data for validation: " + s3_data_validate)




s3 data for training:   s3://sagemaker-studio-us-east-1-261586618408/demandforecast/train/final
s3 data for validation: s3://sagemaker-studio-us-east-1-261586618408/demandforecast/test/final


### Create scikit-learn Estimator

In [17]:
# Entry-point script that runs inside the training container.
# NOTE(review): this absolute path is under /root, while `local_data_dir`
# is under /home/sagemaker-user -- confirm which checkout is intended.
# script_path = './CD4ML-AWS-Serverless/src/scikitmodel.py'
script_path = '/root/CD4ML-AWS-Serverless/src/scikitmodel.py'

# NOTE(review): the variable name `sklearn` matches the scikit-learn package
# name; later cells (fit / deploy / delete_endpoint) refer to this estimator.
sklearn = SKLearn(
    entry_point=script_path,
    train_instance_type="ml.m5.large",
    role=role,
    sagemaker_session=sm_session,
    hyperparameters=hyperparams,
    # Each regex captures the text between "<metric>: " and the next ";".
    # NOTE(review): presumably the training script prints metrics in that
    # format -- confirm against scikitmodel.py.
    metric_definitions=[
        {'Name':'test:nwrmsle',  'Regex':'nwrmsle: (.*?);'},
        {'Name':'test:r2_score', 'Regex':'r2_score: (.*?);'}
    ],
    enable_sagemaker_metrics=True
)




In [18]:
# Start the training job with the two S3 channels and record it under the
# random-forest trial of the experiment created above.
sklearn.fit({'train': s3_input_train,
             'validation': s3_input_validate},
           experiment_config={
            "ExperimentName": DemForExp.experiment_name, 
            "TrialName": trial_rf.trial_name,
            "TrialComponentDisplayName": "Training",               
           })


INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2020-06-29-13-38-30-374


2020-06-29 13:38:30 Starting - Starting the training job...
2020-06-29 13:38:33 Starting - Launching requested ML instances.........
2020-06-29 13:40:18 Starting - Preparing the instances for training...
2020-06-29 13:41:01 Downloading - Downloading input data......
2020-06-29 13:41:52 Training - Training image download completed. Training in progress.[34m2020-06-29 13:41:52,857 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-06-29 13:41:52,860 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-06-29 13:41:52,869 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-06-29 13:42:24,594 sagemaker-containers INFO     Module scikitmodel does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-06-29 13:42:24,595 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-06-29 13:42:24,595 sagemaker-containers INFO     Generating MANIFEST.in

---
# Deploy the model to an endpoint and send inference requests

In [19]:
# Deploy the trained estimator to a single ml.m5.large endpoint instance.
predictor = sklearn.deploy(initial_instance_count=1, instance_type="ml.m5.large")


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2020-06-29-13-38-30-374
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2020-06-29-13-38-30-374


-------------!

In [20]:
# Same feature preparation as `make_predictions`: drop the target column and
# replace missing values with -1, then send the features to the endpoint.
validate_dropped = validate.drop('unit_sales', axis=1).fillna(-1)
validate_preds   = predictor.predict(validate_dropped)

# Prints the whole validation frame (pandas abbreviates long frames).
print (validate)

print (validate_preds)

# R2 of the endpoint's predictions against the true 'unit_sales'.
print("R2 Score: {}".format(metrics.r2_score(y_true=validate['unit_sales'].values, y_pred=validate_preds)))


              id  item_nbr  unit_sales  family  class  perishable  \
0      124124002     96995         2.0      11   1093           0   
1      124124003     99197         1.0      11   1067           0   
2      124124004    103520         5.0      11   1028           0   
3      124124005    103665         1.0       4   2712           1   
4      124124006    105574        17.0      11   1045           0   
...          ...       ...         ...     ...    ...         ...   
38443  125481864   2123463         1.0      11   1076           0   
38444  125481865   2123727         1.0      11   1028           0   
38445  125481866   2123775         1.0      11   1030           0   
38446  125481867   2124052        21.0      21   1318           0   
38447  125481868   2126842         2.0      11   1030           0   

       transactions  year  month  day  dayofweek  days_til_end_of_data  dayoff  
0              3936  2017      8    2          2                    13       0  
1        

---
## Endpoint Clean-Up

In [37]:
# Delete the endpoint created by the deploy cell.
sklearn.delete_endpoint()


INFO:sagemaker:Deleting endpoint with name: sagemaker-scikit-learn-2020-06-27-16-33-49-916
