# Train the Machine Learning Model
Christoph Windheuser, ThoughtWorks, June 19, 2020

---
## Install dependencies

In [1]:
import sys
!{sys.executable} -m pip install sagemaker -U
!{sys.executable} -m pip install sagemaker-experiments

Requirement already up-to-date: sagemaker in /opt/conda/lib/python3.7/site-packages (1.65.1.post0)


---
## Import Libraries

In [2]:
import pandas as pd
import numpy as np
from   time import strftime, gmtime
import sys, os, json
import joblib
from   sklearn.tree import DecisionTreeRegressor
from   sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from   sklearn import metrics

import boto3
import sagemaker
from   sagemaker import get_execution_role
from   sagemaker.s3 import S3Uploader, S3Downloader
from   smexperiments.experiment import Experiment
from   smexperiments.trial import Trial
from   sagemaker.sklearn.estimator import SKLearn


---
## Definitions

In [3]:
# Input files and the folder names used for them (both in S3 and locally).
train_filename, test_filename = 'final_train.csv', 'final_validate.csv'
train_dir, test_dir = 'train', 'test'

# Local download root and the key prefix inside the S3 bucket.
local_data_dir = 'CD4ML-AWS-Serverless/data'
s3_prefix = 'demandforecast'

# Name of the model to train and the seed handed to it.
model_name = 'decision_tree'
seed = 8675309


---
## Define the Machine Learning Model and Parameters

In [4]:
# Constructor keyword arguments for each supported regressor, keyed by model name.
# NOTE(review): newer scikit-learn releases spell the 'mse' criterion
# 'squared_error' -- confirm against the scikit-learn version in use.
model_parameters = dict(
    random_forest=dict(n_estimators=10, max_features=0.5),
    adaboost=dict(n_estimators=100),
    gradient_boosting=dict(n_estimators=200, max_depth=4),
    decision_tree=dict(criterion='mse'),
)


def get_model_class(model_name):
    """Return the scikit-learn regressor class registered under `model_name`.

    Args:
        model_name: One of 'random_forest', 'adaboost', 'gradient_boosting'
            or 'decision_tree'.

    Returns:
        The (uninstantiated) regressor class.

    Raises:
        KeyError: if `model_name` is not one of the names above.
    """
    registry = dict(
        random_forest=RandomForestRegressor,
        adaboost=AdaBoostRegressor,
        gradient_boosting=GradientBoostingRegressor,
        decision_tree=DecisionTreeRegressor,
    )
    return registry[model_name]


---
## Open a boto3 session and define the S3 bucket

In [5]:
# boto3 session created with no explicit arguments.
sess = boto3.Session()

# Ask STS which AWS account the current credentials belong to.
account_id = (
    sess.client('sts', region_name=sess.region_name)
        .get_caller_identity()["Account"]
)

# Bucket name follows the pattern sagemaker-studio-<region>-<account id>.
bucket = f'sagemaker-studio-{sess.region_name}-{account_id}'


---
## Load training and validation data from S3 and store it locally

In [6]:
def load_data():
    """Download the training and validation CSVs from S3 and load them.

    Reads the notebook-level settings `bucket`, `s3_prefix`, `local_data_dir`,
    `train_dir`/`train_filename` and `test_dir`/`test_filename`.

    Returns:
        tuple: `(train, validate)` DataFrames, in that order.
    """
    splits = [(train_dir, train_filename), (test_dir, test_filename)]

    # Fetch both files first; parsing only starts once both downloads are done.
    for folder, filename in splits:
        remote_uri = 's3://{}/{}/{}/{}'.format(bucket, s3_prefix, folder, filename)
        local_folder = '{}/{}'.format(local_data_dir, folder)
        S3Downloader.download(remote_uri, local_folder)

    train, validate = [
        pd.read_csv('{}/{}/{}'.format(local_data_dir, folder, filename), engine='python')
        for folder, filename in splits
    ]
    return train, validate


In [7]:
# Download both data sets, then preview the first rows of the training frame.
train, validate = load_data()
train.head()


Unnamed: 0,id,item_nbr,unit_sales,family,class,perishable,transactions,year,month,day,dayofweek,days_til_end_of_data,dayoff
0,88219279,103520,10.0,11,1028,0,3570,2016,8,16,1,364,0
1,88219280,103665,4.0,4,2712,1,3570,2016,8,16,1,364,0
2,88219281,105574,9.0,11,1045,0,3570,2016,8,16,1,364,0
3,88219282,105575,45.0,11,1045,0,3570,2016,8,16,1,364,0
4,88219283,105577,8.0,11,1045,0,3570,2016,8,16,1,364,0


In [8]:
# Preview the first rows of the validation frame.
validate.head()

Unnamed: 0,id,item_nbr,unit_sales,family,class,perishable,transactions,year,month,day,dayofweek,days_til_end_of_data,dayoff
0,124124002,96995,2.0,11,1093,0,3936,2017,8,2,2,13,0
1,124124003,99197,1.0,11,1067,0,3936,2017,8,2,2,13,0
2,124124004,103520,5.0,11,1028,0,3936,2017,8,2,2,13,0
3,124124005,103665,1.0,4,2712,1,3936,2017,8,2,2,13,0
4,124124006,105574,17.0,11,1045,0,3936,2017,8,2,2,13,0


---
## Train the model

In [24]:
def train_model(train, model_name='decision_tree', seed=None):
    """Fit the regressor registered under `model_name` on the training frame.

    Args:
        train: DataFrame containing the `unit_sales` target column; every
            other column is passed to the estimator as a feature.
        model_name: Key used for both `get_model_class` and `model_parameters`.
        seed: Forwarded to the estimator as `random_state`.

    Returns:
        tuple: `(fitted_model, params)`, where `params` is the
        `model_parameters` entry the estimator was constructed with.
    """
    # Split the frame into the feature matrix and the target column.
    features = train.drop('unit_sales', axis=1)
    labels = train['unit_sales']

    estimator_cls = get_model_class(model_name)
    params = model_parameters[model_name]

    print("Training %s model" % model_name)

    regressor = estimator_cls(random_state=seed, **params)
    return regressor.fit(features, labels), params


In [25]:
# Fit the model chosen in the Definitions cell; `params` is the parameter
# dict the estimator was built with.
model, params = train_model(train, model_name, seed)


Training decision_tree model


---
## Validate the trained model

In [26]:
def make_predictions(model, validate):
    """Predict `unit_sales` for every row of the validation frame.

    Args:
        model: Fitted estimator exposing `predict`.
        validate: DataFrame that includes the `unit_sales` column; the column
            is dropped before predicting and the frame itself is not modified.

    Returns:
        Whatever `model.predict` returns for the prepared feature frame.
    """
    features = validate.drop(columns=['unit_sales'])
    # Missing feature values are replaced by the sentinel -1 before predicting.
    features = features.fillna(-1)
    return model.predict(features)


def write_predictions_and_score(model_name, evaluation_metrics, model):
    """Persist the fitted model and its evaluation metrics under `local_data_dir`.

    Despite the name, no predictions are written. The model is serialised to
    `<local_data_dir>/models/<model_name>/model.pkl` and the metrics are
    dumped as JSON to `<local_data_dir>/results/<model_name>/metrics.json`.
    Missing directories are created.

    Args:
        model_name: Sub-folder name used for both output locations.
        evaluation_metrics: JSON-serialisable object holding the metric values.
        model: Fitted estimator to serialise with joblib.
    """
    model_dir = '{}/models/{}/'.format(local_data_dir, model_name)
    model_file = model_dir + 'model.pkl'
    # exist_ok=True avoids the check-then-create race of
    # os.path.exists() followed by os.makedirs().
    os.makedirs(model_dir, exist_ok=True)
    print("Writing to {}".format(model_file))
    joblib.dump(model, model_file)

    results_dir = '{}/results/{}/'.format(local_data_dir, model_name)
    metrics_file = results_dir + 'metrics.json'
    print("Writing to {}".format(metrics_file))
    os.makedirs(results_dir, exist_ok=True)
    # Plain write mode is sufficient: the file is never read back here.
    with open(metrics_file, 'w') as score_file:
        json.dump(evaluation_metrics, score_file)


def eval_nwrmsle(predictions, targets, weights):
    """Normalized weighted root mean squared logarithmic error.

    Computes ``sqrt(sum(w * (ln(p + 1) - ln(t + 1)) ** 2) / sum(w))`` with
    ``w = 1 + 0.25 * weights``.

    Negative predictions and negative targets are replaced by NaN before the
    logarithm is taken, so a single negative entry makes the score NaN. This
    applies to every input container (list, ndarray, Series) alike.

    Args:
        predictions: Array-like of predicted values.
        targets: Array-like of true values, same length as `predictions`.
        weights: Array-like of per-row weights, same length as `predictions`.
            Rows are matched by position, not by index label.

    Returns:
        The score as a floating point scalar.
    """
    # Work on fresh float arrays: this accepts lists, Series and ndarrays and
    # guarantees the caller's data is never modified in place.
    preds = np.array(predictions, dtype=float)
    actual = np.array(targets, dtype=float)
    row_weights = 1 + 0.25 * np.asarray(weights, dtype=float)

    preds[preds < 0] = np.nan
    actual[actual < 0] = np.nan

    log_square_errors = (np.log(preds + 1) - np.log(actual + 1)) ** 2
    return np.sqrt(np.sum(row_weights * log_square_errors) / np.sum(row_weights))


In [27]:
print("Making prediction on validation data")
validation_predictions = make_predictions(model, validate)

print("Calculating metrics")
evaluation_metrics = dict(
    nwrmsle=eval_nwrmsle(
        validation_predictions,
        validate['unit_sales'].values,
        validate['perishable'].values,
    ),
    r2_score=metrics.r2_score(
        y_true=validate['unit_sales'].values,
        y_pred=validation_predictions,
    ),
)

# NOTE(review): the metrics are not sent to an experiment tracker here; a
# `track.log_metrics(evaluation_metrics)` call was left disabled at this point.

# Store the fitted model and the metrics on local disk.
write_predictions_and_score(model_name, evaluation_metrics, model)

print("Evaluation done with metrics {}.".format(json.dumps(evaluation_metrics)))


Making prediction on validation data
Calculating metrics
Writing to CD4ML-AWS-Serverless/data/models/decision_tree/model.pkl
Writing to CD4ML-AWS-Serverless/data/results/decision_tree/metrics.json
Evaluation done with metrics {"nwrmsle": 0.8480084859255792, "r2_score": -2.0782116252217895}.


---
# SageMaker Training

In [9]:
# SageMaker session object; it is handed to the SKLearn estimator further down.
sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()


In [26]:
# S3 folder holding the training CSV, wrapped as a CSV input for the training job.
s3_data = f's3://{bucket}/{s3_prefix}/{train_dir}'
s3_input_train = sagemaker.s3_input(s3_data, content_type='csv')
print("s3 data for training: " + s3_data)




s3_data = s3://sagemaker-studio-us-east-1-261586618408/demandforecast/train


In [12]:
# Training script that SageMaker runs as the entry point of the job.
script_path = './data/scikitmodel.py'

# Estimator describing the training job: script, IAM role, instance type and
# the hyperparameters handed to the script.
# NOTE(review): `train_instance_type` is the SDK v1 spelling of this argument;
# confirm it matches the installed sagemaker version.
sklearn = SKLearn(
    entry_point=script_path,
    role=role,
    train_instance_type="ml.c4.xlarge",
    hyperparameters={"criterion": 'mse'},
    sagemaker_session=sagemaker_session,
)




In [33]:
# Start the SageMaker training job, passing the S3 training data under the 'train' key.
sklearn.fit({'train': s3_input_train})


INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2020-06-24-11-32-25-224


2020-06-24 11:32:25 Starting - Starting the training job...
2020-06-24 11:32:27 Starting - Launching requested ML instances......
2020-06-24 11:33:45 Starting - Preparing the instances for training......
2020-06-24 11:34:49 Downloading - Downloading input data...
2020-06-24 11:35:17 Training - Downloading the training image..[34m2020-06-24 11:35:38,343 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-06-24 11:35:38,345 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-06-24 11:35:38,355 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-06-24 11:35:38,763 sagemaker-containers INFO     Module scikitmodel does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-06-24 11:35:38,763 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-06-24 11:35:38,764 sagemaker-containers INFO     Generating MANIFEST.in[0m
[34m2020-06-24 11:35:3