# Train the Machine Learning Model
Christoph Windheuser, ThoughtWorks, June 19, 2020

## Import Libraries

In [14]:
import boto3
from sagemaker.s3 import S3Uploader, S3Downloader
import pandas as pd

from enum import Enum
import numpy as np
import sys, os, json
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import joblib
sys.path.append(os.path.join('..', 'src'))
sys.path.append(os.path.join('src'))
from sklearn import tree, ensemble, metrics
# import evaluation
# import tracking


In [6]:
class Model(Enum):
    DECISION_TREE = 0
    RANDOM_FOREST = 1
    ADABOOST = 2
    GRADIENT_BOOST = 3


## Load training and validation data from S3

In [26]:
def load_data():
    train_filename = 'store47-2016-train.csv'
    test_filename  = 'store47-2016-test.csv'
    train_dir      = 'train'
    test_dir       = 'test'
    local_data_dir = 'data'

    sess = boto3.Session()
    account_id = sess.client('sts', region_name=sess.region_name).get_caller_identity()["Account"]
    bucket = 'sagemaker-studio-{}-{}'.format(sess.region_name, account_id)
    prefix = 'demandforecast-rf'

    S3Downloader.download('s3://{}/{}/{}/{}'.format(bucket, prefix, train_dir, train_filename),
                          '{}/{}'.format(local_data_dir, train_dir))

    S3Downloader.download('s3://{}/{}/{}/{}'.format(bucket, prefix, test_dir, test_filename),
                          '{}/{}'.format(local_data_dir, test_dir))

    train    = pd.read_csv('{}/{}/{}'.format(local_data_dir, train_dir, train_filename), engine='python')
    validate = pd.read_csv('{}/{}/{}'.format(local_data_dir, test_dir, test_filename), engine='python')

    return train, validate


In [27]:
load_data()

(               id        date  item_nbr  unit_sales            family  class  \
 0        88219279  2016-08-16    103520        10.0         GROCERY I   1028   
 1        88219280  2016-08-16    103665         4.0      BREAD/BAKERY   2712   
 2        88219281  2016-08-16    105574         9.0         GROCERY I   1045   
 3        88219282  2016-08-16    105575        45.0         GROCERY I   1045   
 4        88219283  2016-08-16    105577         8.0         GROCERY I   1045   
 ...           ...         ...       ...         ...               ...    ...   
 958778  124019431  2017-08-01   2113914        10.0          CLEANING   3040   
 958779  124019432  2017-08-01   2116416        15.0         GROCERY I   1060   
 958780  124019433  2017-08-01   2122188         3.0         GROCERY I   1084   
 958781  124019434  2017-08-01   2124052         4.0  LIQUOR,WINE,BEER   1318   
 958782  124019435  2017-08-01   2127114         3.0         BEVERAGES   1152   
 
         perishable  trans

In [28]:
def join_tables(train, validate):
    print("Joining tables for consistent encoding")
    return train.append(validate).drop('date', axis=1)


def encode_categorical_columns(df):
    obj_df = df.select_dtypes(include=['object', 'bool']).copy().fillna('-1')
    lb = LabelEncoder()
    for col in obj_df.columns:
        df[col] = lb.fit_transform(obj_df[col])
    return df


def encode(train, validate):
    print("Encoding categorical variables")
    train_ids = train.id
    validate_ids = validate.id

    joined = join_tables(train, validate)

    encoded = encode_categorical_columns(joined.fillna(-1))

    print("Not predicting returns...")
    encoded.loc[encoded.unit_sales < 0, 'unit_sales'] = 0

    validate = encoded[encoded['id'].isin(validate_ids)]
    train = encoded[encoded['id'].isin(train_ids)]
    return train, validate


def train_model(train, model=Model.DECISION_TREE, seed=None):
    print("Training model using regressor: {}".format(model.name))
    train_dropped = train.drop('unit_sales', axis=1)
    target = train['unit_sales']

    if model == Model.RANDOM_FOREST:
        params = {'n_estimators': 10}
        clf = ensemble.RandomForestRegressor(random_state=seed, **params)
    elif model == Model.ADABOOST:
        params = {'n_estimators': 50, 'learning_rate': 1.0, 'loss':'linear'}
        clf = ensemble.AdaBoostRegressor(random_state=seed, **params)
    elif model == Model.GRADIENT_BOOST:
        params = {'n_estimators': 200, 'max_depth': 4}
        clf = ensemble.GradientBoostingRegressor(random_state=seed, **params)
    else:
        params = {'criterion': 'mse'}
        clf = tree.DecisionTreeRegressor(random_state=seed)

    trained_model = clf.fit(train_dropped, target)
    return (trained_model,params)


def overwrite_unseen_prediction_with_zero(preds, train, validate):
    cols_item_store = ['item_nbr', 'store_nbr']
    cols_to_use = validate.columns.drop('unit_sales') if 'unit_sales' in validate.columns else validate.columns
    validate_train_joined = pd.merge(validate[cols_to_use], train, on=cols_item_store, how='left')
    unseen = validate_train_joined[validate_train_joined['unit_sales'].isnull()]
    validate['preds'] = preds
    validate.loc[validate.id.isin(unseen['id_x']), 'preds'] = 0
    preds = validate['preds'].tolist()
    return preds


def make_predictions(model, validate):
    print("Making prediction on validation data")
    validate_dropped = validate.drop('unit_sales', axis=1).fillna(-1)
    validate_preds = model.predict(validate_dropped)
    return validate_preds


def write_predictions_and_score(evaluation_metrics, model, columns_used):
    key = "decision_tree"
    if not os.path.exists('data/{}'.format(key)):
        os.makedirs('data/{}'.format(key))
    filename = 'data/{}/model.pkl'.format(key)
    print("Writing to {}".format(filename))
    joblib.dump(model, filename)

    filename = 'results/metrics.json'
    print("Writing to {}".format(filename))
    if not os.path.exists('results'):
        os.makedirs('results')
    with open(filename, 'w+') as score_file:
        json.dump(evaluation_metrics, score_file)


In [29]:
def main(model=Model.DECISION_TREE, seed=None):
    original_train, original_validate = load_data()
    train, validate = encode(original_train, original_validate)
    with tracking.track() as track:
        track.set_model(model)
        model, params = train_model(train, model, seed)
        track.log_params(params)
        validation_predictions = make_predictions(model, validate)

        print("Calculating metrics")
        evaluation_metrics = {
            'nwrmsle': evaluation.nwrmsle(validation_predictions, validate['unit_sales'].values, validate['perishable'].values),
            'r2_score': metrics.r2_score(y_true=validate['unit_sales'].values, y_pred=validation_predictions)
        }
        track.log_metrics(evaluation_metrics)

        write_predictions_and_score(evaluation_metrics, model, original_train.columns)

        print("Evaluation done with metrics {}.".format(json.dumps(evaluation_metrics)))


In [30]:
main(model=Model.RANDOM_FOREST, seed=8675309)
 

Encoding categorical variables
Joining tables for consistent encoding
Not predicting returns...


NameError: name 'tracking' is not defined