# Train the Machine Learning Model
Christoph Windheuser, ThoughtWorks, June 19, 2020

---
## Import Libraries

In [20]:
import boto3
from   sagemaker.s3 import S3Uploader, S3Downloader
import pandas as pd
import numpy as np
from   enum import Enum
from   sklearn.preprocessing import LabelEncoder
from   sklearn import tree, ensemble, metrics
import sys, os, json
import joblib

# from sklearn.externals import joblib
# sys.path.append(os.path.join('..', 'src'))
# sys.path.append(os.path.join('src'))
# import evaluation
# import tracking


In [2]:
class Model(Enum):
    """Regressor families selectable when calling `train_model`."""
    DECISION_TREE = 0   # also the fallback branch in train_model
    RANDOM_FOREST = 1
    ADABOOST = 2
    GRADIENT_BOOST = 3


---
## Load training and validation data from S3

In [3]:
def load_data():
    """Download the train and test CSV files from S3 and read them with pandas.

    The bucket name is built from the region of the default boto3 session and
    the account id reported by STS. Files are downloaded below the local
    ``data`` directory (``data/train`` and ``data/test``).

    Returns:
        tuple: ``(train, validate)`` DataFrames read from the two CSV files.
    """
    local_data_dir = 'data'
    prefix = 'demandforecast-rf'
    # Sub-directory (used both in S3 and locally) and file name of each split.
    train_dir, train_filename = 'train', 'store47-2016-train.csv'
    test_dir, test_filename = 'test', 'store47-2016-test.csv'

    session = boto3.Session()
    sts_client = session.client('sts', region_name=session.region_name)
    account_id = sts_client.get_caller_identity()["Account"]
    bucket = 'sagemaker-studio-{}-{}'.format(session.region_name, account_id)

    # Fetch the training file first, then the test file.
    for sub_dir, filename in ((train_dir, train_filename), (test_dir, test_filename)):
        S3Downloader.download('s3://{}/{}/{}/{}'.format(bucket, prefix, sub_dir, filename),
                              '{}/{}'.format(local_data_dir, sub_dir))

    train_path = '{}/{}/{}'.format(local_data_dir, train_dir, train_filename)
    validate_path = '{}/{}/{}'.format(local_data_dir, test_dir, test_filename)

    train = pd.read_csv(train_path, engine='python')
    validate = pd.read_csv(validate_path, engine='python')
    return train, validate


In [4]:
# Download and read both splits; the raw frames are kept under the
# `original_` names and previewed below.
original_train, original_validate = load_data()
original_train.head()


Unnamed: 0,id,date,item_nbr,unit_sales,family,class,perishable,transactions,year,month,day,dayofweek,days_til_end_of_data,dayoff
0,88219279,2016-08-16,103520,10.0,GROCERY I,1028,0,3570,2016,8,16,1,364,False
1,88219280,2016-08-16,103665,4.0,BREAD/BAKERY,2712,1,3570,2016,8,16,1,364,False
2,88219281,2016-08-16,105574,9.0,GROCERY I,1045,0,3570,2016,8,16,1,364,False
3,88219282,2016-08-16,105575,45.0,GROCERY I,1045,0,3570,2016,8,16,1,364,False
4,88219283,2016-08-16,105577,8.0,GROCERY I,1045,0,3570,2016,8,16,1,364,False


---
## Encode non-numerical values and drop the date column

In [5]:
def join_tables(train, validate):
    """Stack the training and validation frames and drop the ``date`` column.

    Both frames are combined so that categorical columns can be encoded with
    one consistent mapping. The original row labels are kept (no re-indexing),
    so labels may repeat in the result. Neither input frame is modified.

    Args:
        train: DataFrame with the training rows (must contain ``date``).
        validate: DataFrame with the validation rows (must contain ``date``).

    Returns:
        DataFrame: rows of ``train`` followed by rows of ``validate``,
        without the ``date`` column.
    """
    print("Joining tables for consistent encoding")
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and keeps the same index and column behaviour.
    return pd.concat([train, validate]).drop('date', axis=1)


def encode_categorical_columns(df):
    """Replace every object/bool column of ``df`` with integer label codes.

    Missing values in the selected columns are filled with the string
    ``'-1'`` and all values are cast to ``str`` before encoding. The cast
    matters because ``encode`` passes a frame on which ``fillna(-1)`` has
    already run, so an object column that had missing values holds both
    strings and the integer ``-1``; LabelEncoder cannot sort such mixed input.
    For columns that were already uniformly strings or booleans the resulting
    codes are the same as without the cast.

    Note: ``df`` is modified in place and also returned.

    Args:
        df: DataFrame whose object and bool columns should be encoded.

    Returns:
        DataFrame: the same ``df`` object with encoded columns.
    """
    categorical = df.select_dtypes(include=['object', 'bool']).fillna('-1')
    for col in categorical.columns:
        # A fresh encoder per column; fit_transform refits on every call anyway.
        df[col] = LabelEncoder().fit_transform(categorical[col].astype(str))
    return df


def encode(train, validate):
    """Encode the categorical columns of both splits with a shared mapping.

    The two frames are joined (which also drops ``date``), missing values are
    filled with -1, object/bool columns are label-encoded, and negative
    ``unit_sales`` are set to 0. The joined frame is then split again by
    matching the ``id`` column against the ids of the original inputs.

    Args:
        train: raw training DataFrame (needs ``id``, ``date``, ``unit_sales``).
        validate: raw validation DataFrame (same columns).

    Returns:
        tuple: ``(train, validate)`` encoded DataFrames.
    """
    print("Encoding categorical variables")
    # Remember which ids belong to which split before the frames are merged.
    ids_of_train, ids_of_validate = train.id, validate.id

    encoded = encode_categorical_columns(join_tables(train, validate).fillna(-1))

    print("Not predicting returns (changing negative unit sales to 0)")
    is_return = encoded.unit_sales < 0
    encoded.loc[is_return, 'unit_sales'] = 0

    # Split the joined frame back into its two parts by id membership.
    validate = encoded[encoded['id'].isin(ids_of_validate)]
    train = encoded[encoded['id'].isin(ids_of_train)]
    return train, validate


In [6]:
# Encode both splits together; `encode` returns new frames, so the
# `original_*` frames stay available for comparison.
train, validate = encode(original_train, original_validate)
train.head()

Encoding categorical variables
Joining tables for consistent encoding
Not predicting returns (changing negative unit sales to 0)


Unnamed: 0,id,item_nbr,unit_sales,family,class,perishable,transactions,year,month,day,dayofweek,days_til_end_of_data,dayoff
0,88219279,103520,10.0,11,1028,0,3570,2016,8,16,1,364,0
1,88219280,103665,4.0,4,2712,1,3570,2016,8,16,1,364,0
2,88219281,105574,9.0,11,1045,0,3570,2016,8,16,1,364,0
3,88219282,105575,45.0,11,1045,0,3570,2016,8,16,1,364,0
4,88219283,105577,8.0,11,1045,0,3570,2016,8,16,1,364,0


In [7]:
# Preview the encoded validation split.
validate.head()

Unnamed: 0,id,item_nbr,unit_sales,family,class,perishable,transactions,year,month,day,dayofweek,days_til_end_of_data,dayoff
0,124124002,96995,2.0,11,1093,0,3936,2017,8,2,2,13,0
1,124124003,99197,1.0,11,1067,0,3936,2017,8,2,2,13,0
2,124124004,103520,5.0,11,1028,0,3936,2017,8,2,2,13,0
3,124124005,103665,1.0,4,2712,1,3936,2017,8,2,2,13,0
4,124124006,105574,17.0,11,1045,0,3936,2017,8,2,2,13,0


---
## Train the model

In [14]:
def train_model(train, model=Model.DECISION_TREE, seed=None):
    """Fit the selected regressor on ``train`` to predict ``unit_sales``.

    All columns except ``unit_sales`` are used as features.

    Args:
        train: encoded training DataFrame containing ``unit_sales``.
        model: member of ``Model`` selecting the regressor; any value other
            than RANDOM_FOREST, ADABOOST or GRADIENT_BOOST falls back to a
            decision tree.
        seed: value passed as ``random_state`` to the regressor.

    Returns:
        tuple: ``(trained_model, params)`` where ``params`` is the dict of
        hyper-parameters describing the fitted estimator.
    """
    print("Training model using regressor: {}".format(model.name))
    train_dropped = train.drop('unit_sales', axis=1)
    target = train['unit_sales']

    if model == Model.RANDOM_FOREST:
        params = {'n_estimators': 10}
        clf = ensemble.RandomForestRegressor(random_state=seed, **params)
    elif model == Model.ADABOOST:
        params = {'n_estimators': 50, 'learning_rate': 1.0, 'loss':'linear'}
        clf = ensemble.AdaBoostRegressor(random_state=seed, **params)
    elif model == Model.GRADIENT_BOOST:
        params = {'n_estimators': 200, 'max_depth': 4}
        clf = ensemble.GradientBoostingRegressor(random_state=seed, **params)
    else:
        clf = tree.DecisionTreeRegressor(random_state=seed)
        # The tree is built with the library default criterion. Report the
        # value the estimator really uses instead of a hard-coded 'mse' that
        # was never passed to it (newer scikit-learn versions name this
        # criterion differently).
        params = {'criterion': clf.criterion}

    trained_model = clf.fit(train_dropped, target)
    return (trained_model, params)


def overwrite_unseen_prediction_with_zero(preds, train, validate, key_cols=('item_nbr', 'store_nbr')):
    """Set the prediction to 0 for validation rows whose key never occurs in ``train``.

    ``validate`` (without ``unit_sales``) is left-joined to ``train`` on
    ``key_cols``; validation rows that end up with a null ``unit_sales`` have
    their prediction replaced by 0. The caller's ``validate`` frame is not
    modified: the work is done on a scratch copy, so no ``preds`` column
    leaks into frames that are later passed to ``model.predict``.

    Args:
        preds: predictions, one per row of ``validate`` in row order.
        train: training DataFrame with ``id``, ``unit_sales`` and ``key_cols``.
        validate: validation DataFrame with ``id`` and ``key_cols``.
        key_cols: columns identifying an item. Defaults to item and store;
            pass e.g. ``['item_nbr']`` for data without a store column.

    Returns:
        list: the predictions with unseen rows set to 0.
    """
    cols_item_store = list(key_cols)
    cols_to_use = validate.columns.drop('unit_sales') if 'unit_sales' in validate.columns else validate.columns
    validate_train_joined = pd.merge(validate[cols_to_use], train, on=cols_item_store, how='left')
    # Rows without a match in train carry a null unit_sales after the left join.
    unseen = validate_train_joined[validate_train_joined['unit_sales'].isnull()]

    # `id` exists in both frames, so the validation id is suffixed `_x` by merge.
    scratch = validate[['id']].copy()
    scratch['preds'] = preds
    scratch.loc[scratch['id'].isin(unseen['id_x']), 'preds'] = 0
    return scratch['preds'].tolist()


def make_predictions(model, validate):
    """Predict with ``model`` on the validation features.

    The ``unit_sales`` column is removed and remaining missing values are
    filled with -1 before calling ``model.predict``.

    Args:
        model: fitted estimator exposing ``predict``.
        validate: validation DataFrame containing ``unit_sales``.

    Returns:
        Whatever ``model.predict`` returns for the prepared features.
    """
    print("Making prediction on validation data")
    features = validate.drop(columns=['unit_sales'])
    features = features.fillna(-1)
    return model.predict(features)


def write_predictions_and_score(evaluation_metrics, model, columns_used, key="decision_tree"):
    """Persist the fitted model and its evaluation metrics to disk.

    The model is written with joblib to ``data/<key>/model.pkl`` and the
    metrics are written as JSON to ``results/metrics.json``. Missing
    directories are created.

    Args:
        evaluation_metrics: JSON-serialisable dict of metric values.
        model: fitted estimator to dump.
        columns_used: currently unused; kept for interface compatibility.
        key: sub-directory of ``data`` for the model file. The default keeps
            the historical ``decision_tree`` location regardless of the
            regressor type; pass another key to store models separately.
    """
    model_dir = 'data/{}'.format(key)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(model_dir, exist_ok=True)
    filename = 'data/{}/model.pkl'.format(key)
    print("Writing to {}".format(filename))
    joblib.dump(model, filename)

    filename = 'results/metrics.json'
    print("Writing to {}".format(filename))
    os.makedirs('results', exist_ok=True)
    # The file is only written, so plain 'w' is sufficient.
    with open(filename, 'w') as score_file:
        json.dump(evaluation_metrics, score_file)

        
def eval_nwrmsle(predictions, targets, weights):
    """Compute the weighted root mean squared logarithmic error.

    The result is ``sqrt(sum(w * (log(p + 1) - log(t + 1))**2) / sum(w))``
    with ``w = 1 + 0.25 * weights``. Negative targets are treated as NaN.
    Negative predictions are replaced by NaN when ``predictions`` is a list
    or a pandas Series; the caller's data is never modified.

    NOTE(review): a NumPy array of predictions is used as-is, so negative
    values are not masked for that input type. Left unchanged to keep
    existing metric values stable; confirm whether masking is wanted there.

    Args:
        predictions: list, pandas Series or NumPy array of predicted values.
        targets: array-like with ``astype`` (NumPy array or Series) of true values.
        weights: array-like of per-row weights (the notebook passes the
            ``perishable`` column).

    Returns:
        The metric as a float scalar.
    """
    if isinstance(predictions, list):
        predictions = np.array([np.nan if x < 0 else x for x in predictions])
    elif isinstance(predictions, pd.Series):
        # mask() returns a new Series, so the caller's predictions are not
        # overwritten the way in-place boolean assignment would.
        predictions = predictions.mask(predictions < 0)
    targetsf = targets.astype(float)
    targetsf[targets < 0] = np.nan
    weights = 1 + 0.25 * weights
    log_square_errors = (np.log(predictions + 1) - np.log(targetsf + 1)) ** 2
    return np.sqrt(np.sum(weights * log_square_errors) / np.sum(weights))


In [29]:
# The regressor choice and the fitted estimator live under different names.
# Rebinding `model` from the enum member to the estimator made this cell fail
# on a second run (train_model calls `model.name`, which an estimator lacks).
model_type = Model.RANDOM_FOREST
seed  = 8675309

#    original_train, original_validate = load_data()
#    train, validate = encode(original_train, original_validate)

# with tracking.track() as track:
#         track.set_model(model_type)

# `model` is the fitted estimator used by the evaluation cell below.
model, params = train_model(train, model_type, seed)

#        track.log_params(params)


Training model using regressor: RANDOM_FOREST


In [30]:
# Score the fitted model on the validation split and persist model + metrics.
validation_predictions = make_predictions(model, validate)

print("Calculating metrics")

actual_unit_sales = validate['unit_sales'].values
perishable_flags = validate['perishable'].values

# A negative r2_score means the model does worse than always predicting the
# mean of the validation targets.
evaluation_metrics = dict(
    nwrmsle=eval_nwrmsle(validation_predictions, actual_unit_sales, perishable_flags),
    r2_score=metrics.r2_score(y_true=actual_unit_sales, y_pred=validation_predictions),
)

#        track.log_metrics(evaluation_metrics)

write_predictions_and_score(evaluation_metrics, model, original_train.columns)

print("Evaluation done with metrics {}.".format(json.dumps(evaluation_metrics)))


Making prediction on validation data
Calculating metrics
Writing to data/decision_tree/model.pkl
Writing to results/metrics.json
Evaluation done with metrics {"nwrmsle": 0.8355623479315407, "r2_score": -0.16134290849725463}.


In [None]:
# main(model=Model.RANDOM_FOREST, seed=8675309)
 