# Notes

This notebook is used as a prototype for the inference script for Sagemaker Endpoint.

In [1]:
!pip install -U scikit-learn==0.20



In [2]:
import json
import logging
import sys
import os
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import pickle
from io import StringIO 

logging.basicConfig(
    format="%(filename)s %(asctime)s %(levelname)s Line no: %(lineno)d %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S%z",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

log.addHandler(logging.StreamHandler(sys.stdout))

JSON_CONTENT_TYPE = "application/json"
NUMPY_CONTENT_TYPE = "application/x-npy"
CSV_CONTENT_TYPE = "text/csv"

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ._gradient_boosting import predict_stages
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ._gradient_boosting import predict_stages


In [3]:
def determine_time_of_day(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Determine time of day based on hour of the day.
    Time of day: early morning, morning, afternoon, or evening.

    :param df: input dataframe to be processed
    :type df: pd.DataFrame
    :param column_name: column that stores hour of day
    :type column_name: str
    :return: input dataframe with additional time_of_day column
    :rtype: pd.DataFrame
    """
    condition_list = [
        ((df[column_name] >= 0) & (df[column_name] < 6)),
        ((df[column_name] >= 6) & (df[column_name] < 12)),
        ((df[column_name] >= 12) & (df[column_name] < 18)),
        ((df[column_name] >= 18) & (df[column_name] < 25)),
    ]

    choice_list = ["early_morning", "morning", "afternoon", "evening"]

    df["time_of_day"] = np.select(condition_list, choice_list, default="unknown")
    return df


def preprocess_input(df_test):
    ## convert '?' into -99
    ## reformat data types
    for col in df_test.columns:
        ## replace missing value with a numeric value, e.g., -99
        mask = df_test[col] == "?"
        df_test.loc[mask, col] = -99

    ## reformat data types
    # then, convert to numeric
    numeric_column_list = [
        "startHour",
        "startWeekday",
        "duration",
        "cCount",
        "cMinPrice",
        "cMaxPrice",
        "cSumPrice",
        "bCount",
        "bMinPrice",
        "bMaxPrice",
        "bSumPrice",
        "bStep",
        "customerNo",
        "maxVal",
        "customerScore",
        "accountLifetime",
        "payments",
        "age",
        "address",
        "lastOrder",
    ]

    for col in numeric_column_list:
        df_test[col] = df_test[col].astype(float)

    condition_list = [
        (df_test["onlineStatus"] == "y"),
        (df_test["onlineStatus"] == "n"),
    ]
    choice_list = [1, 0]
    df_test["onlineStatus"] = np.select(
        condition_list, choice_list, default=df_test["onlineStatus"]
    )

    df_test["address"] = df_test["address"].astype(int)

    ## remove ID columns
    df_test.drop(labels=["sessionNo", "customerNo"], axis=1, inplace=True)

    ## determine time of day
    df_test = determine_time_of_day(df=df_test, column_name="startHour")

    ## one hot encoding
    selected_feature_list = ["availability", "address", "time_of_day", "onlineStatus"]
    for selected_feature in selected_feature_list:
        df_dummy_values = pd.get_dummies(df_test[selected_feature])
        df_dummy_values.columns = [
            selected_feature.replace(" ", "_") + "_" + str(col)
            for col in df_dummy_values.columns
        ]
        df_test = pd.concat([df_test, df_dummy_values], axis=1)
        ## remove redundant features - we've done one hot encoding
        df_test.drop(labels=selected_feature, axis=1, inplace=True)

    ## reformat
    categorical_feature_list = [
        "availability_-99",
        "availability_completely_not_determinable",
        "availability_completely_not_orderable",
        "availability_completely_orderable",
        "availability_mainly_not_determinable",
        "availability_mainly_not_orderable",
        "availability_mainly_orderable",
        "address_-99",
        "address_1",
        "address_2",
        "time_of_day_afternoon",
        "time_of_day_early_morning",
        "time_of_day_evening",
        "onlineStatus_-99",
        "onlineStatus_0",
    ]

    for col in categorical_feature_list:
        try:
            df_test[col] = df_test[col].astype(int)
        except:
            df_test[col] = 0

    return df_test

In [4]:
def model_fn(model_dir):
    log.info(f"In model_fn. Model directory is {model_dir}")

    model_file_path = os.path.join(model_dir, "model.pkl")

    log.info(f"Loading the model from {model_file_path}")
    with open(model_file_path, "rb") as f:
        model_clf = pickle.load(f)

    log.info(f"Model is successfully loaded from {model_file_path}")

    return model_clf

In [27]:
def input_fn(request_body, content_type):
    assert (
        content_type in [JSON_CONTENT_TYPE, CSV_CONTENT_TYPE, NUMPY_CONTENT_TYPE]
    ), f"Request has an unsupported ContentType in content_type: {content_type}"

    log.info(f"Request body CONTENT-TYPE is: {content_type}")
    log.info(f"Request body TYPE is: {type(request_body)}")
    
    log.info("Deserializing the input data.")
    log.info(f"Request body is: {request_body}")
    
    try:
        if content_type == JSON_CONTENT_TYPE:
            ## convert input json object as a dataframe of one row
            request = json.loads(request_body)
            log.info(f"Loaded JSON object: {request}")
            df_test = pd.json_normalize(request)
        elif content_type == NUMPY_CONTENT_TYPE:
            request_body = request_body.decode('utf-8')
            request = np.load(request_body)
            log.info(f"Loaded JSON object: {request}")
            df_test = pd.DataFrame(request)
        elif content_type == CSV_CONTENT_TYPE:
            # data = request_body.decode('utf-8')
            # s = StringIO.StringIO(data)
            # s = StringIO(request_body)
            # df_test = pd.read_csv(s, header=None)
            df_test = pd.DataFrame(request_body)
            
        df_test = preprocess_input(df_test=df_test)
    except Exception as e:
        log.info(e)
        df_test = pd.read_csv(request_body)
        df_test = preprocess_input(df_test=request_body)
        
    return df_test

In [14]:
# inference
def predict_fn(input_object, model):
    log.info("In predict_fn")

    log.info("Calling model")
    feature_name_list = model.feature_names
    prediction = model.predict(input_object[feature_name_list])

    return prediction

Here, we pass a JSON object as the test data.

In [15]:
def main():
    model_dir = "../model/"
    model = model_fn(model_dir)
    
    payload = {
        "sessionNo": 101,
        "startHour": 4,
        "startWeekday": 7,
        "duration": 0,
        "cCount": 2,
        "cMinPrice": 30,
        "cMaxPrice": 40,
        "cSumPrice": 70,
        "bCount": 1,
        "bMinPrice": 30,
        "bMaxPrice": 30,
        "bSumPrice": 30,
        "bStep": "?",
        "onlineStatus": "?",
        "availability": "?",
        "customerNo": 39,
        "maxVal": 200,
        "customerScore": 65,
        "accountLifetime": 30,
        "payments": 2,
        "age": 39,
        "address": 1,
        "lastOrder": 30,
    }
    
    request_body = json.dumps(payload)
    content_type = 'application/json'
    
    df_test = input_fn(request_body, content_type)
    
    print(df_test.head())
    
    pred = predict_fn(df_test, model)
    print(pred)
        
if __name__ == "__main__":
    main()

In model_fn. Model directory is ../model/


4092385979.py 2022-01-06 13:51:22+1300 INFO Line no: 2 In model_fn. Model directory is ../model/


Loading the model from ../model/model.pkl


4092385979.py 2022-01-06 13:51:22+1300 INFO Line no: 6 Loading the model from ../model/model.pkl


Model is successfully loaded from ../model/model.pkl


4092385979.py 2022-01-06 13:51:23+1300 INFO Line no: 10 Model is successfully loaded from ../model/model.pkl


Request body CONTENT-TYPE is: application/json


498457568.py 2022-01-06 13:51:23+1300 INFO Line no: 6 Request body CONTENT-TYPE is: application/json


Request body TYPE is: <class 'str'>


498457568.py 2022-01-06 13:51:23+1300 INFO Line no: 7 Request body TYPE is: <class 'str'>


Deserializing the input data.


498457568.py 2022-01-06 13:51:23+1300 INFO Line no: 9 Deserializing the input data.


Request body is: {"sessionNo": 101, "startHour": 4, "startWeekday": 7, "duration": 0, "cCount": 2, "cMinPrice": 30, "cMaxPrice": 40, "cSumPrice": 70, "bCount": 1, "bMinPrice": 30, "bMaxPrice": 30, "bSumPrice": 30, "bStep": "?", "onlineStatus": "?", "availability": "?", "customerNo": 39, "maxVal": 200, "customerScore": 65, "accountLifetime": 30, "payments": 2, "age": 39, "address": 1, "lastOrder": 30}


498457568.py 2022-01-06 13:51:23+1300 INFO Line no: 10 Request body is: {"sessionNo": 101, "startHour": 4, "startWeekday": 7, "duration": 0, "cCount": 2, "cMinPrice": 30, "cMaxPrice": 40, "cSumPrice": 70, "bCount": 1, "bMinPrice": 30, "bMaxPrice": 30, "bSumPrice": 30, "bStep": "?", "onlineStatus": "?", "availability": "?", "customerNo": 39, "maxVal": 200, "customerScore": 65, "accountLifetime": 30, "payments": 2, "age": 39, "address": 1, "lastOrder": 30}


Loaded JSON object: {'sessionNo': 101, 'startHour': 4, 'startWeekday': 7, 'duration': 0, 'cCount': 2, 'cMinPrice': 30, 'cMaxPrice': 40, 'cSumPrice': 70, 'bCount': 1, 'bMinPrice': 30, 'bMaxPrice': 30, 'bSumPrice': 30, 'bStep': '?', 'onlineStatus': '?', 'availability': '?', 'customerNo': 39, 'maxVal': 200, 'customerScore': 65, 'accountLifetime': 30, 'payments': 2, 'age': 39, 'address': 1, 'lastOrder': 30}


498457568.py 2022-01-06 13:51:23+1300 INFO Line no: 16 Loaded JSON object: {'sessionNo': 101, 'startHour': 4, 'startWeekday': 7, 'duration': 0, 'cCount': 2, 'cMinPrice': 30, 'cMaxPrice': 40, 'cSumPrice': 70, 'bCount': 1, 'bMinPrice': 30, 'bMaxPrice': 30, 'bSumPrice': 30, 'bStep': '?', 'onlineStatus': '?', 'availability': '?', 'customerNo': 39, 'maxVal': 200, 'customerScore': 65, 'accountLifetime': 30, 'payments': 2, 'age': 39, 'address': 1, 'lastOrder': 30}


   startHour  startWeekday  duration  cCount  cMinPrice  cMaxPrice  cSumPrice  \
0        4.0           7.0       0.0     2.0       30.0       40.0       70.0   

   bCount  bMinPrice  bMaxPrice  ...  availability_completely_not_orderable  \
0     1.0       30.0       30.0  ...                                      0   

   availability_completely_orderable  availability_mainly_not_determinable  \
0                                  0                                     0   

   availability_mainly_not_orderable  availability_mainly_orderable  \
0                                  0                              0   

   address_-99  address_2  time_of_day_afternoon  time_of_day_evening  \
0            0          0                      0                    0   

   onlineStatus_0  
0               0  

[1 rows x 33 columns]
In predict_fn


1955181547.py 2022-01-06 13:51:23+1300 INFO Line no: 3 In predict_fn


Calling model


1955181547.py 2022-01-06 13:51:24+1300 INFO Line no: 5 Calling model


[0]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int)


Pass CSV input - not working yet. Might need to adjust the inference logic (e.g., hard-code the feature names since the input is comma-separated values without headers -- currently we assume it's a CSV file).

In [29]:
test_file_path = os.path.join(
    '..',
    'dataset',
    'feature-engineering',
    'df_test_rfe.csv'
)
df_input = pd.read_csv(test_file_path).sample(5)

Unnamed: 0,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,...,payments,age,lastOrder,availability_completely_orderable,address_1,address_2,time_of_day_afternoon,time_of_day_evening,onlineStatus_-99,order
3574,15.0,6.0,480.32,12.0,9.99,44.99,343.88,12.0,9.99,44.99,...,18.0,33.0,14.0,1,0,1,1,0,0,1
84644,17.0,7.0,3547.48,30.0,3.99,29.99,430.55,5.0,3.99,20.0,...,3.0,54.0,8.0,1,0,1,1,0,0,1
123897,15.0,7.0,690.174,4.0,14.99,14.99,59.96,1.0,14.99,14.99,...,-99.0,-99.0,-99.0,1,0,0,1,0,0,0
126160,10.0,5.0,595.122,10.0,5.99,24.99,154.92,4.0,22.99,24.99,...,9.0,49.0,11.0,1,0,1,0,0,0,1
27919,18.0,6.0,1302.851,13.0,29.99,59.95,719.43,3.0,59.95,59.95,...,23.0,47.0,6.0,1,0,1,0,1,0,1


In [38]:
', '.join([str(elem) for elem in df_input.iloc[0].values.tolist()])

'15.0, 6.0, 480.32, 12.0, 9.99, 44.99, 343.88, 12.0, 9.99, 44.99, 343.88, 2.0, 2600.0, 522.0, 100.0, 18.0, 33.0, 14.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0'

In [28]:
def main():
    model_dir = "../model/"
    model = model_fn(model_dir)
    
    test_file_path = os.path.join(
        '..',
        'dataset',
        'feature-engineering',
        'df_test_rfe.csv'
    )
    df_input = pd.read_csv(test_file_path)

    request_body = df_input.sample(5)
    content_type = 'text/csv'
    
    df_test = input_fn(request_body, content_type)
    
    print(df_test.head())
    
    pred = predict_fn(df_test, model)
    print(pred)
        
if __name__ == "__main__":
    main()

In model_fn. Model directory is ../model/


4092385979.py 2022-01-06 14:13:10+1300 INFO Line no: 2 In model_fn. Model directory is ../model/


Loading the model from ../model/model.pkl


4092385979.py 2022-01-06 14:13:10+1300 INFO Line no: 6 Loading the model from ../model/model.pkl


Model is successfully loaded from ../model/model.pkl


4092385979.py 2022-01-06 14:13:11+1300 INFO Line no: 10 Model is successfully loaded from ../model/model.pkl


Request body CONTENT-TYPE is: text/csv


2071135398.py 2022-01-06 14:13:11+1300 INFO Line no: 6 Request body CONTENT-TYPE is: text/csv


Request body TYPE is: <class 'pandas.core.frame.DataFrame'>


2071135398.py 2022-01-06 14:13:11+1300 INFO Line no: 7 Request body TYPE is: <class 'pandas.core.frame.DataFrame'>


Deserializing the input data.


2071135398.py 2022-01-06 14:13:11+1300 INFO Line no: 9 Deserializing the input data.


Request body is:         startHour  startWeekday  duration  cCount  cMinPrice  cMaxPrice  \
98017        13.0           5.0  1771.885    25.0      14.99      29.99   
69166         9.0           6.0  3816.593    85.0       8.00     239.99   
29003        21.0           6.0  5978.640   121.0       9.99      89.95   
101862       11.0           7.0   445.757     5.0      21.95      21.95   
80963        16.0           5.0  3581.435    19.0       9.99      39.95   

        cSumPrice  bCount  bMinPrice  bMaxPrice  ...  payments   age  \
98017      382.84     4.0      14.99      29.99  ...     -99.0 -99.0   
69166     5590.75     2.0       9.99     129.99  ...      12.0  63.0   
29003     3994.92     4.0       9.99      39.90  ...     -99.0 -99.0   
101862      43.90     1.0      21.95      21.95  ...     -99.0 -99.0   
80963      438.77     6.0       9.99      29.99  ...       5.0  42.0   

        lastOrder  availability_completely_orderable  address_1  address_2  \
98017       -99.0    

2071135398.py 2022-01-06 14:13:11+1300 INFO Line no: 10 Request body is:         startHour  startWeekday  duration  cCount  cMinPrice  cMaxPrice  \
98017        13.0           5.0  1771.885    25.0      14.99      29.99   
69166         9.0           6.0  3816.593    85.0       8.00     239.99   
29003        21.0           6.0  5978.640   121.0       9.99      89.95   
101862       11.0           7.0   445.757     5.0      21.95      21.95   
80963        16.0           5.0  3581.435    19.0       9.99      39.95   

        cSumPrice  bCount  bMinPrice  bMaxPrice  ...  payments   age  \
98017      382.84     4.0      14.99      29.99  ...     -99.0 -99.0   
69166     5590.75     2.0       9.99     129.99  ...      12.0  63.0   
29003     3994.92     4.0       9.99      39.90  ...     -99.0 -99.0   
101862      43.90     1.0      21.95      21.95  ...     -99.0 -99.0   
80963      438.77     6.0       9.99      29.99  ...       5.0  42.0   

        lastOrder  availability_completely_

'customerNo'


2071135398.py 2022-01-06 14:13:11+1300 INFO Line no: 32 'customerNo'


TypeError: argument of type 'method' is not iterable