# Data Processing

In [70]:
# import necessary libraries
import math
import json

import pandas as pd
import numpy as np
from sklearn import preprocessing

% matplotlib inline

# helper functions
def normalize_column(col):
    """Min-max scale one column.

    :param col: object exposing ``.values`` (e.g. a pandas Series) with numeric data
    :return: float64 array of shape (n, 1) as returned by ``MinMaxScaler.fit_transform``
    """
    # the scaler works on 2-D float input, so view the column as an (n, 1) matrix
    column_2d = col.values.reshape(-1, 1).astype("float64")
    return preprocessing.MinMaxScaler().fit_transform(column_2d)

## Portfolio Dataset
The portfolio is the smallest dataset and contains all the offer information. We mainly preprocess two columns:
- offer_type: there are three offer types (bogo, informational, discount). We need to convert them into a numeric representation; the code below one-hot encodes them.
- channels: a list of channel categories such as web, email, mobile and social. We break it out and use one-hot encoding to represent this information.

In [71]:
def process_portfolio(filepath="data/portfolio.json"):
    """Load the offer portfolio and turn it into a numeric feature table.

    The function one-hot encodes ``offer_type``, expands the ``channels`` list
    into four 0/1 indicator columns, adds two ratio features and min-max scales
    ``difficulty``, ``duration`` and ``reward``.

    :param filepath: path to the line-delimited JSON portfolio file
    :return: DataFrame with the engineered features; the ``offer_type`` and
             ``channels`` columns are dropped, every other input column is kept
    """
    # read in the json files
    portfolio = pd.read_json(filepath, orient='records', lines=True)

    # numeric offer types: request integer indicators explicitly, because newer
    # pandas versions default to bool dummies, which would make the merged
    # feature matrix object-typed and put True/False into the exported CSV
    offer_type_dummies = pd.get_dummies(
        portfolio["offer_type"], prefix="offer_type", dtype=int)
    portfolio = pd.concat(
        [portfolio, offer_type_dummies], axis=1).drop(["offer_type"], axis=1)

    # break out channels column into one 0/1 indicator per channel
    for channel in ("email", "web", "mobile", "social"):
        portfolio["channel_" + channel] = portfolio["channels"].apply(
            lambda offered, channel=channel: int(channel in offered))

    # two ratio features
    # 0/0 gives NaN and x/0 gives +/-inf; both are mapped to 0 so that no
    # non-finite value reaches the model
    reward_difficulty = portfolio["reward"] / portfolio["difficulty"]
    portfolio["reward_difficulty"] = (
        reward_difficulty.replace([np.inf, -np.inf], np.nan).fillna(0))
    portfolio["difficulty_duration"] = portfolio["difficulty"] / portfolio["duration"]

    # normalize columns (after the ratios, which use the raw values)
    for column in ("difficulty", "duration", "reward"):
        portfolio[column] = normalize_column(portfolio[column])

    # drop channels
    return portfolio.drop(["channels"], axis=1)

In [72]:
# build the processed offer table from the default file path and preview it
portfolio_df = process_portfolio()
portfolio_df.head()

Unnamed: 0,difficulty,duration,id,reward,offer_type_bogo,offer_type_discount,offer_type_informational,channel_email,channel_web,channel_mobile,channel_social,reward_difficulty,difficulty_duration
0,0.5,0.571429,ae264e3637204a6fb9bb56bc8210ddfd,1.0,1,0,0,1,0,1,1,1.0,1.428571
1,0.5,0.285714,4d5c57ea9a6940dd891ad53e9dbe8da0,1.0,1,0,0,1,1,1,1,1.0,2.0
2,0.0,0.142857,3f207df678b143eea3cee63160fa8bed,0.0,0,0,1,1,1,1,0,0.0,0.0
3,0.25,0.571429,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.5,1,0,0,1,1,1,0,1.0,0.714286
4,1.0,1.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,0.5,0,1,0,1,1,0,0,0.25,2.0


After preprocessing, we can see some properties of this dataset.
1. Most offers carry a reward: there are 4 discount offers and 4 BOGO offers.
2. Rewards for BOGO offers are higher than those for discount offers.
3. Email is the main distribution channel, while social is the least used.

## Profile Dataset



In [73]:
def process_profile(filepath='data/profile.json'):
    """Load the customer profiles and turn them into a numeric feature table.

    Missing incomes are filled with the median income, ages equal to 118 are
    treated as missing and filled with the median of the remaining ages,
    ``gender`` is one-hot encoded (including a NaN indicator),
    ``became_member_on`` is converted to Unix seconds, and ``age``,
    ``became_member_on`` and ``income`` are min-max scaled.

    :param filepath: path to the line-delimited JSON profile file
    :return: DataFrame with the processed columns; ``gender`` is replaced by
             its ``gender_*`` indicator columns
    """
    # read in the json files
    profile = pd.read_json(filepath, orient='records', lines=True)

    # fill in income nan with median (median() skips NaN by default)
    profile = profile.fillna({"income": profile["income"].median()})

    # fill in age nan with median; age 118 is treated as a missing value
    profile["age"] = profile["age"].mask(profile["age"] == 118)
    profile = profile.fillna({"age": profile["age"].median()})

    # one hot encoding for gender; dtype=int keeps the indicators 0/1 on pandas
    # versions whose default dummy dtype is bool
    gender_dummies = pd.get_dummies(
        profile['gender'], prefix='gender', dummy_na=True, dtype=int)
    profile = pd.concat([profile, gender_dummies], axis=1).drop(["gender"], axis=1)

    # convert date to unix timestamp (seconds); dividing the offset from the
    # epoch by one second does not depend on the datetime storage resolution
    member_since = pd.to_datetime(profile["became_member_on"], format='%Y%m%d')
    profile["became_member_on"] = (
        (member_since - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s"))

    # normalize
    for column in ("age", "became_member_on", "income"):
        profile[column] = normalize_column(profile[column])

    return profile

In [74]:
# build the processed customer table from the default file path and preview it
profile_df = process_profile()
profile_df.head()

Unnamed: 0,age,became_member_on,id,income,gender_F,gender_M,gender_O,gender_nan
0,0.445783,0.709819,68be06ca386d4c31939f3a4f0e3dd783,0.377778,0,0,0,1
1,0.445783,0.793747,0610b486422d4921ae7d2bf64640c50b,0.911111,1,0,0,0
2,0.445783,0.99232,38fe809add3b4fcf9315a9694bb96ff5,0.377778,0,0,0,1
3,0.686747,0.756994,78afa995795e4d85b5d9ceeca43f5fef,0.777778,1,0,0,0
4,0.445783,0.804717,a03223e636434f42ac4c3df47e8bac43,0.377778,0,0,0,1


## Transcript Dataset

Let's preprocess the transcript data. There are three types of event: offer received, offer viewed and offer completed. The `value` column is complex, and its content varies with the event type, so we split it into `offer_id`, `amount` and `reward` columns.

In [75]:
# the transcript is preprocessed outside this notebook; here we only load the result
# (see scripts/process_transcript.py to learn how data preprocessing works)
transcript_df = pd.read_csv('data/processed_transcript.csv')
transcript_df.head()

Unnamed: 0,person,offer_id,offer_type,difficulty,amount,receive_time,view_time,complete_time,expected_complete_time,is_in_expected_complete_time,is_enough_amount,is_view_event,is_complete_event,is_complete
0,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,informational,0.0,22.16,168,192.0,,240.0,False,True,True,False,True
1,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,informational,0.0,8.57,336,372.0,,432.0,False,True,True,False,True
2,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,bogo,5.0,8.57,408,456.0,414.0,528.0,True,True,True,True,True
3,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,discount,10.0,14.11,504,540.0,528.0,744.0,True,True,True,True,True
4,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,discount,10.0,10.27,576,,576.0,744.0,True,True,False,True,False


## Combine datasets

In [76]:
# merge three tables: attach customer features (via person) and offer features
# (via offer_id) to every transcript row, then drop the join keys
df = (
    transcript_df[["person", "offer_id", "is_complete"]]
    .merge(profile_df, left_on="person", right_on="id")
    .drop(columns=["id", "person"])
    .merge(portfolio_df, left_on="offer_id", right_on="id")
    .drop(columns=["id", "offer_id"])
)

In [77]:
# turn the boolean label into 1 / 0
df["is_complete"] = df["is_complete"].astype(int)

In [78]:
# (rows, columns) of the merged table: 1 label column + feature columns
df.shape

(76277, 20)

In [79]:
# preview the merged table; the label `is_complete` is the first column
df.head()

Unnamed: 0,is_complete,age,became_member_on,income,gender_F,gender_M,gender_O,gender_nan,difficulty,duration,reward,offer_type_bogo,offer_type_discount,offer_type_informational,channel_email,channel_web,channel_mobile,channel_social,reward_difficulty,difficulty_duration
0,1,0.180723,0.74712,0.466667,0,1,0,0,0.0,0.0,0.0,0,0,1,1,0,1,1,0.0,0.0
1,0,0.26506,0.891388,0.3,0,0,1,0,0.0,0.0,0.0,0,0,1,1,0,1,1,0.0,0.0
2,1,0.493976,0.52057,0.666667,1,0,0,0,0.0,0.0,0.0,0,0,1,1,0,1,1,0.0,0.0
3,1,0.072289,0.658804,0.333333,1,0,0,0,0.0,0.0,0.0,0,0,1,1,0,1,1,0.0,0.0
4,1,0.096386,0.780581,0.477778,1,0,0,0,0.0,0.0,0.0,0,0,1,1,0,1,1,0.0,0.0


## Prepare training and test datasets

In [80]:
# split into train/test
def train_test_split(df, train_frac=0.7, seed=666):
    '''Shuffle the rows of ``df`` and split them into train and test sets,
       separating the class label (the first column) from the features.
       :param df: DataFrame whose first column is the label and whose remaining
                  columns are the features
       :param train_frac: The decimal fraction of rows that should be training data
       :param seed: Random seed for shuffling and reproducibility, default = 666
       :return: Two tuples (in order): (train_features, train_labels), (test_features, test_labels)
       '''

    # Work on a private copy: ``df.values`` may be a view of the frame's own
    # storage, and shuffling such a view in place would scramble the caller's data.
    df_matrix = df.values.copy()

    # shuffle whole rows so that labels stay attached to their features
    np.random.seed(seed)
    np.random.shuffle(df_matrix)

    # the first ``train_size`` shuffled rows form the training set
    train_size = int(df_matrix.shape[0] * train_frac)
    train_rows = df_matrix[:train_size]
    test_rows = df_matrix[train_size:]

    # column 0 is the label, everything after it is a feature
    train_features, train_labels = train_rows[:, 1:], train_rows[:, 0]
    test_features, test_labels = test_rows[:, 1:], test_rows[:, 0]

    return (train_features, train_labels), (test_features, test_labels)


In [81]:
# split with the function's defaults (train_frac=0.7, seed=666)
(train_x, train_y), (test_x, test_y) = train_test_split(df)

In [82]:
# sanity check: number of training rows and feature columns
train_x.shape, train_y.shape

((53393, 19), (53393,))

In [83]:
# sanity check: number of test rows and feature columns
test_x.shape, test_y.shape

((22884, 19), (22884,))

## Save the data locally and upload to S3

In [84]:
# training file: label in the first column followed by the features,
# written without header row and without index column
pd.concat([
    pd.DataFrame(train_y), 
    pd.DataFrame(train_x)
], axis=1).to_csv("data/train.csv", header=False, index=False)

In [85]:
# labelled test file (label first, then features); read back later in this
# notebook to evaluate the PyTorch models
pd.concat([
    pd.DataFrame(test_y), 
    pd.DataFrame(test_x)
], axis=1).to_csv("data/test_full.csv", header=False, index=False)

In [86]:
# features-only test file (no label column); this is the file uploaded to S3
# and fed to the XGBoost batch transform below
pd.DataFrame(test_x).to_csv("data/test.csv", header=False, index=False)

In [78]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

# SageMaker session for this notebook; used below for uploads, the default
# bucket name and the region.
session = sagemaker.Session()

# IAM role the notebook is currently running under. The training job is given
# this same role, which is sufficient for this simple use case.
role = get_execution_role()

In [80]:
# key prefix under which this experiment's files are stored in S3
prefix = 'starbucks-xgboost'

# upload the training CSV; upload_data returns the S3 URI of the uploaded file
train_location = session.upload_data("data/train.csv", key_prefix=prefix)

In [81]:
# show where the training data ended up in S3
train_location

's3://sagemaker-us-west-2-203336335427/starbucks-xgboost/train.csv'

In [84]:
# upload the features-only test CSV under the same prefix and show its S3 URI
test_location = session.upload_data("data/test.csv", key_prefix=prefix)
test_location

's3://sagemaker-us-west-2-203336335427/starbucks-xgboost/test.csv'

## Train XGBoost model

In [82]:
# Look up the image URI of the built-in XGBoost training container for the session's region.
container = get_image_uri(session.boto_region_name, 'xgboost')

# Now that we know which container to use, we can construct the estimator object.
xgb = sagemaker.estimator.Estimator(
    container, # The name of the training container
    role,      # The IAM role to use (our current role in this case)
    train_instance_count=1, # The number of instances to use for training
    train_instance_type='ml.m4.xlarge', # The type of instance to use for training
    output_path=f's3://{session.default_bucket()}/{prefix}/output',
                                        # Where to save the output (the model artifacts)
    sagemaker_session=session) # The current SageMaker session


# set hyperparameters
# NOTE(review): early_stopping_rounds is set, but only a 'train' channel is
# passed to fit() below and the training log reports that no validation path
# exists -- confirm whether early stopping has any effect in this setup.
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=200)


# Wrap the S3 location of the training data so that SageMaker knows it is in csv format.
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')

# launch the training job with the training channel only
xgb.fit({'train': s3_input_train})

2019-06-27 00:24:32 Starting - Starting the training job...
2019-06-27 00:24:33 Starting - Launching requested ML instances......
2019-06-27 00:25:36 Starting - Preparing the instances for training...
2019-06-27 00:26:29 Downloading - Downloading input data...
2019-06-27 00:26:48 Training - Downloading the training image..
[31mArguments: train[0m
[31m[2019-06-27:00:27:06:INFO] Running standalone xgboost training.[0m
[31m[2019-06-27:00:27:06:INFO] Path /opt/ml/input/data/validation does not exist![0m
[31m[2019-06-27:00:27:06:INFO] File size need to be processed in the node: 7.56mb. Available memory size in the node: 8465.25mb[0m
[31m[2019-06-27:00:27:06:INFO] Determined delimiter of CSV input is ','[0m
[31m[00:27:06] S3DistributionType set as FullyReplicated[0m
[31m[00:27:06] 53393x19 matrix with 1014467 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[00:27:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nod


2019-06-27 00:27:23 Uploading - Uploading generated training model
2019-06-27 00:27:23 Completed - Training job completed
Billable seconds: 54


In [86]:
# score the uploaded test features with a batch transform job on one instance;
# the CSV is split line by line, and wait() blocks until the job has finished
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

........................................!


In [87]:
!aws s3 cp --recursive $xgb_transformer.output_path $"data"

Completed 256.0 KiB/334.6 KiB (3.6 MiB/s) with 1 file(s) remainingCompleted 334.6 KiB/334.6 KiB (4.6 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-west-2-203336335427/xgboost-2019-06-27-00-29-09-523/test.csv.out to data/test.csv.out


In [88]:
# load the downloaded transform output; the file has no header row
predictions = pd.read_csv("data/test.csv.out", header=None)

In [90]:
# round every model output to the nearest integer to obtain hard labels;
# note that this rebinds `predictions` from a DataFrame to a plain list
predictions = [round(num) for num in predictions.squeeze().values]

In [91]:
# share of test rows whose rounded prediction equals the true label
from sklearn.metrics import accuracy_score
accuracy_score(test_y, predictions)

0.7212025869603216

In [92]:
# F1 score of the rounded predictions against the true labels
from sklearn.metrics import f1_score
f1_score(test_y, predictions)

0.682207611077904

In [99]:
# NOTE(review): `predictions` was rounded to integers in an earlier cell, so
# this AUC is computed from hard labels rather than from continuous scores --
# confirm that this is intended.
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, predictions)

0.7159450152591236

## Train Neural Network Model 

In [19]:
import pandas as pd
import boto3
import sagemaker

# session and role used by the PyTorch training jobs and endpoints below
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# name of the session's default S3 bucket (used to build the output paths below)
bucket = sagemaker_session.default_bucket()

In [20]:
# key prefix for the PyTorch experiments; this rebinds `prefix`, which held
# 'starbucks-xgboost' in the XGBoost section
prefix = "starbucks-pytorch"

In [21]:
# import a PyTorch wrapper
from sagemaker.pytorch import PyTorch

# specify an output path
output_path = f"s3://{bucket}/{prefix}"

# instantiate a pytorch estimator that runs pytorch/train.py on one GPU instance;
# the hyperparameters are handed to that script (its handling of them is not
# visible in this notebook). input_features=19 matches the 19 feature columns
# of the training data.
estimator = PyTorch(
    entry_point="train.py",
    source_dir="pytorch",
    role=role,
    framework_version="1.0",
    train_instance_count=1,
    train_instance_type="ml.p2.xlarge",
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    hyperparameters={
        "input_features": 19,
        "hidden_dim": 30, 
        "output_dim": 1,
        "epochs": 100
    })


In [22]:
# reuse the training CSV that was uploaded in the XGBoost section under the
# "starbucks-xgboost" prefix of the default bucket; building the URI from
# `bucket` avoids hard-coding an account-specific bucket name
train_data_path = f"s3://{bucket}/starbucks-xgboost/train.csv"
estimator.fit({'train': train_data_path})

2019-06-28 20:21:10 Starting - Starting the training job...
2019-06-28 20:21:11 Starting - Launching requested ML instances......
2019-06-28 20:22:40 Starting - Preparing the instances for training............
2019-06-28 20:24:21 Downloading - Downloading input data...
2019-06-28 20:24:42 Training - Downloading the training image...
2019-06-28 20:25:31 Training - Training image download completed. Training in progress.
[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-06-28 20:25:32,979 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-06-28 20:25:33,007 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-06-28 20:25:36,033 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-06-28 20:25:36,328 sagemaker-containers INFO     Module train does not provide a setup.

[31mEpoch: 41, Loss: 0.5647252390112323[0m
[31mEpoch: 42, Loss: 0.5647615809035436[0m
[31mEpoch: 43, Loss: 0.5647300049216113[0m
[31mEpoch: 44, Loss: 0.565338591371806[0m
[31mEpoch: 45, Loss: 0.5640366685775559[0m
[31mEpoch: 46, Loss: 0.5651824238511284[0m
[31mEpoch: 47, Loss: 0.5632522821063629[0m
[31mEpoch: 48, Loss: 0.5636568335083764[0m
[31mEpoch: 49, Loss: 0.5639813809442833[0m
[31mEpoch: 50, Loss: 0.5642441238747554[0m
[31mEpoch: 51, Loss: 0.5634700855289059[0m
[31mEpoch: 52, Loss: 0.5641865624554372[0m
[31mEpoch: 53, Loss: 0.5646461112640994[0m
[31mEpoch: 54, Loss: 0.5637507544000274[0m
[31mEpoch: 55, Loss: 0.5636580179386148[0m
[31mEpoch: 56, Loss: 0.5644337049779597[0m
[31mEpoch: 57, Loss: 0.5639795908962519[0m
[31mEpoch: 58, Loss: 0.5639862420518746[0m
[31mEpoch: 59, Loss: 0.5641352043448763[0m
[31mEpoch: 60, Loss: 0.5638479790521248[0m
[31mEpoch: 61, Loss: 0.5640465050712506[0m
[31mEpoch: 62, Loss: 0.5644171724772632[0m
[31mEpoch:

In [23]:
# Deploy the trained model
from sagemaker.pytorch import PyTorchModel

# wrap the artifacts of the training job above; inference code is pytorch/predict.py
model = PyTorchModel(
    entry_point="predict.py",
    role=role, 
    framework_version="1.0",
    model_data=estimator.model_data,
    source_dir="pytorch"
)

# deploy your model to create a predictor (a live endpoint on one ml.t2.medium instance)
predictor = model.deploy(initial_instance_count=1, instance_type="ml.t2.medium")


--------------------------------------------------------------------------------------!

In [91]:
# evaluating the model
# read in test data, assuming it is stored locally
test_data = pd.read_csv("data/test_full.csv", header=None, names=None)

# labels are in the first column
test_y = test_data.iloc[:,0]
test_x = test_data.iloc[:,1:]

# query the endpoint in batches of 1000 rows and flatten each response to 1-D;
# np.hstack needs a real sequence (passing a generator is deprecated), so the
# batches are collected in a list
test_y_preds = np.hstack([
    predictor.predict(test_x.iloc[i: i+1000]).reshape(-1)
    for i in range(0, len(test_x), 1000)])

In [None]:
# accuracy of the PyTorch endpoint's predictions; `predictions` belongs to the
# XGBoost section, the neural-network outputs are in `test_y_preds`
from sklearn.metrics import accuracy_score
accuracy_score(test_y, test_y_preds)

In [92]:
# calculate the test roc_auc_score
# test_y_preds is already a flat 1-D array, so it can be passed directly
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, test_y_preds)

0.6952660611404713

In [93]:
# tear down the baseline model's endpoint now that it has been evaluated
predictor.delete_endpoint()

## Hyperparameter tuning

## lower hidden dim

In [94]:
# specify an output path
output_path = f"s3://{bucket}/{prefix}-lower-hidden-dim"

# instantiate a pytorch estimator; compared with the baseline run only
# hidden_dim changes (30 -> 15)
estimator = PyTorch(
    entry_point="train.py",
    source_dir="pytorch",
    role=role,
    framework_version="1.0",
    train_instance_count=1,
    train_instance_type="ml.p2.xlarge", # "ml.c4.xlarge",
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    hyperparameters={
        "input_features": 19,
        "hidden_dim": 15,
        "output_dim": 1,
        "epochs": 100
    })

# reuse the training CSV uploaded in the XGBoost section; building the URI from
# `bucket` avoids hard-coding an account-specific bucket name
train_data_path = f"s3://{bucket}/starbucks-xgboost/train.csv"
estimator.fit({'train': train_data_path})

2019-06-28 21:01:49 Starting - Starting the training job...
2019-06-28 21:01:52 Starting - Launching requested ML instances......
2019-06-28 21:03:18 Starting - Preparing the instances for training.........
2019-06-28 21:04:48 Downloading - Downloading input data...
2019-06-28 21:05:07 Training - Downloading the training image.....
[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-06-28 21:06:01,468 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-06-28 21:06:01,494 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-06-28 21:06:01,495 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-06-28 21:06:01,743 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-06-28 21:06:01,744 sagemaker-containers I

[31mEpoch: 41, Loss: 0.5724209872636009[0m
[31mEpoch: 42, Loss: 0.5732695232393143[0m
[31mEpoch: 43, Loss: 0.5732778837702024[0m
[31mEpoch: 44, Loss: 0.5730639651640971[0m
[31mEpoch: 45, Loss: 0.5725613469125403[0m
[31mEpoch: 46, Loss: 0.5730033754297856[0m
[31mEpoch: 47, Loss: 0.5724712596766734[0m
[31mEpoch: 48, Loss: 0.5731576812401247[0m
[31mEpoch: 49, Loss: 0.5719016253752878[0m
[31mEpoch: 50, Loss: 0.5727596537813489[0m
[31mEpoch: 51, Loss: 0.5734839259350345[0m
[31mEpoch: 52, Loss: 0.5716362460219905[0m
[31mEpoch: 53, Loss: 0.5726961539618755[0m
[31mEpoch: 54, Loss: 0.5731491646098064[0m
[31mEpoch: 55, Loss: 0.5723278553661112[0m
[31mEpoch: 56, Loss: 0.5728861862186635[0m
[31mEpoch: 57, Loss: 0.5730129739011495[0m
[31mEpoch: 58, Loss: 0.5737174994406405[0m
[31mEpoch: 59, Loss: 0.5730887501259868[0m
[31mEpoch: 60, Loss: 0.5725076032321105[0m
[31mEpoch: 61, Loss: 0.5728811696590332[0m
[31mEpoch: 62, Loss: 0.5727078653481346[0m
[31mEpoch

In [95]:
# Deploy the trained model
from sagemaker.pytorch import PyTorchModel
from sklearn.metrics import roc_auc_score  # used in a later cell, not in this one

# wrap the artifacts of the most recent training job; inference code is pytorch/predict.py
model = PyTorchModel(
    entry_point="predict.py",
    role=role, 
    framework_version="1.0",
    model_data=estimator.model_data,
    source_dir="pytorch"
)

# deploy your model to create a predictor (a live endpoint on one ml.t2.medium instance)
predictor = model.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

--------------------------------------------------------------------------------------------------!

In [96]:
# evaluating the model
# read in test data, assuming it is stored locally
test_data = pd.read_csv("data/test_full.csv", header=None, names=None)

# labels are in the first column
test_y = test_data.iloc[:,0]
test_x = test_data.iloc[:,1:]

# query the endpoint in batches of 1000 rows and flatten each response to 1-D;
# np.hstack needs a real sequence (passing a generator is deprecated), so the
# batches are collected in a list
test_y_preds = np.hstack([
    predictor.predict(test_x.iloc[i: i+1000]).reshape(-1)
    for i in range(0, len(test_x), 1000)])

In [99]:
# calculate the test roc_auc_score from the endpoint's predictions
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, test_y_preds)

0.7009638194081423

In [101]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_y, test_y_preds)

# The predictions are already in memory, so the endpoint is no longer needed.
# Delete it here: the next experiment rebinds `predictor`, after which this
# endpoint could no longer be removed through the notebook.
predictor.delete_endpoint()

accuracy

0.7056895647614053

## Higher epochs

In [102]:
# import a PyTorch wrapper
from sagemaker.pytorch import PyTorch

# specify an output path
output_path = f"s3://{bucket}/{prefix}-higher-epochs"

# instantiate a pytorch estimator; compared with the previous run the epochs
# are doubled (100 -> 200) and a CPU instance type is used
estimator = PyTorch(
    entry_point="train.py",
    source_dir="pytorch",
    role=role,
    framework_version="1.0",
    train_instance_count=1,
    train_instance_type="ml.c4.xlarge",
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    hyperparameters={
        "input_features": 19,
        "hidden_dim": 15,
        "output_dim": 1,
        "epochs": 200
    })

# reuse the training CSV uploaded in the XGBoost section; building the URI from
# `bucket` avoids hard-coding an account-specific bucket name
train_data_path = f"s3://{bucket}/starbucks-xgboost/train.csv"
estimator.fit({'train': train_data_path})

2019-06-28 22:19:15 Starting - Starting the training job...
2019-06-28 22:19:17 Starting - Launching requested ML instances...
2019-06-28 22:20:15 Starting - Preparing the instances for training......
2019-06-28 22:21:08 Downloading - Downloading input data...
2019-06-28 22:21:41 Training - Training image download completed. Training in progress..
[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-06-28 22:21:42,549 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-06-28 22:21:42,551 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-06-28 22:21:42,563 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-06-28 22:21:42,564 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-06-28 22:21:42,869 sagemaker-containers INFO     M

[31mEpoch: 38, Loss: 0.5735095308458761[0m
[31mEpoch: 39, Loss: 0.5728003907013922[0m
[31mEpoch: 40, Loss: 0.5736741857945026[0m
[31mEpoch: 41, Loss: 0.5736510265269753[0m
[31mEpoch: 42, Loss: 0.5721236345863967[0m
[31mEpoch: 43, Loss: 0.5746038772472728[0m
[31mEpoch: 44, Loss: 0.572900599615181[0m
[31mEpoch: 45, Loss: 0.5730355562956145[0m
[31mEpoch: 46, Loss: 0.5732300898806656[0m
[31mEpoch: 47, Loss: 0.5730788228245041[0m
[31mEpoch: 48, Loss: 0.5728198857147818[0m
[31mEpoch: 49, Loss: 0.571591656328587[0m
[31mEpoch: 50, Loss: 0.5722077148293288[0m
[31mEpoch: 51, Loss: 0.5728044840001435[0m
[31mEpoch: 52, Loss: 0.5724969982124223[0m
[31mEpoch: 53, Loss: 0.5733874418036768[0m
[31mEpoch: 54, Loss: 0.5727589353277219[0m
[31mEpoch: 55, Loss: 0.5716134936761543[0m
[31mEpoch: 56, Loss: 0.5738802883732185[0m
[31mEpoch: 57, Loss: 0.5709626779071847[0m
[31mEpoch: 58, Loss: 0.572723510629787[0m
[31mEpoch: 59, Loss: 0.5728201337828395[0m
[31mEpoch: 6

In [103]:
# Deploy the trained model
# NOTE(review): `predictor` is rebound below; make sure the endpoint created
# for the previous experiment has been deleted before running this cell.
from sagemaker.pytorch import PyTorchModel
from sklearn.metrics import roc_auc_score  # used in a later cell, not in this one

# wrap the artifacts of the most recent training job; inference code is pytorch/predict.py
model = PyTorchModel(
    entry_point="predict.py",
    role=role, 
    framework_version="1.0",
    model_data=estimator.model_data,
    source_dir="pytorch"
)

# deploy your model to create a predictor (a live endpoint on one ml.t2.medium instance)
predictor = model.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

--------------------------------------------------------------------------------------------------------------!

In [104]:
# evaluating the model
# read in test data, assuming it is stored locally
test_data = pd.read_csv("data/test_full.csv", header=None, names=None)

# labels are in the first column
test_y = test_data.iloc[:,0]
test_x = test_data.iloc[:,1:]

# query the endpoint in batches of 1000 rows and flatten each response to 1-D;
# np.hstack needs a real sequence (passing a generator is deprecated), so the
# batches are collected in a list
test_y_preds = np.hstack([
    predictor.predict(test_x.iloc[i: i+1000]).reshape(-1)
    for i in range(0, len(test_x), 1000)])

In [105]:
# calculate the test roc_auc_score from the endpoint's predictions
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, test_y_preds)

0.7111692582105572

In [106]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_y, test_y_preds)

# The predictions are already in memory, so the endpoint is no longer needed.
# Delete it here: the next experiment rebinds `predictor`, after which this
# endpoint could no longer be removed through the notebook.
predictor.delete_endpoint()

accuracy

0.7176560041950708

## 5x hidden layers

In [107]:
# import a PyTorch wrapper
from sagemaker.pytorch import PyTorch

# specify an output path
output_path = f"s3://{bucket}/{prefix}-5-hidden-layer"

# instantiate a pytorch estimator; this run uses hidden_dim=100 and 100 epochs
estimator = PyTorch(
    entry_point="train.py",
    source_dir="pytorch",
    role=role,
    framework_version="1.0",
    train_instance_count=1,
    train_instance_type="ml.c4.xlarge",
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    hyperparameters={
        "input_features": 19,
        "hidden_dim": 100,
        "output_dim": 1,
        "epochs": 100
    })

# reuse the training CSV uploaded in the XGBoost section; building the URI from
# `bucket` avoids hard-coding an account-specific bucket name
train_data_path = f"s3://{bucket}/starbucks-xgboost/train.csv"
estimator.fit({'train': train_data_path})

2019-06-28 22:46:13 Starting - Starting the training job...
2019-06-28 22:46:14 Starting - Launching requested ML instances...
2019-06-28 22:47:11 Starting - Preparing the instances for training......
2019-06-28 22:48:05 Downloading - Downloading input data...
2019-06-28 22:48:26 Training - Downloading the training image.
[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-06-28 22:48:40,214 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-06-28 22:48:40,217 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-06-28 22:48:40,230 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-06-28 22:48:43,250 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-06-28 22:48:43,523 sagemaker-containers INFO     Module train does not provi

[31mEpoch: 37, Loss: 0.5468553893028128[0m
[31mEpoch: 38, Loss: 0.5373431954373804[0m
[31mEpoch: 39, Loss: 0.5367521486556932[0m
[31mEpoch: 40, Loss: 0.5365186577231697[0m
[31mEpoch: 41, Loss: 0.5375246298078741[0m
[31mEpoch: 42, Loss: 0.5360836938464463[0m
[31mEpoch: 43, Loss: 0.536555456659767[0m
[31mEpoch: 44, Loss: 0.5264785062066848[0m
[31mEpoch: 45, Loss: 0.5262435746806837[0m
[31mEpoch: 46, Loss: 0.5269687865562877[0m
[31mEpoch: 47, Loss: 0.5264477149355278[0m
[31mEpoch: 48, Loss: 0.5266553176426039[0m
[31mEpoch: 49, Loss: 0.5262777524779129[0m
[31mEpoch: 50, Loss: 0.5357357767296864[0m
[31mEpoch: 51, Loss: 0.5256157385002585[0m
[31mEpoch: 52, Loss: 0.5265684053781774[0m
[31mEpoch: 53, Loss: 0.5161963112696949[0m
[31mEpoch: 54, Loss: 0.5149484474103102[0m
[31mEpoch: 55, Loss: 0.5159075511964073[0m
[31mEpoch: 56, Loss: 0.515530872486169[0m
[31mEpoch: 57, Loss: 0.5154330683100536[0m
[31mEpoch: 58, Loss: 0.5154193877799903[0m
[31mEpoch: 

In [108]:
# Deploy the trained model
# NOTE(review): `predictor` is rebound below; make sure the endpoint created
# for the previous experiment has been deleted before running this cell.
from sagemaker.pytorch import PyTorchModel
from sklearn.metrics import roc_auc_score  # used in a later cell, not in this one

# wrap the artifacts of the most recent training job; inference code is pytorch/predict.py
model = PyTorchModel(
    entry_point="predict.py",
    role=role, 
    framework_version="1.0",
    model_data=estimator.model_data,
    source_dir="pytorch"
)

# deploy your model to create a predictor (a live endpoint on one ml.t2.medium instance)
predictor = model.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

---------------------------------------------------------------------------------------!

In [109]:
# evaluating the model
# read in test data, assuming it is stored locally
test_data = pd.read_csv("data/test_full.csv", header=None, names=None)

# labels are in the first column
test_y = test_data.iloc[:,0]
test_x = test_data.iloc[:,1:]

# query the endpoint in batches of 1000 rows and flatten each response to 1-D;
# np.hstack needs a real sequence (passing a generator is deprecated), so the
# batches are collected in a list
test_y_preds = np.hstack([
    predictor.predict(test_x.iloc[i: i+1000]).reshape(-1)
    for i in range(0, len(test_x), 1000)])

In [111]:
# calculate the test roc_auc_score from the endpoint's predictions
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, test_y_preds)

0.7605758026287857

In [112]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_y, test_y_preds)

# The predictions are already in memory, so the endpoint is no longer needed.
# Delete it here: the next experiment rebinds `predictor`, after which this
# endpoint could no longer be removed through the notebook.
predictor.delete_endpoint()

accuracy

0.770365320748121

## No dropout

In [126]:
# import a PyTorch wrapper
from sagemaker.pytorch import PyTorch

# specify an output path; this experiment gets its own suffix instead of
# reusing the "-5-hidden-layer" location of the previous experiment
output_path = f"s3://{bucket}/{prefix}-no-dropout"

# instantiate a pytorch estimator
# NOTE(review): the hyperparameters are identical to the previous experiment;
# the dropout change is presumably made inside pytorch/train.py -- confirm.
estimator = PyTorch(
    entry_point="train.py",
    source_dir="pytorch",
    role=role,
    framework_version="1.0",
    train_instance_count=1,
    train_instance_type="ml.c4.xlarge",
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    hyperparameters={
        "input_features": 19,
        "hidden_dim": 100,
        "output_dim": 1,
        "epochs": 100
    })

# reuse the training CSV uploaded in the XGBoost section; building the URI from
# `bucket` avoids hard-coding an account-specific bucket name
train_data_path = f"s3://{bucket}/starbucks-xgboost/train.csv"
estimator.fit({'train': train_data_path})

2019-06-28 23:53:21 Starting - Starting the training job...
2019-06-28 23:53:22 Starting - Launching requested ML instances...
2019-06-28 23:54:19 Starting - Preparing the instances for training......
2019-06-28 23:54:59 Downloading - Downloading input data..

2019-06-28 23:55:34 Training - Training image download completed. Training in progress.[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-06-28 23:55:35,877 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-06-28 23:55:35,880 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-06-28 23:55:35,892 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-06-28 23:55:37,307 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-06-28 23:55:37,575 sagemaker-containers INFO     Mod

[31mEpoch: 39, Loss: 0.4496630337968301[0m
[31mEpoch: 40, Loss: 0.4495197376368868[0m
[31mEpoch: 41, Loss: 0.4495186710932505[0m
[31mEpoch: 42, Loss: 0.4494383441235465[0m
[31mEpoch: 43, Loss: 0.4493861407655455[0m
[31mEpoch: 44, Loss: 0.4493260563368655[0m
[31mEpoch: 45, Loss: 0.4491911900707622[0m
[31mEpoch: 46, Loss: 0.4491100895801064[0m
[31mEpoch: 47, Loss: 0.4490499835921807[0m
[31mEpoch: 48, Loss: 0.4489972139147113[0m
[31mEpoch: 49, Loss: 0.4490127199737543[0m
[31mEpoch: 50, Loss: 0.4489405974606226[0m
[31mEpoch: 51, Loss: 0.4488534836621767[0m
[31mEpoch: 52, Loss: 0.44883547442292[0m
[31mEpoch: 53, Loss: 0.448800710672408[0m
[31mEpoch: 54, Loss: 0.4488281016101998[0m
[31mEpoch: 55, Loss: 0.448681957400247[0m
[31mEpoch: 56, Loss: 0.4486448284671102[0m
[31mEpoch: 57, Loss: 0.448609420855282[0m
[31mEpoch: 58, Loss: 0.4485049841137192[0m
[31mEpoch: 59, Loss: 0.4485107427037388[0m
[31mEpoch: 60, Loss: 0.4483438185677769[0m
[31mEpoch: 61,

In [128]:
# Deploy the trained model
# NOTE(review): `predictor` is rebound below; make sure the endpoint created
# for the previous experiment has been deleted before running this cell.
from sagemaker.pytorch import PyTorchModel
from sklearn.metrics import roc_auc_score  # used in a later cell, not in this one

# wrap the artifacts of the most recent training job; inference code is pytorch/predict.py
model = PyTorchModel(
    entry_point="predict.py",
    role=role, 
    framework_version="1.0",
    model_data=estimator.model_data,
    source_dir="pytorch"
)

# deploy your model to create a predictor (a live endpoint on one ml.t2.medium instance)
predictor = model.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

----------------------------------------------------------------------------------------!

In [129]:
# evaluating the model
# read in test data, assuming it is stored locally
test_data = pd.read_csv("data/test_full.csv", header=None, names=None)

# labels are in the first column
test_y = test_data.iloc[:,0]
test_x = test_data.iloc[:,1:]

# query the endpoint in batches of 1000 rows and flatten each response to 1-D;
# np.hstack needs a real sequence (passing a generator is deprecated), so the
# batches are collected in a list
test_y_preds = np.hstack([
    predictor.predict(test_x.iloc[i: i+1000]).reshape(-1)
    for i in range(0, len(test_x), 1000)])

In [130]:
# calculate the test roc_auc_score from the endpoint's predictions
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, test_y_preds)

0.7843739514310259

In [131]:
# share of test rows whose predicted label equals the true label
from sklearn.metrics import accuracy_score
accuracy_score(test_y, test_y_preds)

0.7980492920818039

In [132]:
# tear down the endpoint of the last experiment now that it has been evaluated
predictor.delete_endpoint()

# Clean up resources


In [None]:
# remove endpoint
# predictor.delete_endpoint()