In [1]:
import io
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import boto3
import sagemaker
from sagemaker import get_execution_role

In [2]:
session = sagemaker.Session()
print(session)

role = get_execution_role()
print(role)

bucket = session.default_bucket()

<sagemaker.session.Session object at 0x7fc454cd15f8>
arn:aws:iam::240038582877:role/service-role/AmazonSageMaker-ExecutionRole-20191028T202433


In [3]:
# load data
prefix = 'segmentation'

data_key= prefix + '/scaled_mailout_train.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

### Training

In [4]:
tmp_list = []

for chunk in pd.read_csv(data_location, chunksize=100000):
    tmp_list.append(chunk)
    
train_df = pd.concat(tmp_list, axis=0).drop(['Unnamed: 0'], axis=1)
print(train_df.shape)
del tmp_list

(42962, 94)


In [5]:
train_df['RESPONSE'].mean()

0.012383036171500396

In [6]:
Y_train = train_df['RESPONSE']
X_train = train_df.drop('RESPONSE', axis=1)

In [7]:
df = pd.concat([Y_train, X_train], axis=1)

In [8]:
df.head()

Unnamed: 0,RESPONSE,LNR,AGER_TYP,D19_BANKEN_ANZ_12,D19_BANKEN_ANZ_24,D19_BANKEN_DATUM,D19_BANKEN_DIREKT,D19_BANKEN_GROSS,D19_BANKEN_LOKAL,D19_BANKEN_OFFLINE_DATUM,...,SEMIO_RAT,SEMIO_REL,SEMIO_SOZ,SEMIO_TRADV,SEMIO_VERT,SHOPPER_TYP,VERS_TYP,ZABEOTYP,ANREDE_KZ,ALTERSKATEGORIE_GROB
0,0.0,0.020538,0.75,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.166667,0.0,0.666667,0.0,0.333333,1.0,1.0,0.4,1.0,0.375
1,0.0,0.020631,0.5,0.166667,0.166667,0.444444,0.0,0.0,0.428571,1.0,...,0.5,0.333333,0.0,0.333333,0.5,0.75,0.666667,0.0,1.0,0.25
2,0.0,0.020689,0.5,0.0,0.166667,0.666667,0.0,0.0,0.0,1.0,...,0.0,0.666667,0.333333,0.0,1.0,1.0,0.666667,0.4,0.0,0.375
3,0.0,0.017006,0.75,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.166667,0.166667,0.666667,0.333333,0.166667,0.5,1.0,0.4,1.0,0.375
4,0.0,0.020771,0.75,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.666667,0.5,0.833333,0.166667,1.0,0.5,0.666667,0.4,0.0,0.25


In [9]:
train = df.sample(frac=0.5)
validation = df.sample(frac=0.25)
test = df.sample(frac=0.25)

In [10]:
train.to_csv('train.csv', header=False, index=False)
validation.to_csv('validation.csv', header=False, index=False)
test.to_csv('test.csv', header=False, index=False)

In [11]:
print(train.shape)
print(validation.shape)
print(test.shape)

(21481, 94)
(10740, 94)
(10740, 94)


In [12]:
prefix='supervised'

train_location = session.upload_data('train.csv', key_prefix=prefix)
validate_location = session.upload_data('validation.csv', key_prefix=prefix)
test_location = session.upload_data('test.csv', key_prefix=prefix)

In [14]:
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(session.boto_region_name, 'xgboost')

	get_image_uri(region, 'xgboost', '0.90-1').


In [15]:
xgb = None
xgb = sagemaker.estimator.Estimator(
    container, 
    role, 
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
    sagemaker_session=session
)

In [16]:
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)

### Fit the XGBoost model

In [17]:
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validate_location, content_type='csv')

In [18]:
xgb.fit({ 'train': s3_input_train, 'validation': s3_input_validation })

2020-04-27 04:05:59 Starting - Starting the training job...
2020-04-27 04:06:01 Starting - Launching requested ML instances...
2020-04-27 04:06:56 Starting - Preparing the instances for training......
2020-04-27 04:07:51 Downloading - Downloading input data...
2020-04-27 04:08:24 Training - Downloading the training image...
2020-04-27 04:08:56 Uploading - Uploading generated training model
2020-04-27 04:08:56 Completed - Training job completed
[34mArguments: train[0m
[34m[2020-04-27:04:08:44:INFO] Running standalone xgboost training.[0m
[34m[2020-04-27:04:08:44:INFO] File size need to be processed in the node: 23.45mb. Available memory size in the node: 8504.3mb[0m
[34m[2020-04-27:04:08:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[04:08:44] S3DistributionType set as FullyReplicated[0m
[34m[04:08:44] 21481x93 matrix with 1997733 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-04-27:04:08:44:INFO] Determined delimi

In [19]:
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')

In [20]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [None]:
# 2020-04-27T03:56:19.768:[sagemaker logs]: sagemaker-us-east-2-240038582877/supervised/test.csv: Unable to evaluate payload provided: Feature size of csv inference data 94 is not consistent with feature size of trained model 93

In [None]:
xgb_transformer.wait()

...........

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path '/dataset'

In [None]:
predictions = pd.read_csv('dataset/test.csv', header=None)
predictions = [round(num) for num in predictions.squeeze().values]

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test)