In [1]:
import io
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import boto3
import sagemaker
from sagemaker import get_execution_role

In [2]:
session = sagemaker.Session()
print(session)

role = get_execution_role()
print(role)

bucket = session.default_bucket()

<sagemaker.session.Session object at 0x7f091b942978>
arn:aws:iam::240038582877:role/service-role/AmazonSageMaker-ExecutionRole-20191028T202433


In [3]:
# load data
prefix = 'segmentation'

data_key= prefix + '/scaled_mailout_train.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

### Training

In [4]:
tmp_list = []

for chunk in pd.read_csv(data_location, chunksize=100000):
    tmp_list.append(chunk)
    
train_df = pd.concat(tmp_list, axis=0).drop(['Unnamed: 0'], axis=1)
print(train_df.shape)
del tmp_list

(42962, 94)


In [5]:
train_df['RESPONSE'].mean()

0.012383036171500396

In [6]:
features =[
    'VERS_TYP',
    'FINANZ_UNAUFFAELLIGER',
    'HEALTH_TYP',
    'SHOPPER_TYP',
    'FINANZ_ANLEGER',
    'D19_VERSAND_ANZ_24',
    'FINANZ_UNAUFFAELLIGER',
    'FINANZ_SPARER',
    'D19_GESAMT_ANZ_12',
    'D19_SAMMELARTIKEL',
    'D19_BILDUNG',
    'D19_WEIN_FEINKOST',
    'RESPONSE'
]

In [7]:
df = train_df[features]

In [8]:
train = df.sample(frac=0.5)
validation = df.sample(frac=0.25)
test = df.sample(frac=0.25)

In [9]:
Y_train = train['RESPONSE']
X_train = train.drop('RESPONSE', axis=1)

In [10]:
training_set = pd.concat([Y_train, X_train], axis=1)

In [11]:
training_set.to_csv('train.csv', header=False, index=False)

In [13]:
training_set.head()

Unnamed: 0,RESPONSE,VERS_TYP,FINANZ_UNAUFFAELLIGER,HEALTH_TYP,SHOPPER_TYP,FINANZ_ANLEGER,D19_VERSAND_ANZ_24,FINANZ_UNAUFFAELLIGER.1,FINANZ_SPARER,D19_GESAMT_ANZ_12,D19_SAMMELARTIKEL,D19_BILDUNG,D19_WEIN_FEINKOST
33796,0.0,0.666667,0.25,0.5,1.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
25480,0.0,1.0,0.25,1.0,0.25,0.5,0.833333,0.25,0.25,0.666667,0.857143,0.0,0.0
29117,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.75,0.0,0.0,0.0,0.0
12195,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.75,0.0,0.0,0.0,0.0
50,0.0,1.0,0.5,1.0,0.75,0.25,0.166667,0.5,0.25,0.0,0.0,0.0,0.0


In [14]:
Y_validation = validation['RESPONSE']
X_validation = validation.drop('RESPONSE', axis=1)

In [15]:
validation_set = pd.concat([Y_validation, X_validation], axis=1)

In [16]:
validation_set.to_csv('validation.csv', header=False, index=False)

In [17]:
validation_set.head()

Unnamed: 0,RESPONSE,VERS_TYP,FINANZ_UNAUFFAELLIGER,HEALTH_TYP,SHOPPER_TYP,FINANZ_ANLEGER,D19_VERSAND_ANZ_24,FINANZ_UNAUFFAELLIGER.1,FINANZ_SPARER,D19_GESAMT_ANZ_12,D19_SAMMELARTIKEL,D19_BILDUNG,D19_WEIN_FEINKOST
37433,0.0,1.0,0.0,1.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41021,0.0,0.666667,0.25,0.5,1.0,0.75,0.333333,0.25,0.25,0.0,0.857143,0.0,1.0
16717,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.75,0.0,0.0,0.0,0.0
21867,0.0,1.0,0.25,1.0,0.5,0.0,0.5,0.25,0.0,0.166667,0.0,0.0,0.0
17590,0.0,1.0,0.25,1.0,0.5,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0


In [18]:
test_set = test.drop('RESPONSE', axis=1)

In [19]:
test_set.head()

Unnamed: 0,VERS_TYP,FINANZ_UNAUFFAELLIGER,HEALTH_TYP,SHOPPER_TYP,FINANZ_ANLEGER,D19_VERSAND_ANZ_24,FINANZ_UNAUFFAELLIGER.1,FINANZ_SPARER,D19_GESAMT_ANZ_12,D19_SAMMELARTIKEL,D19_BILDUNG,D19_WEIN_FEINKOST
42491,0.666667,0.5,0.75,0.25,0.5,0.5,0.5,0.0,0.5,0.0,0.0,0.0
6278,1.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2843,1.0,0.5,1.0,0.75,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
9500,0.666667,0.25,0.5,0.5,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
1305,0.666667,0.0,0.75,1.0,0.0,0.166667,0.0,0.0,0.166667,0.857143,0.0,0.0


In [20]:
test_set.to_csv('test.csv', header=False, index=False)

In [21]:
print(train.shape)
print(validation.shape)
print(test.shape)

(21481, 13)
(10740, 13)
(10740, 13)


In [22]:
prefix='supervised'

train_location = session.upload_data('train.csv', key_prefix=prefix)
validate_location = session.upload_data('validation.csv', key_prefix=prefix)
test_location = session.upload_data('test.csv', key_prefix=prefix)

In [23]:
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(session.boto_region_name, 'xgboost')

	get_image_uri(region, 'xgboost', '0.90-1').


In [24]:
xgb = None
xgb = sagemaker.estimator.Estimator(
    container, 
    role, 
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
    sagemaker_session=session
)

In [25]:
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)

### Fit the XGBoost model

In [26]:
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validate_location, content_type='csv')

In [27]:
xgb.fit({ 'train': s3_input_train, 'validation': s3_input_validation })

2020-04-29 03:41:48 Starting - Starting the training job...
2020-04-29 03:41:49 Starting - Launching requested ML instances...
2020-04-29 03:42:45 Starting - Preparing the instances for training......
2020-04-29 03:43:30 Downloading - Downloading input data...
2020-04-29 03:44:14 Training - Training image download completed. Training in progress.
2020-04-29 03:44:14 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2020-04-29:03:44:09:INFO] Running standalone xgboost training.[0m
[34m[2020-04-29:03:44:09:INFO] File size need to be processed in the node: 2.41mb. Available memory size in the node: 8495.94mb[0m
[34m[2020-04-29:03:44:09:INFO] Determined delimiter of CSV input is ','[0m
[34m[03:44:09] S3DistributionType set as FullyReplicated[0m
[34m[03:44:09] 21481x12 matrix with 257772 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-04-29:03:44:09:INFO] Determined delimiter of CSV input is ','[0m
[34m[

In [28]:
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')

In [29]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [30]:
xgb_transformer.wait()

....................[34mArguments: serve[0m
[34m[2020-04-29 03:48:12 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[34m[2020-04-29 03:48:12 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2020-04-29 03:48:12 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2020-04-29 03:48:12 +0000] [40] [INFO] Booting worker with pid: 40[0m
[34m[2020-04-29 03:48:12 +0000] [41] [INFO] Booting worker with pid: 41[0m
[34m[2020-04-29:03:48:12:INFO] Model loaded successfully for worker : 40[0m
[34m[2020-04-29:03:48:12:INFO] Model loaded successfully for worker : 41[0m
[34m[2020-04-29 03:48:12 +0000] [42] [INFO] Booting worker with pid: 42[0m
[34m[2020-04-29:03:48:12:INFO] Model loaded successfully for worker : 42[0m
[34m[2020-04-29 03:48:12 +0000] [43] [INFO] Booting worker with pid: 43[0m
[34m[2020-04-29:03:48:12:INFO] Model loaded successfully for worker : 43[0m
[34m[2020-04-29:03:48:34:INFO] Sniff delimiter as ','[0m
[34m[2020-04-29:03:48:34:INFO] Determined deli

In [31]:
!aws s3 cp --recursive $xgb_transformer.output_path 'dataset'

Completed 157.3 KiB/157.3 KiB (2.6 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-2-240038582877/xgboost-2020-04-29-03-45-00-277/test.csv.out to dataset/test.csv.out


In [32]:
predictions = pd.read_csv('dataset/test.csv.out', header=None)
predictions = [round(num) for num in predictions.squeeze().values]

In [33]:
from sklearn.metrics import accuracy_score
accuracy_score(test['RESPONSE'], predictions)

0.987243947858473