In [1]:
import io
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import boto3
import sagemaker
from sagemaker import get_execution_role

In [2]:
session = sagemaker.Session()
print(session)

role = get_execution_role()
print(role)

bucket = session.default_bucket()

<sagemaker.session.Session object at 0x7f223165db00>
arn:aws:iam::240038582877:role/service-role/AmazonSageMaker-ExecutionRole-20191028T202433


In [3]:
# TOP TWO COMPONENTS TOP FEATURES 
features =[
'FINANZ_ANLEGER', # (German pop) Investor
'D19_KONSUMTYP_MAX', # (German pop) 
'VERS_TYP', #INSURANCE 
'FINANZ_UNAUFFAELLIGER', #financial typology: unremarkable
'SEMIO_VERT', # (German pop) affinity indicating in what way the person is dreamily
'GEBURTSJAHR', # (German pop) birth year
'HEALTH_TYP', #health typology
'D19_SONSTIGE', # (German pop) transactional activity based on the product group ALL OTHER CATEGORIES
'SHOPPER_TYP', #
'D19_VERSAND_ONLINE_DATUM', # (German pop) actuality of the last transaction with the complete file TOTAL
'D19_GESAMT_ONLINE_DATUM', # (German pop) actuality of the last transaction with the complete file ONLINE
'D19_VERSAND_DATUM', # (German pop) actuality of the last transaction for the segment mail-order TOTAL
'D19_GESAMT_ANZ_24', # (German pop) transaction activity TOTAL POOL in the last 24 months
'D19_VERSAND_ANZ_24', #transaction activity MAIL-ORDER in the last 24 months 
'FINANZ_UNAUFFAELLIGER', #financial typology: unremarkable
'D19_GESAMT_DATUM', # (German pop) actuality of the last transaction with the complete file TOTAL
'FINANZ_SPARER', #financial typology: money saver
'D19_GESAMT_ANZ_12', #transaction activity TOTAL POOL in the last 12 months 
'RESPONSE'
]

In [4]:
# load data
prefix = 'segmentation'

data_key= prefix + '/mailout_train_raw.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

### Training

In [5]:
tmp_list = []

for chunk in pd.read_csv(data_location, chunksize=100000):
    tmp_list.append(chunk)
    
df = pd.concat(tmp_list, axis=0).drop(['Unnamed: 0'], axis=1)
print(df.shape)
del tmp_list

  interactivity=interactivity, compiler=compiler, result=result)


(42962, 367)


In [6]:
df['RESPONSE'].sum()

532

In [7]:
# select int and float fields, reshuffle dataframe
df = df.select_dtypes(['int', 'float']).sample(frac=1, axis=0)
df.shape

(42962, 361)

In [8]:
y = df['RESPONSE']
X = df.drop(['RESPONSE'], axis=1)

In [9]:
#randomly drop columns to experiment with different features
X = X.sample(frac=0.1, axis=1)
X.columns.values

array(['EWDICHTE', 'KBA13_KMH_0_140', 'KBA13_SEG_SONSTIGE',
       'D19_KONSUMTYP', 'KBA05_KRSOBER', 'KBA05_SEG8', 'KBA13_NISSAN',
       'CJT_TYP_1', 'KBA05_HERST2', 'KBA05_ALTER1', 'KBA13_HALTER_55',
       'D19_VERSAND_ANZ_24', 'KBA13_SEG_KLEINST', 'LP_LEBENSPHASE_FEIN',
       'VHA', 'KBA05_MOD4', 'STRUKTURTYP', 'KBA13_CCM_1600',
       'GEBAEUDETYP_RASTER', 'KBA13_OPEL', 'KBA13_BJ_2000', 'KBA05_KW3',
       'ORTSGR_KLS9', 'KBA13_HALTER_45', 'D19_VERSICHERUNGEN',
       'D19_BANKEN_ONLINE_DATUM', 'D19_GESAMT_OFFLINE_DATUM',
       'D19_SCHUHE', 'KBA13_HERST_EUROPA', 'KBA13_KRSZUL_NEU',
       'PRAEGENDE_JUGENDJAHRE', 'ALTERSKATEGORIE_GROB', 'LP_FAMILIE_GROB',
       'PLZ8_ANTG3', 'KBA13_RENAULT', 'KBA13_SITZE_4'], dtype=object)

In [10]:
X.shape

(42962, 36)

### Standardize

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
StandardScaler()
transformed_df = pd.DataFrame(columns = X.columns.values, index = X.index.values, data=scaler.transform(X));

  return self.partial_fit(X, y)


In [12]:
transformed_df = transformed_df.fillna(0)
transformed_df.isnull().any(axis=0);
transformed_df.shape

(42962, 36)

In [13]:
tmp = pd.concat([y, transformed_df], axis=1)

In [14]:
train = tmp.iloc[0:21480]
print(train.shape[0])
validation = tmp.iloc[21480:32220]
print(validation.shape[0])
test = tmp.iloc[32220:]
print(test.shape[0])

21480
10740
10742


In [15]:
print(train['RESPONSE'].mean() * 100)
print(validation['RESPONSE'].mean() * 100)
print(test['RESPONSE'].mean() * 100)

1.2243947858472997
1.2756052141527001
1.2288214485198286


In [16]:
train.to_csv('train.csv', header=False, index=False)

In [17]:
validation.to_csv('validation.csv', header=False, index=False)

In [18]:
X_test = test.drop('RESPONSE', axis=1)
Y_test = test['RESPONSE']

In [19]:
X_test.to_csv('test.csv', header=False, index=False)

In [20]:
print(train.shape)
print(validation.shape)
print(test.shape)

(21480, 37)
(10740, 37)
(10742, 37)


In [21]:
prefix='supervised'

train_location = session.upload_data('train.csv', key_prefix=prefix)
validate_location = session.upload_data('validation.csv', key_prefix=prefix)
test_location = session.upload_data('test.csv', key_prefix=prefix)

In [22]:
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(session.boto_region_name, 'xgboost', '0.90-2')

In [23]:
xgb = None
xgb = sagemaker.estimator.Estimator(
    container, 
    role, 
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
    sagemaker_session=session
)

In [27]:
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective="binary:logistic",
                        eval_metric="auc",
                        early_stopping_rounds=10,
                        num_round=500)

In [28]:
# ensure that hyperparams are set correctly
xgb.hyperparameters()

{'max_depth': 5,
 'eta': 0.2,
 'gamma': 4,
 'min_child_weight': 6,
 'subsample': 0.8,
 'silent': 0,
 'objective': 'binary:logistic',
 'eval_metric': 'auc',
 'early_stopping_rounds': 10,
 'num_round': 500}

In [29]:
train.head()

Unnamed: 0,RESPONSE,EWDICHTE,KBA13_KMH_0_140,KBA13_SEG_SONSTIGE,D19_KONSUMTYP,KBA05_KRSOBER,KBA05_SEG8,KBA13_NISSAN,CJT_TYP_1,KBA05_HERST2,...,D19_GESAMT_OFFLINE_DATUM,D19_SCHUHE,KBA13_HERST_EUROPA,KBA13_KRSZUL_NEU,PRAEGENDE_JUGENDJAHRE,ALTERSKATEGORIE_GROB,LP_FAMILIE_GROB,PLZ8_ANTG3,KBA13_RENAULT,KBA13_SITZE_4
37958,0,0.122762,-0.816443,-1.097872,-0.251984,-0.085894,0.110627,-1.934342,-0.399407,1.619272,...,0.332665,-0.352762,0.900768,0.124038,-0.113971,0.73641,1.346344,0.539724,-0.028805,-0.128536
17492,0,1.275257,1.808776,1.005273,-0.976195,-1.150692,-0.68419,0.011508,1.815335,-0.922138,...,-1.412118,-0.352762,-0.072194,-1.178494,2.550462,-2.073993,1.346344,1.555088,-0.979925,1.721277
10415,0,1.275257,0.496167,-1.097872,1.92065,-1.150692,-0.68419,0.011508,-1.137654,-0.922138,...,0.768861,-0.352762,-0.072194,-1.178494,0.152472,0.73641,-0.673995,1.555088,-0.979925,1.721277
19360,0,-0.453486,1.152471,1.005273,-0.976195,-0.085894,-0.68419,-1.934342,-0.399407,-0.075001,...,-0.539727,1.571649,-0.072194,-1.178494,0.418916,0.73641,-1.179079,-0.475639,-0.028805,-0.128536
12517,0,1.275257,1.152471,-1.097872,-0.976195,0.978905,-0.68419,0.984433,0.33884,-0.075001,...,-3.156901,3.49606,1.87373,0.124038,0.951802,-0.200391,-0.673995,-0.475639,1.873434,-0.128536


In [30]:
validation.head()

Unnamed: 0,RESPONSE,EWDICHTE,KBA13_KMH_0_140,KBA13_SEG_SONSTIGE,D19_KONSUMTYP,KBA05_KRSOBER,KBA05_SEG8,KBA13_NISSAN,CJT_TYP_1,KBA05_HERST2,...,D19_GESAMT_OFFLINE_DATUM,D19_SCHUHE,KBA13_HERST_EUROPA,KBA13_KRSZUL_NEU,PRAEGENDE_JUGENDJAHRE,ALTERSKATEGORIE_GROB,LP_FAMILIE_GROB,PLZ8_ANTG3,KBA13_RENAULT,KBA13_SITZE_4
7089,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.399407,0.0,...,0.768861,-0.352762,0.0,0.0,-1.179745,-1.137192,0.841259,0.0,0.0,0.0
18142,0,1.275257,1.152471,2.056846,1.92065,-0.085894,-0.68419,0.984433,-0.399407,0.772135,...,0.332665,3.49606,0.900768,-2.481027,-0.913301,0.73641,-1.179079,1.555088,0.922314,-1.978349
1596,0,-1.605981,0.496167,-2.149444,-0.614089,-0.085894,0.110627,-0.961417,-0.399407,0.772135,...,-1.412118,-0.352762,1.87373,0.124038,0.951802,0.73641,1.346344,-1.491002,1.873434,0.796371
32448,0,1.275257,-0.816443,-0.046299,-0.251984,-1.150692,1.700259,0.011508,-0.399407,-0.922138,...,0.768861,-0.352762,0.900768,0.124038,-0.646858,-0.200391,-0.673995,1.555088,0.922314,0.796371
4401,0,0.699009,-0.816443,-0.046299,-0.251984,-0.085894,-0.68419,0.984433,-0.399407,-0.075001,...,-0.103531,-0.352762,-0.072194,0.124038,-0.646858,0.73641,-0.16891,-0.475639,-0.979925,0.796371


### Fit the XGBoost model

In [31]:
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validate_location, content_type='csv')

In [32]:
xgb.fit({ 'train': s3_input_train, 'validation': s3_input_validation })

2020-05-17 22:31:03 Starting - Starting the training job...
2020-05-17 22:31:04 Starting - Launching requested ML instances......
2020-05-17 22:32:08 Starting - Preparing the instances for training......
2020-05-17 22:33:07 Downloading - Downloading input data...
2020-05-17 22:33:43 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO

### Testing the model

In [33]:
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')

In [34]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [35]:
xgb_transformer.wait()

....................[34m[2020-05-17 22:37:54 +0000] [16] [INFO] Starting gunicorn 19.10.0[0m
[34m[2020-05-17 22:37:54 +0000] [16] [INFO] Listening at: unix:/tmp/gunicorn.sock (16)[0m
[34m[2020-05-17 22:37:54 +0000] [16] [INFO] Using worker: gevent[0m
[34m[2020-05-17 22:37:54 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2020-05-17 22:37:54 +0000] [24] [INFO] Booting worker with pid: 24[0m
[34m[2020-05-17 22:37:54 +0000] [28] [INFO] Booting worker with pid: 28[0m
[34m[2020-05-17 22:37:54 +0000] [29] [INFO] Booting worker with pid: 29[0m
[34m[2020-05-17:22:38:24:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m169.254.255.130 - - [17/May/2020:22:38:24 +0000] "GET /ping HTTP/1.1" 200 0 "-" "Go-http-client/1.1"[0m
[34m169.254.255.130 - - [17/May/2020:22:38:24 +0000] "GET /execution-parameters HTTP/1.1" 200 84 "-" "Go-http-client/1.1"[0m
[34m[2020-05-17:22:38:25:INFO] Determined delimiter of CSV input is ','[0m
[34m[2020-05-17:22:38:25:INFO] No GPU

In [36]:
!aws s3 cp --recursive $xgb_transformer.output_path 'dataset'

Completed 214.6 KiB/214.6 KiB (3.7 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-2-240038582877/sagemaker-xgboost-2020-05-17-22-34-45-774/test.csv.out to dataset/test.csv.out


In [37]:
predictions = pd.read_csv('dataset/test.csv.out', header=None)
predictions = [round(num) for num in predictions.squeeze().values]

In [38]:
from sklearn.metrics import roc_auc_score, confusion_matrix
auc = roc_auc_score(Y_test, predictions)
tn, fp, fn, tp = confusion_matrix(Y_test, predictions).ravel()

In [39]:
tn, fp, fn, tp

(10610, 0, 132, 0)

In [40]:
auc

0.5