In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import boto3

# magic word for producing visualizations in notebook
%matplotlib inline

In [6]:
#lets load the cleaned data
mail_df_selected = pd.read_csv('data/mail_df_selected_before_scaling.csv')
mail_df_selected = mail_df_selected.drop('Unnamed: 0', axis=1)

In [7]:
mail_df_selected.head(n=10)

Unnamed: 0,ANZ_PERSONEN,ANZ_TITEL,BALLRAUM,CJT_GESAMTTYP,D19_BANKEN_DATUM,D19_GESAMT_ANZ_12,D19_GESAMT_DATUM,D19_GESAMT_OFFLINE_DATUM,D19_GESAMT_ONLINE_DATUM,D19_KONSUMTYP,...,CAMEO_DEU_2015_7E,CAMEO_DEU_2015_8A,CAMEO_DEU_2015_8B,CAMEO_DEU_2015_8C,CAMEO_DEU_2015_8D,CAMEO_DEU_2015_9A,CAMEO_DEU_2015_9B,CAMEO_DEU_2015_9C,CAMEO_DEU_2015_9D,CAMEO_DEU_2015_9E
0,1.0,0.0,5.0,2.0,9.0,1.0,9.0,9.0,9.0,3.0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,0.0,5.0,2.0,5.0,2.0,1.0,1.0,8.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,1.0,4.0,7.0,1.0,6.0,6.0,7.0,2.0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,0.0,2.0,2.0,9.0,1.0,5.0,9.0,9.0,3.0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,4.0,6.0,9.0,1.0,7.0,7.0,9.0,2.0,...,0,0,0,0,0,0,0,0,0,0
5,1.0,0.0,1.0,4.0,7.0,3.0,2.0,4.0,2.0,1.0,...,0,0,0,0,0,0,0,0,0,0
6,1.0,0.0,6.0,2.0,9.0,1.0,9.0,9.0,9.0,3.0,...,0,0,0,0,0,0,0,0,0,0
7,2.0,0.0,7.0,2.0,9.0,1.0,8.0,9.0,8.0,9.0,...,0,0,0,0,0,0,0,0,0,0
8,1.0,0.0,6.0,6.0,9.0,1.0,5.0,9.0,9.0,3.0,...,0,0,0,0,0,0,0,0,0,0
9,1.0,0.0,6.0,6.0,9.0,1.0,5.0,9.0,9.0,3.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
mail_df_selected.columns

Index(['ANZ_PERSONEN', 'ANZ_TITEL', 'BALLRAUM', 'CJT_GESAMTTYP',
       'D19_BANKEN_DATUM', 'D19_GESAMT_ANZ_12', 'D19_GESAMT_DATUM',
       'D19_GESAMT_OFFLINE_DATUM', 'D19_GESAMT_ONLINE_DATUM', 'D19_KONSUMTYP',
       ...
       'CAMEO_DEU_2015_7E', 'CAMEO_DEU_2015_8A', 'CAMEO_DEU_2015_8B',
       'CAMEO_DEU_2015_8C', 'CAMEO_DEU_2015_8D', 'CAMEO_DEU_2015_9A',
       'CAMEO_DEU_2015_9B', 'CAMEO_DEU_2015_9C', 'CAMEO_DEU_2015_9D',
       'CAMEO_DEU_2015_9E'],
      dtype='object', length=271)

In [9]:
response_df = mail_df_selected['RESPONSE'] 

In [10]:
response_df.shape

(42962,)

In [11]:
# Lets try to transform the data using minmax scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [12]:
mail_df_scaled = mail_df_selected.drop('RESPONSE', axis=1) 
mail_df_scaled[mail_df_scaled.columns] = scaler.fit_transform(mail_df_scaled)

In [13]:
mail_df_scaled.shape

(42962, 270)

In [14]:
dataset = pd.concat([response_df, mail_df_scaled], axis=1)

In [15]:
dataset.shape

(42962, 271)

In [16]:
dataset.head(n=10)

Unnamed: 0,RESPONSE,ANZ_PERSONEN,ANZ_TITEL,BALLRAUM,CJT_GESAMTTYP,D19_BANKEN_DATUM,D19_GESAMT_ANZ_12,D19_GESAMT_DATUM,D19_GESAMT_OFFLINE_DATUM,D19_GESAMT_ONLINE_DATUM,...,CAMEO_DEU_2015_7E,CAMEO_DEU_2015_8A,CAMEO_DEU_2015_8B,CAMEO_DEU_2015_8C,CAMEO_DEU_2015_8D,CAMEO_DEU_2015_9A,CAMEO_DEU_2015_9B,CAMEO_DEU_2015_9C,CAMEO_DEU_2015_9D,CAMEO_DEU_2015_9E
0,0,0.041667,0.0,0.666667,0.2,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.083333,0.0,0.666667,0.2,0.5,0.2,0.0,0.0,0.875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.6,0.75,0.0,0.625,0.625,0.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.083333,0.0,0.166667,0.2,1.0,0.0,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.041667,0.0,0.5,1.0,1.0,0.0,0.75,0.75,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0.041667,0.0,0.0,0.6,0.75,0.4,0.125,0.375,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0.041667,0.0,0.833333,0.2,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0.083333,0.0,1.0,0.2,1.0,0.0,0.875,1.0,0.875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0.041667,0.0,0.833333,1.0,1.0,0.0,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0.041667,0.0,0.833333,1.0,1.0,0.0,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
pos_idx = dataset[dataset['RESPONSE']==1].index.to_list()
np.random.shuffle(pos_idx)  # lets first randomly shuffle these indices

In [18]:
neg_idx = dataset[dataset['RESPONSE']==0].index.to_list()
np.random.shuffle(neg_idx)  # lets first randomly shuffle these indices

In [19]:
len(pos_idx), len(neg_idx)

(532, 42430)

In [20]:
dataset_train = pd.concat([dataset.iloc[pos_idx[:int(0.9*len(pos_idx))]], dataset.iloc[neg_idx[:int(0.8*len(neg_idx))]]], axis=0)
dataset_validation = pd.concat([dataset.iloc[pos_idx[int(0.9*len(pos_idx)):]], dataset.iloc[neg_idx[int(0.8*len(neg_idx)):]]], axis=0)

In [21]:
len(pos_idx[int(0.9*len(pos_idx)):])

54

In [22]:
dataset_train.shape

(34422, 271)

In [23]:
dataset_train

Unnamed: 0,RESPONSE,ANZ_PERSONEN,ANZ_TITEL,BALLRAUM,CJT_GESAMTTYP,D19_BANKEN_DATUM,D19_GESAMT_ANZ_12,D19_GESAMT_DATUM,D19_GESAMT_OFFLINE_DATUM,D19_GESAMT_ONLINE_DATUM,...,CAMEO_DEU_2015_7E,CAMEO_DEU_2015_8A,CAMEO_DEU_2015_8B,CAMEO_DEU_2015_8C,CAMEO_DEU_2015_8D,CAMEO_DEU_2015_9A,CAMEO_DEU_2015_9B,CAMEO_DEU_2015_9C,CAMEO_DEU_2015_9D,CAMEO_DEU_2015_9E
29194,1,0.041667,0.0,0.833333,0.0,1.0,0.0,0.125,0.875,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16539,1,0.166667,0.0,0.833333,0.8,0.5,1.0,0.000,0.500,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27229,1,0.041667,0.0,0.833333,0.6,1.0,0.0,0.875,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1898,1,0.041667,0.0,1.000000,0.2,1.0,0.0,0.500,0.500,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21638,1,0.083333,0.0,1.000000,0.0,1.0,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11325,0,0.041667,0.0,0.833333,1.0,1.0,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20708,0,0.041667,0.0,0.833333,1.0,1.0,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18057,0,0.041667,0.0,0.833333,1.0,1.0,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35596,0,0.083333,0.0,1.000000,0.6,1.0,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
dataset_validation.shape

(8540, 271)

In [25]:
dataset_validation

Unnamed: 0,RESPONSE,ANZ_PERSONEN,ANZ_TITEL,BALLRAUM,CJT_GESAMTTYP,D19_BANKEN_DATUM,D19_GESAMT_ANZ_12,D19_GESAMT_DATUM,D19_GESAMT_OFFLINE_DATUM,D19_GESAMT_ONLINE_DATUM,...,CAMEO_DEU_2015_7E,CAMEO_DEU_2015_8A,CAMEO_DEU_2015_8B,CAMEO_DEU_2015_8C,CAMEO_DEU_2015_8D,CAMEO_DEU_2015_9A,CAMEO_DEU_2015_9B,CAMEO_DEU_2015_9C,CAMEO_DEU_2015_9D,CAMEO_DEU_2015_9E
4651,1,0.083333,0.0,0.166667,0.0,1.000,0.2,0.500,0.500,0.500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37058,1,0.083333,0.0,0.000000,0.0,1.000,0.2,0.375,0.875,0.375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7752,1,0.041667,0.0,0.666667,0.2,1.000,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17556,1,0.041667,0.0,0.000000,0.6,0.000,0.4,0.000,0.500,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13194,1,0.125000,0.0,0.833333,0.4,1.000,0.2,0.125,0.125,0.250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31189,0,0.041667,0.0,0.000000,0.2,0.625,0.0,0.625,1.000,0.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12762,0,0.041667,0.0,0.833333,1.0,1.000,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18598,0,0.083333,0.0,0.000000,1.0,1.000,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
21132,0,0.041667,0.0,0.833333,1.0,1.000,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
dataset_train.to_csv('data/train.csv', header=False, index=False)
dataset_validation.to_csv('data/validation.csv', header=False, index=False)

In [27]:
train = pd.read_csv('data/train.csv')
vali = pd.read_csv('data/validation.csv')

In [28]:
train

Unnamed: 0,1,0.041666666666666664,0.0,0.8333333333333334,0.0.1,1.0,0.0.2,0.125,0.875,1.0.1,...,0.0.71,0.0.72,0.0.73,0.0.74,0.0.75,0.0.76,0.0.77,0.0.78,0.0.79,0.0.80
0,1,0.166667,0.0,0.833333,0.8,0.5,1.0,0.000,0.5,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.041667,0.0,0.833333,0.6,1.0,0.0,0.875,1.0,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.041667,0.0,1.000000,0.2,1.0,0.0,0.500,0.5,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.083333,0.0,1.000000,0.0,1.0,0.0,0.500,1.0,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,0.041667,0.0,0.833333,1.0,1.0,0.0,0.500,1.0,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34416,0,0.041667,0.0,0.833333,1.0,1.0,0.0,0.500,1.0,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34417,0,0.041667,0.0,0.833333,1.0,1.0,0.0,0.500,1.0,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34418,0,0.041667,0.0,0.833333,1.0,1.0,0.0,0.500,1.0,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34419,0,0.083333,0.0,1.000000,0.6,1.0,0.0,0.500,1.0,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
vali

Unnamed: 0,1,0.08333333333333333,0.0,0.16666666666666666,0.0.1,1.0,0.2,0.5,0.5.1,0.5.2,...,0.0.71,0.0.72,0.0.73,0.0.74,0.0.75,0.0.76,0.0.77,0.0.78,0.0.79,0.0.80
0,1,0.083333,0.0,0.000000,0.0,1.000,0.2,0.375,0.875,0.375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.041667,0.0,0.666667,0.2,1.000,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.041667,0.0,0.000000,0.6,0.000,0.4,0.000,0.500,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.125000,0.0,0.833333,0.4,1.000,0.2,0.125,0.125,0.250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.041667,0.0,1.000000,0.0,1.000,0.0,0.625,0.875,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8534,0,0.041667,0.0,0.000000,0.2,0.625,0.0,0.625,1.000,0.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8535,0,0.041667,0.0,0.833333,1.0,1.000,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8536,0,0.083333,0.0,0.000000,1.0,1.000,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8537,0,0.041667,0.0,0.833333,1.0,1.000,0.0,0.500,1.000,1.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
data_dir = "./data/"

In [31]:
# upload to S3 bucket for training
import sagemaker
import os

session = sagemaker.Session() # Store the current SageMaker session

# S3 prefix (which folder will we use)
prefix = 'capstone_project'

#test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [32]:

scale_pos_wt = int(np.ceil(int(0.9*len(neg_idx)) / int(0.8*len(pos_idx))))
print("scale_pos_weight to try", scale_pos_wt)

scale_pos_weight to try 90


In [33]:
from sagemaker import get_execution_role


# Our current execution role is required when creating the model as the training
# and inference code will need to access the model artifacts.

role = get_execution_role()
session = sagemaker.Session() # Store the current SageMaker session

In [34]:
# We need to retrieve the location of the container which is provided by Amazon for using XGBoost.
# As a matter of convenience, the training and inference code both use the same container.
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(session.boto_region_name, 'xgboost')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').


In [35]:
# First we create a SageMaker estimator object for our model.
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)

# And then set the algorithm specific parameters.
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500,
                        scale_pos_weight = scale_pos_wt)  # scaling the positive examples for this imbalanced dataset

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [36]:
#Fit the XGBoost model
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [37]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2020-09-17 17:22:02 Starting - Starting the training job...
2020-09-17 17:22:04 Starting - Launching requested ML instances......
2020-09-17 17:23:10 Starting - Preparing the instances for training......
2020-09-17 17:24:24 Downloading - Downloading input data
2020-09-17 17:24:24 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-09-17:17:24:44:INFO] Running standalone xgboost training.[0m
[34m[2020-09-17:17:24:44:INFO] File size need to be processed in the node: 79.52mb. Available memory size in the node: 8465.7mb[0m
[34m[2020-09-17:17:24:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:24:44] S3DistributionType set as FullyReplicated[0m
[34m[17:24:45] 34422x270 matrix with 9293940 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-09-17:17:24:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:24:45] S3DistributionType set as FullyReplicated[0m
[34m[17:24:45] 8540x270 matrix w

In [38]:
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------!

In [39]:
from sagemaker.predictor import csv_serializer

# We need to tell the endpoint what format the data we are sending is in so that SageMaker can perform the serialization.
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

In [40]:
# We split the data into chunks and send each chunk seperately, accumulating the results.

def predict(data, rows=512):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])
    
    return np.fromstring(predictions[1:], sep=',')

In [41]:
validation = pd.read_csv(os.path.join(data_dir, 'validation.csv'), header=None).values
validation_X = validation[:,1:]
validation_y = [round(y) for y in validation[:,0]]
predictions = predict(validation_X)
predictions = [round(num) for num in predictions]

In [82]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import auc

In [43]:
accuracy_score(validation_y, predictions)

0.8227166276346605

In [44]:
# check balanced accuracy
balanced_accuracy_score(validation_y, predictions)

0.5335803632999013

In [118]:
#pos_idx_in_val = [i for i in range(len(validation_y)) if validation_y[i] == 1]

In [45]:
def fraction_correct(prediction, truth):
    pos_correct = 0
    pos_total = 0
    neg_correct = 0
    neg_total = 0
    for i in range(len(truth)):
        if truth[i] == 1:
            if truth[i] == prediction[i]:
                pos_correct += 1
            pos_total += 1
        else:
            if truth[i] == prediction[i]:
                neg_correct += 1
            neg_total += 1
    print("correct pos= ",pos_correct, "total pos=", pos_total)
    print("correct neg= ",neg_correct, "total neg=", neg_total)
    return pos_correct/pos_total, neg_correct/neg_total

In [46]:
fraction_correct(predictions, validation_y)

correct pos=  13 total pos= 54
correct neg=  7013 total neg= 8486


(0.24074074074074073, 0.826419985859062)

In [47]:
# what the metric on training itself?
training = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None).values
training_X = training[:,1:]
training_y = [round(y) for y in training[:,0]]
predictions_train = predict(training_X)
predictions_train = [round(num) for num in predictions_train]

In [48]:
accuracy_score(training_y, predictions_train)

0.8344372784846901

In [49]:
# check balanced accuracy
balanced_accuracy_score(training_y, predictions_train)

0.8923331265771731

In [50]:
fraction_correct(predictions_train, training_y)

correct pos=  455 total pos= 478
correct neg=  28268 total neg= 33944


(0.9518828451882845, 0.8327834079660618)

In [51]:
# lets get predictions on the test data 

In [55]:
test_X = pd.read_csv(os.path.join(data_dir, 'test_df_scaled.csv'), header=None).values # test data doesn't have labels
predictions_test = predict(test_X)
predictions_test = [round(num) for num in predictions_test]

In [56]:
test_X.shape

(42833, 270)

In [57]:
len(predictions_test)

42833

In [59]:
predictions_test[:10], predictions_test[-10:] # first and the last 10 predictions on test data

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [61]:
# lets save these predictions in a pickle
pickle.dump(predictions_test, open('data/test_predictions.p', 'wb'))

In [62]:
# lets test if serializing actually worked
t = pickle.load(open('data/test_predictions.p', 'rb'))

In [64]:
t[:10], t[-10:]  # it worked!

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [65]:
# lets delete the endpoint since we are done
xgb_predictor.delete_endpoint()

In [67]:
len(predictions_test)

42833

In [68]:
# let make the submission file for the test

In [69]:
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='sagemaker-us-east-2-840691022453', Key='Udacity_MAILOUT_052018_TEST.csv')
test_df = pd.read_csv(obj['Body'], sep=";")

  interactivity=interactivity, compiler=compiler, result=result)


In [70]:
test_df.shape

(42833, 366)

In [71]:
test_label_df = pd.DataFrame(predictions_test,columns=['RESPONSE'])

In [73]:
test_label_df.tail(n=10)

Unnamed: 0,RESPONSE
42823,0.0
42824,0.0
42825,0.0
42826,1.0
42827,0.0
42828,0.0
42829,0.0
42830,0.0
42831,0.0
42832,0.0


In [75]:
test_df['LNR'].shape

(42833,)

In [76]:
# creating the dataframe for submission
submission_df = pd.concat([test_df['LNR'], test_label_df], axis=1)

In [77]:
# checking if it worked
submission_df.shape

(42833, 2)

In [78]:
submission_df

Unnamed: 0,LNR,RESPONSE
0,1754,0.0
1,1770,0.0
2,1465,0.0
3,1470,0.0
4,1478,0.0
...,...,...
42828,67615,0.0
42829,67938,0.0
42830,67942,0.0
42831,67949,0.0


In [80]:
# converting to a csv format
submission_df.to_csv("data/submission.csv", index=False)