In [4]:
import boto3

# Region name of the default boto3 session; later cells use it to pick the
# matching XGBoost container image.
_default_boto_session = boto3.session.Session()
my_region = _default_boto_session.region_name

In [5]:
# Report which region was detected for this notebook instance.
region_message = "Success - the MySageMakerInstance is in the " + my_region + " region."
print(region_message)

Success - the MySageMakerInstance is in the us-east-1 region.


In [6]:
import urllib.request

In [7]:
# Download the pre-cleaned bank marketing CSV into the working directory.
# Any failure is reported on stdout rather than raised.
dataset_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
local_csv_name = "bank_clean.csv"
try:
    urllib.request.urlretrieve(dataset_url, local_csv_name)
    print("Success: downloaded bank_clean.csv.")
except Exception as download_error:
    print("Data load error: ", download_error)

Success: downloaded bank_clean.csv.


In [8]:
import pandas as pd

In [9]:
# Load the downloaded CSV into a DataFrame, using the first CSV column as
# the row index. Errors are printed, not raised.
csv_path = "./bank_clean.csv"
try:
    model_data = pd.read_csv(csv_path, index_col=0)
    print("Success: Data loaded into dataframe.")
except Exception as load_error:
    print("Data load error: ", load_error)

Success: Data loaded into dataframe.


In [10]:
# Preview the first rows of the loaded dataset as the cell output.
model_data.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [11]:
import numpy as np

In [12]:
# Shuffle all rows once with a fixed seed (reproducible), then cut the
# shuffled frame at the 70% mark: the first part is the training set, the
# remainder is the test set.
#
# Positional slicing with .iloc replaces np.split(DataFrame, ...), which
# routes through DataFrame.swapaxes and emits a deprecation warning on
# recent pandas releases. The resulting rows and their order are the same.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(model_data))
train_data = shuffled_data.iloc[:split_at]
test_data = shuffled_data.iloc[split_at:]

In [13]:
# Show (rows, columns) for both partitions to sanity-check the split.
train_shape = train_data.shape
test_shape = test_data.shape
print(train_shape, test_shape)

(28831, 61) (12357, 61)


In [14]:
# Build the training table with the label (y_yes) as the first column,
# followed by every feature column; both one-hot target columns are
# removed from the feature side.
train_labels = train_data["y_yes"]
train_features = train_data.drop(["y_no", "y_yes"], axis=1)
reformatted = pd.concat([train_labels, train_features], axis=1)

In [15]:
# Write the training table to a local CSV with no header row and no index
# column, so the file contains only label + feature values.
reformatted.to_csv(path_or_buf="train.csv", header=False, index=False)

In [18]:
# <--- SET THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
bucket_name = "6f3fxfxnwu89ufch"

In [20]:
# Create the S3 bucket that will hold the training data and model output.
# Failures are reported on stdout rather than raised.
s3 = boto3.resource("s3")
try:
    if my_region == "us-east-1":
        # us-east-1 is the S3 default location: the bucket must be created
        # without an explicit LocationConstraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly, otherwise the
        # request is rejected when sent to that region's endpoint.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={"LocationConstraint": my_region},
        )
    print("S3 bucket created successfully")
except Exception as e:
    print("S3 error: ", e)

S3 bucket created successfully


In [21]:
import os

# Upload the local train.csv to s3://<bucket_name>/marketing_dataset/train/train.csv.
# S3 object keys always use "/" as the separator, so the key is joined
# explicitly instead of with os.path.join (which would insert the local
# OS separator, e.g. a backslash on Windows).
train_object_key = "/".join(["marketing_dataset", "train/train.csv"])
boto3.Session().resource("s3").Bucket(bucket_name).Object(
    train_object_key
).upload_file("train.csv")

In [22]:
import sagemaker

# Describe the training channel: every object under the "train" prefix of
# the dataset folder, declared as CSV content.
# NOTE(review): `sagemaker.s3_input` looks like the SageMaker Python SDK v1
# name - confirm against the SDK version installed on the instance.
train_prefix_uri = "s3://{}/{}/train".format(bucket_name, "marketing_dataset")
s3_input_train = sagemaker.s3_input(s3_data=train_prefix_uri, content_type="csv")

In [23]:
# Region name -> ECR image URI of the XGBoost container to train with.
# Only the regions listed here are supported; any other region will fail
# the lookup in the estimator cell.
containers = dict(
    [
        ("us-west-2", "433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest"),
        ("us-east-1", "811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest"),
        ("us-east-2", "825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest"),
        ("eu-west-1", "685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest"),
    ]
)

In [24]:
# SageMaker session object; it is handed to the Estimator below via
# `sagemaker_session`.
sess = sagemaker.Session()

In [25]:
from sagemaker import get_execution_role    

# IAM role the training job runs under.
role = get_execution_role()

# Configure the XGBoost estimator: the container image is chosen by region,
# training uses a single ml.m4.xlarge instance, and the model artifact is
# written under the "marketing/output" prefix of the bucket.
xgb = sagemaker.estimator.Estimator(
    containers[my_region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket_name, "marketing"),
    sagemaker_session=sess,
)

# Hyperparameters for a binary classifier trained for 100 boosting rounds.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)

In [26]:
# Launch the training job with a single "train" channel.
training_channels = {"train": s3_input_train}
xgb.fit(training_channels)

2019-11-20 20:51:56 Starting - Starting the training job...
2019-11-20 20:52:03 Starting - Launching requested ML instances......
2019-11-20 20:53:09 Starting - Preparing the instances for training......
2019-11-20 20:54:04 Downloading - Downloading input data...
2019-11-20 20:54:57 Training - Training image download completed. Training in progress..[31mArguments: train[0m
[31m[2019-11-20:20:54:57:INFO] Running standalone xgboost training.[0m
[31m[2019-11-20:20:54:57:INFO] Path /opt/ml/input/data/validation does not exist![0m
[31m[2019-11-20:20:54:57:INFO] File size need to be processed in the node: 3.38mb. Available memory size in the node: 8602.11mb[0m
[31m[2019-11-20:20:54:57:INFO] Determined delimiter of CSV input is ','[0m
[31m[20:54:57] S3DistributionType set as FullyReplicated[0m
[31m[20:54:57] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[20:54:57] src/tree/updater_prune.cc:74: tree pruning


2019-11-20 20:55:09 Uploading - Uploading generated training model
2019-11-20 20:55:09 Completed - Training job completed
Training seconds: 65
Billable seconds: 65


In [27]:
# Deploy the trained model behind an endpoint backed by one ml.m4.xlarge
# instance; the returned predictor is used for inference below.
xgb_predictor = xgb.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

--------------------------------------------------------------------------------------------------!

In [28]:
# Feature matrix for inference: the test rows without either target column,
# as a NumPy array.
test_features = test_data.drop(["y_no", "y_yes"], axis=1)
test_data_array = test_features.values

# Requests to the endpoint are sent as CSV text.
xgb_predictor.content_type = "text/csv"

In [31]:
from sagemaker.predictor import csv_serializer

# Use the SDK's csv_serializer for request payloads, matching the
# "text/csv" content type set on the predictor.
xgb_predictor.serializer = csv_serializer

In [32]:
# Score the whole test matrix in one request; the endpoint answers with
# bytes, which are decoded into a comma-separated string of scores.
raw_response = xgb_predictor.predict(test_data_array)
predictions = raw_response.decode("utf-8")

In [33]:
# Display the decoded response string (comma-separated scores).
# NOTE(review): this shows the entire string, one score per test row;
# consider displaying a slice such as predictions[:200] to keep the saved
# notebook small.
predictions

'0.0602235160768,0.0892826914787,0.0591339841485,0.365399450064,0.132321238518,0.072853974998,0.0322009883821,0.0353233553469,0.0291406102479,0.0376748815179,0.113088496029,0.0485837049782,0.0590370669961,0.0585256256163,0.0409621670842,0.0292348116636,0.0732434168458,0.0312618277967,0.0891591832042,0.0711572319269,0.0355262830853,0.054693095386,0.0312889516354,0.10143468529,0.0639288127422,0.0578431636095,0.0499844141304,0.0351017266512,0.0449749417603,0.0320359431207,0.105021767318,0.011506174691,0.102485798299,0.0346580147743,0.0501317307353,0.0493454635143,0.188746333122,0.0466941520572,0.498958796263,0.0772133693099,0.0492646954954,0.0522507615387,0.028265343979,0.0574337132275,0.0459359772503,0.064650721848,0.0892403423786,0.107194952667,0.154861181974,0.0753904804587,0.0848136171699,0.158836394548,0.0900727808475,0.025450443849,0.0704037770629,0.751563847065,0.0565032921731,0.0253172945231,0.0711784511805,0.111373201013,0.0259268973023,0.0399601012468,0.0611408576369,0.042134188

In [34]:
# Parse the comma-separated score string into a 1-D float array.
#
# The whole string is parsed. The previous `predictions[1:]` slice removed
# the first character of the response, i.e. the leading digit of the first
# score. That only went unnoticed because the first score starts with "0."
# (".0602" still parses to the same value); a first score such as
# "1.5e-05" would have been silently corrupted.
predictions_array = np.fromstring(predictions, sep=",")

In [35]:
# Display the parsed scores (NumPy abbreviates long arrays in its repr).
predictions_array

array([0.06022352, 0.08928269, 0.05913398, ..., 0.0573156 , 0.02460817,
       0.03605176])

In [36]:
# Print the shape of the parsed scores; the length should equal the number
# of test rows printed by the earlier train/test shape check.
print(predictions_array.shape)

(12357,)


In [37]:
# Confusion matrix: rows are the observed labels (y_yes), columns are the
# predicted labels obtained by rounding each score to 0 or 1.
observed_labels = test_data["y_yes"]
predicted_labels = np.round(predictions_array)
cm = pd.crosstab(
    observed_labels,
    predicted_labels,
    rownames=["Observed"],
    colnames=["Predicted"],
)

In [38]:
# Display the confusion matrix (rows: Observed, columns: Predicted).
cm

Predicted,0.0,1.0
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10785,151
1,1143,278


In [39]:
# Pull the four cells out of the confusion matrix by position.
# Row = observed class, column = predicted class (0 first, then 1).
tn = cm.iat[0, 0]  # observed 0, predicted 0
fp = cm.iat[0, 1]  # observed 0, predicted 1
fn = cm.iat[1, 0]  # observed 1, predicted 0
tp = cm.iat[1, 1]  # observed 1, predicted 1

In [40]:
# Overall classification rate in percent: correct predictions over all
# predictions.
correct_count = tp + tn
total_count = tp + tn + fp + fn
p = correct_count / total_count * 100

In [41]:
# Print the overall classification rate with one decimal place, surrounded
# by blank lines.
accuracy_line = "\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p)
print(accuracy_line)


Overall Classification Rate: 89.5%



In [42]:
# Print the confusion matrix as a small text table. Percentages are taken
# per predicted column: each cell is divided by the total number of rows
# predicted for that class.
predicted_no_total = tn + fn
predicted_yes_total = tp + fp

header_line = "{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase")
print(header_line)
print("Observed")

no_purchase_line = "{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
    "No Purchase",
    tn / predicted_no_total * 100,
    tn,
    fp / predicted_yes_total * 100,
    fp,
)
print(no_purchase_line)

purchase_line = "{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
    "Purchase",
    fn / predicted_no_total * 100,
    fn,
    tp / predicted_yes_total * 100,
    tp,
)
print(purchase_line)

Predicted      No Purchase    Purchase
Observed
No Purchase    90% (10785)    35% (151)
Purchase        10% (1143)     65% (278) 



In [43]:
# Clean-up: delete the endpoint created by the deploy cell.
endpoint_name = xgb_predictor.endpoint
sagemaker.Session().delete_endpoint(endpoint_name)

In [44]:
# Handle to the tutorial bucket, used by the clean-up cell that follows.
s3_cleanup_resource = boto3.resource("s3")
bucket_to_delete = s3_cleanup_resource.Bucket(bucket_name)

In [45]:
# Clean-up: delete every object in the bucket. The return value (the S3
# delete response, including the list of deleted keys) is shown as the
# cell output. No visible cell deletes the bucket itself.
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'A529242B22DA1BD2',
   'HostId': 'ZZDW0FZ76dJ1hd9wYeX04rKJ8jv74cEzub6SUPBtXyg8oI7AtiQy1uel9Buj29Og9ROUqzXSokQ=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'ZZDW0FZ76dJ1hd9wYeX04rKJ8jv74cEzub6SUPBtXyg8oI7AtiQy1uel9Buj29Og9ROUqzXSokQ=',
    'x-amz-request-id': 'A529242B22DA1BD2',
    'date': 'Wed, 20 Nov 2019 21:10:21 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'marketing/output/xgboost-2019-11-20-20-51-56-256/output/model.tar.gz'},
   {'Key': 'marketing_dataset/train/train.csv'}]}]