# Load libraries

In [1]:
# pip3 install s3fs
from sagemaker import Session
import sagemaker
import boto3
import re
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import os

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [6]:
role = get_execution_role()

bucket_name = 'eliezerraj-908671954593-dataset'
prefix_name = 'payment/notebook/output'

train_path = f"s3://{bucket_name}/{prefix_name}/train"
validation_path = f"s3://{bucket_name}/{prefix_name}/validation"
test_path = f"s3://{bucket_name}/{prefix_name}/test"

x_train_data_file = f's3://{bucket_name}/{prefix_name}/train/train_data.csv'
x_train_input = sagemaker.inputs.TrainingInput(x_train_data_file, content_type='csv')
validation_data_file = f's3://{bucket_name}/{prefix_name}/validation/validation_data.csv'
validation_input = sagemaker.inputs.TrainingInput(validation_data_file, content_type='text/csv')

print("---------------------------------")
print(f"bucket_name '{bucket_name}':")
print(f"prefix_name '{prefix_name}':")

print(f"x_train_data_file '{x_train_data_file}':")
print(f"validation_data_file '{validation_data_file}':")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
s3://eliezerraj-908671954593-dataset/payment/notebook/output/train/train_data.csv
s3://eliezerraj-908671954593-dataset/payment/notebook/output/validation/validation_data.csv
---------------------------------
bucket_name 'eliezerraj-908671954593-dataset':
prefix_name 'payment/notebook/output':
x_train_data_file 's3://eliezerraj-908671954593-dataset/payment/notebook/output/train/train_data.csv':
validation_data_file 's3://eliezerraj-908671954593-dataset/payment/notebook/output/validation/validation_data.csv':


# Prepare the model

In [8]:
container = sagemaker.image_uris.retrieve(region=boto3.Session().region_name,
                                          framework='xgboost',
                                          version='latest')

In [9]:
sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    instance_count=1,
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}'.format(bucket_name,
                                                                    prefix_name),
                                    sagemaker_session=sess)

xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        num_round=100)

xgb.fit({'train': x_train_input,
         'validation': validation_data_file})

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


INFO:sagemaker:Creating training-job with name: xgboost-2024-04-22-00-30-16-096


2024-04-22 00:30:16 Starting - Starting the training job...
2024-04-22 00:30:30 Starting - Preparing the instances for training......
2024-04-22 00:31:38 Downloading - Downloading input data......
2024-04-22 00:32:18 Downloading - Downloading the training image...
2024-04-22 00:33:09 Training - Training image download completed. Training in progress.
2024-04-22 00:33:09 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2024-04-22:00:33:05:INFO] Running standalone xgboost training.[0m
[34m[2024-04-22:00:33:05:INFO] File size need to be processed in the node: 2.04mb. Available memory size in the node: 8483.96mb[0m
[34m[2024-04-22:00:33:05:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:33:05] S3DistributionType set as FullyReplicated[0m
[34m[00:33:05] 36256x12 matrix with 435072 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-04-22:00:33:05:INFO] Determined delimiter of CSV input is ','[0m
[

# Model Validation

In [10]:
# Deploy Model
xgb_predictor = xgb.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-04-22-00-43-07-882
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-04-22-00-43-07-882
INFO:sagemaker:Creating endpoint with name xgboost-2024-04-22-00-43-07-882


-----!

In [11]:
# Save model
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()

In [16]:
test_data_file = f's3://{bucket_name}/{prefix_name}/test/test_data.csv'
df_test_data = pd.read_csv(test_data_file, header=None)

print("---------------------------------")
print(f"df_test_data '{df_test_data.shape}':")

---------------------------------
df_test_data '(5180, 13)':


In [17]:
df_test_data_x = df_test_data.iloc[:, 1:]
df_test_data_y = df_test_data.iloc[:, :1]

In [18]:
df_test_data_x.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12
0,67,1,0,1,121.0,2,206.0,2,206.0,3,179.67,1762
1,71,1,0,1,102.0,2,378.0,2,378.0,9,287.78,0
2,331,0,1,1,729.0,4,609.0,6,525.0,16,354.19,0


In [19]:
df_test_data_y.head(3)

Unnamed: 0,0
0,0
1,0
2,1


In [20]:
def predict(data, predictor, rows=500 ):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [21]:
predictions = predict(df_test_data_x, xgb_predictor)

  return bound(*args, **kwds)


In [22]:
np.round(predictions)

array([0., 0., 1., ..., 0., 0., 0.])

In [45]:
pd.crosstab(index = df_test_data_y[0], 
            columns=np.round(predictions), 
            rownames=['actuals'], colnames=['predictions'])

predictions,0.0,1.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4023,18
1,61,1078


# Make a Prediction

In [46]:
payment_data = [{'distance': 100,
                 'card_model_CHIP': 0.,
                 'card_model_VIRTUAL': 1.,
                 'card_type_CREDIT': 1.,
                 'amount': 2000,
                 'tx_1d': 9,
                 'avg_1d': 1365.00,
                 'tx_7d': 17,
                 'avg_7d': 263.529412,
                 'tx_30d': 28,
                 'avg_30d': 238.714286,
                 'time_btw_cc_tx': 97582.0}]
df_predict = pd.DataFrame(payment_data)
df_predict

Unnamed: 0,distance,card_model_CHIP,card_model_VIRTUAL,card_type_CREDIT,amount,tx_1d,avg_1d,tx_7d,avg_7d,tx_30d,avg_30d,time_btw_cc_tx
0,100,0.0,1.0,1.0,2000,9,1365.0,17,263.529412,28,238.714286,97582.0


In [47]:
y_pred = xgb_predictor.predict(df_predict).decode('utf-8')
print(y_pred)
print(np.round(float(y_pred)))

0.9611890316009521
1.0


In [48]:
import boto3
client = boto3.client(service_name="sagemaker")
runtime = boto3.client(service_name="sagemaker-runtime")
endpoint_name='xgboost-2024-04-22-00-43-07-882'

In [49]:
# Endpoint invocation
payload = b"100., 0., 1., 1., 400., 365.0, 17., 263.529412, 28., 238.714286, 97582.0"

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=payload,
    ContentType="text/csv",
)

y_predict = response["Body"].read().decode()
print(y_predict)
print(np.round(float(y_predict)))

0.25696900486946106
0.0


# Make Validation

In [50]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                         'min_child_weight': ContinuousParameter(1, 10),
                         'alpha': ContinuousParameter(0, 2),
                         'max_depth': IntegerParameter(1, 10)}

objective_metric_name = 'validation:auc'

In [52]:
tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=20,
                            max_parallel_jobs=3)

In [53]:
tuner.fit({'train': x_train_input, 'validation': validation_data_file})

INFO:sagemaker:Creating hyperparameter tuning job with name: xgboost-240422-0109


..............................................................................................................!


In [54]:
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

'Completed'

In [55]:
tuner.best_training_job()

'xgboost-240422-0109-014-09dcfba9'

In [56]:
tuner_predictor = tuner.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')


2024-04-22 01:17:23 Starting - Found matching resource for reuse
2024-04-22 01:17:23 Downloading - Downloading the training image
2024-04-22 01:17:23 Training - Training image download completed. Training in progress.
2024-04-22 01:17:23 Uploading - Uploading generated training model
2024-04-22 01:17:23 Completed - Resource reused by training job: xgboost-240422-0109-017-071c1130

INFO:sagemaker:Creating model with name: xgboost-2024-04-22-01-20-54-832





INFO:sagemaker:Creating endpoint-config with name xgboost-240422-0109-014-09dcfba9
INFO:sagemaker:Creating endpoint with name xgboost-240422-0109-014-09dcfba9


-----!

In [57]:
tuner_predictor.serializer = sagemaker.serializers.CSVSerializer()

In [59]:
test_data_file = f's3://{bucket_name}/{prefix_name}/test/test_data.csv'
df_test_data = pd.read_csv(test_data_file, header=None)
df_test_data_x = df_test_data.iloc[:, 1:]
df_test_data_y = df_test_data.iloc[:, :1]

print("---------------------------------")
print(f"df_test_data '{df_test_data.shape}':")

---------------------------------
df_test_data '(5180, 13)':


In [60]:
pd.crosstab(index=df_test_data_y[0], columns=np.round(predictions), rownames=['actuals'], colnames=['predictions'])

predictions,0.0,1.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4023,18
1,61,1078


In [61]:
xgb_predictor.delete_endpoint(delete_endpoint_config=True)
tuner_predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: xgboost-2024-04-22-00-43-07-882
INFO:sagemaker:Deleting endpoint with name: xgboost-2024-04-22-00-43-07-882
INFO:sagemaker:Deleting endpoint configuration with name: xgboost-240422-0109-014-09dcfba9
INFO:sagemaker:Deleting endpoint with name: xgboost-240422-0109-014-09dcfba9
