## Load required libraries

In [2]:
import boto3 
import sagemaker
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.session import Session

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

## Create an XGBoost estimator

In [2]:
# Construct a SageMaker estimator that calls the xgboost-container

from sagemaker.debugger import Rule, rule_configs
from sagemaker import image_uris

bucket = "ads508-team4-xgboost"
prefix = "models"
s3_output_location='s3://{}/{}/{}'.format(bucket, prefix, 'xgboost')

# Set up container

container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

xgb_model = sagemaker.estimator.Estimator(
    image_uri = container,
    role = role,
    instance_count = 1,
    instance_type = 'ml.m5.large',
    volume_size = 5,
    output_path = s3_output_location,
    sagemaker_session = sagemaker.Session(),
    rules = [Rule.sagemaker(rule_configs.create_xgboost_report())]
)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1


## Set hyperparameters for xgboost

In [3]:
xgb_model.set_hyperparameters(
    max_depth = 5,
    eta = 0.2,
    gamma = 4,
    min_child_weight = 6,
    subsample = 0.7,
    objective = "multi:softmax",
    num_round = 20,
    num_class = 8
)

## Set path for input files

In [4]:
from sagemaker.session import TrainingInput

content_type = "csv"

train_input = TrainingInput('s3://ads508-team4-split/train/df_train.csv',content_type = content_type)
validation_input = TrainingInput('s3://ads508-team4-split/validation/df_validation.csv', content_type = content_type)
test_input = TrainingInput('s3://ads508-team4-split/test/df_test.csv', content_type = content_type)

## Start Training

In [5]:
xgb_model.fit({"train": train_input, "validation": validation_input}, wait=True)

2022-03-30 19:55:33 Starting - Starting the training job...
2022-03-30 19:55:57 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
ProfilerReport-1648670133: InProgress
.........
2022-03-30 19:57:31 Downloading - Downloading input data......
2022-03-30 19:58:31 Training - Downloading the training image......
2022-03-30 19:59:31 Training - Training image download completed. Training in progress.[34m[2022-03-30 19:59:24.315 ip-10-0-83-31.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input 

## Show the name of the training job

In [6]:
training_job_name = xgb_model.latest_training_job.name
print("Training Job Name:  {}".format(training_job_name))

Training Job Name:  sagemaker-xgboost-2022-03-30-19-55-33-695


## Show training job metrics

In [7]:
xgb_model.training_job_analytics.dataframe()



Unnamed: 0,timestamp,metric_name,value
0,0.0,train:merror,0.616582
1,60.0,train:merror,0.585638
2,120.0,train:merror,0.56865
3,0.0,validation:merror,0.621455
4,60.0,validation:merror,0.590569
5,120.0,validation:merror,0.573125


# Deploy the model to a real-time endpoint

In [8]:
xgb_predictor = xgb_model.deploy(initial_instance_count = 1, instance_type = 'ml.m5.xlarge')

------!

To send it in an HTTP POST request, we'll serialize it as a CSV string and then decode the resulting CSV.

In [14]:
!pip install boto3 --upgrade
from sagemaker.serializers import CSVSerializer


xgb_predictor.serializers = sagemaker.serializers.CSVSerializer()

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


# Download our test file and saved it to local instance

In [16]:
import pandas as pd
import numpy as np
import csv

!aws s3 cp 's3://ads508-team4-split/test/df_test.csv' ./data/

df_test = pd.read_csv(
    "./data/df_test.csv",
    delimiter=",",
    quoting=csv.QUOTE_NONE,
)
df_test.head()

download: s3://ads508-team4-split/test/df_test.csv to data/df_test.csv


Unnamed: 0,4,115,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,...,0.63,1.2,0.64,0.65,0.66,0.67,0.68,0.69,1.3,0.70
0,6,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,0,84,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,2,77,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,5,44,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,1,79,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
def predict(data, rows = 500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions,xgb_predictor.predict(array).decode('utf-8')])
    
    return np.fromstring(prediction[1:], sep = ',')

predictions = predict(df_test.iloc[:,1:].to_numpy())

NameError: name 'dataframe' is not defined