## Load required libraries

In [1]:
import boto3
import sagemaker
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.session import Session

# Build a single SageMaker session and derive the bucket, execution role and
# region from it, so every later cell works against the same configuration
# instead of opening additional default boto3/SageMaker sessions.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role(sagemaker_session=sess)
region = sess.boto_region_name

## Create an XGBoost estimator

In [2]:
# Construct a SageMaker estimator that runs the built-in XGBoost container.

from sagemaker.debugger import Rule, rule_configs

prefix = "team4-classification-models"
# Training output is written under s3://<bucket>/<prefix>/xgboost.
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'xgboost')

# Resolve the region-specific image URI for XGBoost version 1.2-1 and print it
# so the exact container used is recorded in the notebook output.
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    volume_size=5,
    output_path=s3_output_location,
    # Reuse the session created in the first cell (the one `bucket` came from)
    # rather than constructing a second, independent session here.
    sagemaker_session=sess,
    # Attach the built-in Debugger rule that produces the XGBoost training report.
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())]
)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1


## Set hyperparameters for XGBoost

In [3]:
# Hyperparameters forwarded to the XGBoost training job, collected in one
# mapping so they can be inspected or logged before being applied.
# NOTE(review): the SageMaker XGBoost hyperparameter reference lists `num_class`
# as required when the objective is "multi:softmax"; it is not set here.
# Confirm the number of target classes and add it, otherwise training may fail.
xgb_hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
    objective="multi:softmax",
    num_round=1000,
)
xgb_model.set_hyperparameters(**xgb_hyperparameters)

## Set paths for input files

In [4]:
from sagemaker.session import TrainingInput

# The inputs are CSV files, so declare the content type explicitly: the
# training container does not infer the format from the ".csv" suffix, and an
# undeclared channel is not guaranteed to be parsed as CSV.
CSV_CONTENT_TYPE = "text/csv"

train_input = TrainingInput(
    's3://ads508-team4-split/train/df_train.csv',
    content_type=CSV_CONTENT_TYPE,
)
validation_input = TrainingInput(
    's3://ads508-team4-split/validation/df_validation.csv',
    content_type=CSV_CONTENT_TYPE,
)
test_input = TrainingInput(
    's3://ads508-team4-split/test/df_test.csv',
    content_type=CSV_CONTENT_TYPE,
)

## Start training

In [None]:
# Launch the training job on the train/validation channels and block until it
# completes (wait=True), so the job's progress appears in the cell output.
data_channels = {"train": train_input, "validation": validation_input}
xgb_model.fit(inputs=data_channels, wait=True)

2022-03-27 04:03:21 Starting - Starting the training job...
2022-03-27 04:03:45 Starting - Launching requested ML instancesCreateXgboostReport: InProgress
ProfilerReport-1648353801: InProgress
.......