In [1]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve
from sagemaker.analytics import TrainingJobAnalytics

# Initialize the SageMaker session and get the execution role
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Specify the S3 paths for the input datasets without headers
train_data_s3_path = 's3://pre-processed-aws-credit-data-train/train_no_header.csv'
test_data_s3_path = 's3://pre-processed-aws-credit-data-test/test_no_header.csv'

# Specify the output path for the model artifacts in S3
output_path = f's3://{sagemaker_session.default_bucket()}/xgboost-model-artifacts'

# Get the XGBoost Docker image URI for the current region
xgboost_image_uri = retrieve('xgboost', sagemaker_session.boto_region_name, 'latest')

# Configure the XGBoost estimator
xgboost_estimator = sagemaker.estimator.Estimator(image_uri=xgboost_image_uri,
                                                  role=role,
                                                  instance_count=1,
                                                  instance_type='ml.m5.xlarge',
                                                  output_path=output_path,
                                                  sagemaker_session=sagemaker_session)

# Set XGBoost hyperparameters
xgboost_estimator.set_hyperparameters(max_depth=4,  # Reduced tree depth to prevent overfitting
                                      eta=0.1,      # Lower learning rate
                                      gamma=1,      # Minimum loss reduction required to make a further partition
                                      min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child
                                      subsample=0.8,  # Subsample ratio of the training instances
                                      colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
                                      eval_metric='auc,error',  # Evaluation metrics for validation data
                                      objective='binary:logistic',  # Learning task and the corresponding learning objective
                                      num_round=150)  # Increased number of boosting rounds

# Specify the input data configuration
train_input = TrainingInput(train_data_s3_path, content_type='csv')
test_input = TrainingInput(test_data_s3_path, content_type='csv')

# Train the model
xgboost_estimator.fit({'train': train_input, 'validation': test_input})

# Retrieve and display the metrics
training_job_name = xgboost_estimator.latest_training_job.name
metrics_dataframe = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()

if metrics_dataframe.empty:
    print("No metrics found.")
else:
    print(metrics_dataframe)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


INFO:sagemaker:Creating training-job with name: xgboost-2024-04-03-00-42-57-055


2024-04-03 00:42:57 Starting - Starting the training job...
2024-04-03 00:43:13 Starting - Preparing the instances for training......
2024-04-03 00:44:09 Downloading - Downloading input data...
2024-04-03 00:44:54 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2024-04-03:00:45:05:INFO] Running standalone xgboost training.[0m
[34m[2024-04-03:00:45:05:INFO] File size need to be processed in the node: 1.02mb. Available memory size in the node: 8174.16mb[0m
[34m[2024-04-03:00:45:05:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:45:05] S3DistributionType set as FullyReplicated[0m
[34m[00:45:05] 800x115 matrix with 92000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-04-03:00:45:05:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:45:05] S3DistributionType set as FullyReplicated[0m
[34m[00:45:05] 200x115 matrix with 23000 entries loaded from /opt/ml/input/da



   timestamp       metric_name     value
0        0.0       train:error  0.036250
1        0.0    validation:auc  0.651636
2        0.0         train:auc  0.995591
3        0.0  validation:error  0.280000


In [2]:
print(sagemaker_session.default_bucket())

sagemaker-eu-west-2-533267019472


In [3]:
training_job_name = xgboost_estimator.latest_training_job.name
print(training_job_name)

xgboost-2024-04-03-00-42-57-055


In [5]:
from sagemaker.model import Model

model_artifacts = f's3://{sagemaker_session.default_bucket()}/xgboost-model-artifacts'

xgboost_model = Model(
    image_uri=xgboost_image_uri,
    model_data=model_artifacts + '/' + training_job_name + '/output/model.tar.gz',
    role=role,
    sagemaker_session=sagemaker_session
)


In [9]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

# Assuming your endpoint is successfully created and named as per training_job_name
predictor = Predictor(endpoint_name=training_job_name, 
                      sagemaker_session=sagemaker_session, 
                      serializer=CSVSerializer())


In [11]:
from sagemaker.serializers import CSVSerializer

predictor.serializer = CSVSerializer()

# Example data: Replace this with your actual data
sample_data = '0.565283114,2.270555751,0.254864073,0.897875586,-0.080145932,-0.307219556,0.130250766,0.089258371,0.204528944,-0.361047264,-0.437749871,-0.272854754,-0.510361791,-0.510318646,0.724003444,-0.522898371,-0.225229576,-0.19869169,-0.671721882,-0.66903502,-0.62881387,-0.99125228,-0.374475409,-0.374449243,-0.194733938,-0.163454639,0.043810477,-0.209404602,-0.119876006,-0.12529171,-0.145581911,-0.239228964,-2.831235571,-0.282422226,-0.271669814,-0.297611682,1.734811564,1.973625615,1.202641806,1.346977552,-0.338650173,-0.211283028,1.354376796,1.31765858,0.473318168,0.340876031,-0.344246078,-0.268517128,2.067566951,2.06762886,-0.000867436,1.500318467,-0.160398257,-0.117144674,0.158872015,0.150755358,-0.038488281,0.052959981,-0.518228847,-0.234879238,0.282277011,0.198372961,0.086117861,-0.292263721,-0.384823572,-0.287334023,0.77809923,0.777105345,-0.154274757,0.315705117,-0.474329361,-0.31662534,0.91394728,0.87989371,0.191219386,0.333878806,-0.483783049,-0.328532823,-0.563439243,1,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,10'
prediction = predictor.predict(sample_data)
print(prediction)


b'0.7627398371696472'
