In [2]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.image_uris import retrieve

session = boto3.Session()

role = get_execution_role()

region = session.region_name

container = retrieve('xgboost', region, version='1.5-1')  


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
bucket = 'fsbucket09'
data_key = 'preprocessed_data.csv' 
s3_data_path = f's3://{bucket}/{data_key}'

role = get_execution_role()

# Retrieving the XGBoost Image URI

In [8]:
region = sagemaker.Session().boto_region_name 
xgboost_version = '1.2-2'  
container = retrieve('xgboost', region, version=xgboost_version)


# Creating SageMaker Estimator

In [28]:
xgb = Estimator(image_uri=container,
                role=role,
                instance_count=1,  
                instance_type='ml.m4.xlarge', 
                output_path=f's3://{bucket}/output', 
                sagemaker_session=sagemaker.Session())

xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        verbosity=0, 
                        objective='reg:squarederror',  
                        num_round=100)

# Loadind the data 

In [17]:
import pandas as pd

local_data_path = 'preprocessed_data.csv' 
df = pd.read_csv(local_data_path)

# Splitting the data

In [18]:
from sklearn.model_selection import train_test_split

X = df.drop(df.columns[0], axis=1)
y = df[df.columns[0]]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = pd.concat([y_train, X_train], axis=1)
val_data = pd.concat([y_val, X_val], axis=1)


# Uploading the Split Data to S3

In [19]:
import boto3

s3_client = boto3.client('s3')
bucket_name = 'fstbucket09'

train_data.to_csv('train_data.csv', header=False, index=False)
val_data.to_csv('val_data.csv', header=False, index=False)

train_key = 'train/train_data.csv'
val_key = 'validation/val_data.csv'
s3_client.upload_file('train_data.csv', bucket_name, train_key)
s3_client.upload_file('val_data.csv', bucket_name, val_key)


# Referencing the S3 paths and creating input channels to sagemaker

In [21]:
from sagemaker.inputs import TrainingInput

s3_input_train = TrainingInput(s3_data=f's3://{bucket_name}/{train_key}', content_type='csv')
s3_input_val = TrainingInput(s3_data=f's3://{bucket_name}/{val_key}', content_type='csv')


# Start the training 

In [29]:
output_prefix = 'output' 
output_path = f's3://{bucket_name}/{output_prefix}'

xgb = Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=output_path,
    sagemaker_session=sagemaker.Session()
)



In [None]:
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri


bucket_name = 'fstbucket09' 
output_prefix = 'output'  

region = sagemaker.Session().boto_region_name 
container = get_image_uri(region, 'xgboost', repo_version='1.0-1')  

role = sagemaker.get_execution_role()

t
xgb = sagemaker.estimator.Estimator(
    image_name=container, 
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket_name}/{output_prefix}',
    sagemaker_session=sagemaker.Session()
)

xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    objective='reg:squarederror',
    num_round=100
)



In [30]:
import sagemaker
from sagemaker import image_uris

region = sagemaker.Session().boto_region_name  # or manually specify your region e.g., 'us-east-1'
xgboost_image_uri = image_uris.retrieve(framework='xgboost', region=region, version='latest')
print(xgboost_image_uri)


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest


In [54]:
import sagemaker
from sagemaker.estimator import Estimator

role = sagemaker.get_execution_role() 
output_path = 's3://your-bucket-name/model-artifacts' 

xgb_estimator = Estimator(image_uri='825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
                          role=role,
                          instance_count=1,
                          instance_type='ml.t3.medium',
                          output_path=output_path,
                          sagemaker_session=sagemaker.Session())

xgb_estimator.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='reg:linear',
    num_round=100
)


In [55]:
xgb.set_hyperparameters(
    objective='reg:linear',  
    num_round=100
)


In [56]:
output_path = f's3://{"fstbucket09"}/output'



In [57]:
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://fstbucket09/train/train_data_no_header.csv', content_type='csv')


In [47]:
import pandas as pd

if 'category_column' in df.columns:
    df['category_column'] = df['category_column'].astype('category').cat.codes

for col in df.columns:
    if df[col].dtype == 'object':
        df.drop(col, inplace=True, axis=1)
        
df.to_csv('train_data_no_header.csv', header=False, index=False)


In [48]:
import boto3
s3 = boto3.client('s3')
s3.upload_file('train_data_no_header.csv', 'fstbucket09', 'train/train_data_no_header.csv')


In [62]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.amazon.amazon_estimator import get_image_uri

# Set up the SageMaker session, role
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Set up the image URI for XGBoost (make sure to specify the version you want to use)
xgboost_container = get_image_uri(sagemaker_session.boto_region_name, 'xgboost', '1.0-1')  # Specify your XGBoost version here

# Define the data channels for input data in S3
s3_input_train = TrainingInput(s3_data='s3://fstbucket09/train/', content_type='csv')
s3_input_validation = TrainingInput(s3_data='s3://fstbucket09/validation/', content_type='csv')

# Configure the XGBoost estimator
xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://fstbucket09/output/',
    sagemaker_session=sagemaker_session
)

# Set the hyperparameters
xgb_estimator.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,  
    objective='reg:linear',
    num_round=100
)

# Fit the model
xgb_estimator.fit({'train': s3_input_train, 'validation': s3_input_validation})


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-03-30-03-28-52-587


2024-03-30 03:28:52 Starting - Starting the training job...
2024-03-30 03:29:07 Starting - Preparing the instances for training......
2024-03-30 03:30:05 Downloading - Downloading input data...
2024-03-30 03:30:45 Downloading - Downloading the training image.....[34m[2024-03-30 03:31:36.630 ip-10-0-88-2.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determ

In [63]:

model_data = xgb_estimator.model_data


In [65]:
from sagemaker.serializers import CSVSerializer

predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=CSVSerializer()
)


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-03-30-03-36-42-157
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-03-30-03-36-42-157
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-03-30-03-36-42-157


------!

In [68]:

prediction_data = '30,70000,7,2,5,18,710'
prediction = predictor.predict(prediction_data).decode('utf-8')
print(prediction)

394378.03125


In [None]:
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter

hyperparameter_ranges = {
    'eta': ContinuousParameter(0.01, 0.2),
    'min_child_weight': IntegerParameter(1, 10),
    'gamma': ContinuousParameter(0, 5),
    'subsample': ContinuousParameter(0.5, 0.9),
   
}

tuner = HyperparameterTuner(
    estimator=xgb_estimator,
    objective_metric_name='validation:rmse', 
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=[{'Name': 'validation:rmse', 'Regex': 'validation-rmse:([0-9\\.]+)'}], 
    max_jobs=20,
    max_parallel_jobs=3,
    objective_type='Minimize', 
)

tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})
