In [3]:
import sagemaker 
import boto3 
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [12]:
# Placeholder bucket name: replace it with a globally unique name of your own.
# (Earlier candidate kept for reference: 'sagemaker-insurance-predict-bill'.)
# NOTE(review): S3 bucket names may not contain underscores, so 'bucket_test' looks like
# a pure placeholder — a later cell reassigns bucket_name before any bucket is created.
my_region = boto3.session.Session().region_name  # region configured for the default boto3 session
bucket_name = 'bucket_test'
print(my_region, bucket_name, sep='\n')

eu-west-3
bucket_test


In [16]:
import boto3

# Bucket name and the region the bucket should live in.
bucket_name = 'bucket-david-predict-insurance'
my_region = 'eu-west-3'

# Pin the S3 resource to the target region so the request and the LocationConstraint agree.
s3 = boto3.resource('s3', region_name=my_region)
try:
    if my_region == 'us-east-1':
        # us-east-1 is the default location: S3 rejects an explicit LocationConstraint for it.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region must be named explicitly. The previous version only handled
        # 'eu-west-3' and silently created nothing for any other region.
        s3.create_bucket(Bucket=bucket_name,
                         CreateBucketConfiguration={
                             'LocationConstraint': my_region
                         })
    print('S3 bucket created successfully')
except Exception as e:
    # Best-effort: report the problem (e.g. the bucket already exists) and let the notebook continue.
    print('S3 error: ', e)

S3 bucket created successfully


In [17]:
# S3 location under which the training job stores the trained model.
# It is composed from the bucket name and a key prefix that the later cells reuse.
prefix = 'xgboost-as-a-built-in-algo'
output_template = 's3://{}/{}/output'
output_path = output_template.format(bucket_name, prefix)
print(output_path)

s3://bucket-david-predict-insurance/xgboost-as-a-built-in-algo/output


Downloading The Dataset And Storing in S3

In [31]:
import pandas as pd
import urllib
import os

# Step 1: fetch the 'insurance' dataset through pycaret and persist it next to the notebook.
# The previous version never wrote the fetched frame to disk, so the read below only
# worked when a stale insurance.csv happened to be present in the working directory.
try:
    from pycaret.datasets import get_data
    data = get_data('insurance')
    data.to_csv('insurance.csv', index=False)
    print('Success: downloaded insurance.csv.')
    print(os.getcwd())
except Exception as e:
    # Best-effort: report the failure; the read below may still succeed from an existing file.
    print('Data load error: ',e)

# Step 2: load the persisted CSV into the DataFrame used by the rest of the notebook.
try:
    model_data = pd.read_csv('insurance.csv')
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Success: downloaded data.csv.
/home/sagemaker-user
Success: Data loaded into dataframe.


In [32]:
# List the files in the notebook's current working directory.
!ls

AzureMLDeploy	data.csv		 insurance.csv	test-sgm.ipynb
best_model.pkl	insurance-predict.ipynb  logs.log	train.csv


In [33]:
# Display the DataFrame loaded from insurance.csv.
model_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [75]:
### Train Test split
import numpy as np

# Shuffle all rows reproducibly (fixed seed), then cut at 70% for train / 30% for test.
# Positional slicing yields the same two frames as np.split(frame, [cut]) did, without
# relying on DataFrame.swapaxes, which recent pandas versions deprecate.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(model_data))
# .copy() detaches the splits from the shuffled frame so later column edits do not warn.
train_data = shuffled_data.iloc[:split_at].copy()
test_data = shuffled_data.iloc[split_at:].copy()
print(train_data.shape, test_data.shape)
test_data

(936, 7) (402, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
585,33,female,28.270,1,no,southeast,4779.60230
831,36,female,25.840,0,no,northwest,5266.36560
739,29,male,35.500,2,yes,southwest,44585.45587
696,53,female,32.300,2,no,northeast,29186.48236
174,24,female,33.345,0,no,northwest,2855.43755
...,...,...,...,...,...,...,...
315,52,male,33.250,0,no,northeast,9722.76950
857,25,male,24.130,0,yes,northwest,15817.98570
1091,55,female,29.830,0,no,northeast,11286.53870
1267,24,male,31.065,0,yes,northeast,34254.05335


In [76]:
### Saving Train And Test Into Buckets
## We start with Train Data
import os

# NOTE(review): the SageMaker built-in XGBoost algorithm documents CSV training input as
# header-less with the label in the first column; this file is written with a header row
# and the raw column order (including string columns) — confirm this is intended.
train_data.to_csv('train.csv', index=False)

# S3 object keys always use '/', so build the key explicitly rather than with
# os.path.join, which would insert the local OS path separator.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the S3 prefix that now holds train.csv.
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [77]:
# Test Data Into Buckets
# NOTE(review): the SageMaker built-in XGBoost algorithm documents CSV input as header-less
# with the label in the first column; this file is written with a header row and the raw
# column order (including string columns) — confirm this is intended.
test_data.to_csv('test.csv', index=False)

# S3 object keys always use '/', so build the key explicitly rather than with
# os.path.join, which would insert the local OS path separator.
test_key = '{}/test/test.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_key).upload_file('test.csv')

# Input channel pointing at the S3 prefix that now holds test.csv.
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

Building Models: XGBoost - Inbuilt Algorithm

In [78]:
# Look up the URI of the built-in XGBoost container image for the session's region.
# get_image_uri() is the deprecated SDK v1 helper (it triggers the v2 migration warning);
# image_uris.retrieve() is its SDK v2 replacement.
# Specify the version depending on your preference.
from sagemaker import image_uris

container = image_uris.retrieve(framework='xgboost',
                                region=boto3.Session().region_name,
                                version='1.0-1')

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


In [79]:
# Hyperparameters handed to the estimator for the XGBoost training job.
# Values are kept exactly as before: every entry is a string except num_round, which is an int.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    num_round=50,
)

In [80]:
# Construct a SageMaker estimator that runs the XGBoost container image held in `container`.
# It uses the notebook's execution role, the hyperparameters defined above, and stores the
# trained model under output_path.
estimator = sagemaker.estimator.Estimator(image_uri=container,
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1,
                                          instance_type='ml.m5.2xlarge',
                                          volume_size=5,  # 5, not 3 as the previous comment stated (unit presumably GB — confirm)
                                          output_path=output_path,
                                          use_spot_instances=True,  # use Spot instances to reduce cost
                                          max_run=300,
                                          max_wait=600)

In [81]:
# Show the estimator object's repr to confirm it was constructed.
estimator

<sagemaker.estimator.Estimator at 0x7f29117a68e0>

In [82]:
# Launch the training job: the train input is passed as the 'train' channel and the
# test input as the 'validation' channel.
estimator.fit({'train': s3_input_train,'validation': s3_input_test})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-09-16-18-38-290


2024-02-09 16:18:38 Starting - Starting the training job......
2024-02-09 16:19:17 Starting - Preparing the instances for training......
2024-02-09 16:20:22 Downloading - Downloading input data...
2024-02-09 16:21:08 Training - Training image download completed. Training in progress....
2024-02-09 16:21:39 Uploading - Uploading generated training model
2024-02-09 16:21:39 Completed - Training job completed
[34m[2024-02-09 16:21:23.964 ip-10-0-107-38.eu-west-3.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV inp

In [83]:
# Deploy the trained estimator to an endpoint backed by one ml.m5.2xlarge instance and
# keep the returned predictor in xgb_predictor for the prediction cells below.
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m5.2xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-02-09-16-22-05-833
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-02-09-16-22-05-833
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-02-09-16-22-05-833


-----!

Prediction of the Test Data

In [84]:
# Display the test split produced by the earlier train/test split cell
# (the split and the test.csv export are not repeated here).
test_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
585,33,female,28.270,1,no,southeast,4779.60230
831,36,female,25.840,0,no,northwest,5266.36560
739,29,male,35.500,2,yes,southwest,44585.45587
696,53,female,32.300,2,no,northeast,29186.48236
174,24,female,33.345,0,no,northwest,2855.43755
...,...,...,...,...,...,...,...
315,52,male,33.250,0,no,northeast,9722.76950
857,25,male,24.130,0,yes,northwest,15817.98570
1091,55,female,29.830,0,no,northeast,11286.53870
1267,24,male,31.065,0,yes,northeast,34254.05335


In [None]:
import pandas as pd
import numpy as np
# Imported here so the cell also runs on a fresh kernel: the only other import of
# CSVSerializer sits in a later cell.
from sagemaker.serializers import CSVSerializer

# Encode the categorical columns into a NEW frame. test_data itself is left untouched,
# so this cell can be re-run (the previous version dropped 'sex'/'smoker' from test_data
# in place and raised KeyError on a second run).
# Resulting column order is unchanged: age, bmi, children, region, charges, female, yes.
test_encoded = (
    test_data
    .assign(female=(test_data['sex'] == 'female').astype(int),
            yes=(test_data['smoker'] == 'yes').astype(int))
    .drop(['sex', 'smoker'], axis=1)
)

# One-hot encode 'region'. dtype=int keeps the indicators as 0/1; recent pandas versions
# default to booleans, which would be written to the CSV as True/False.
region_dummies = pd.get_dummies(test_encoded['region'], prefix='region', dtype=int)
test_encoded = pd.concat([test_encoded, region_dummies], axis=1)

# Drop the original 'region' column now that it is encoded.
test_encoded = test_encoded.drop(['region'], axis=1)

# NOTE(review): the target column 'charges' is still part of the payload, and this column
# layout has to match whatever layout the model was trained on — confirm both.

# Convert the data to CSV text without a header row.
csv_data = test_encoded.to_csv(index=False, header=False)

# Send the CSV as a str, exactly like the single-row prediction cell does; the previous
# version encoded it to bytes before handing it to the CSV serializer.
payload = csv_data

# Configure the serializer so that data is sent in CSV format.
xgb_predictor.serializer = CSVSerializer()

# Send the data to the endpoint and decode the raw response.
predictions = xgb_predictor.predict(payload).decode('utf-8')

# Convert the predictions into a numpy array.
# NOTE(review): predictions[1:] discards the first character of the response — confirm the
# endpoint really returns a leading character that must be skipped.
predictions_array = np.fromstring(predictions[1:], sep=',')

# Show the shape of the prediction array.
print(predictions_array.shape)


In [None]:
import sagemaker
from sagemaker.serializers import CSVSerializer

# Make sure xgb_predictor points to your deployed model
# xgb_predictor = estimator.deploy(...)

# Configure the serializer so that data is sent in CSV format
xgb_predictor.serializer = CSVSerializer()

# Single input row in CSV format (raw form of the same row shown for reference)
#data_csv = "19,female,27.9,0,yes,southwest"

# NOTE(review): sex and smoker are numeric here but the last field is still the raw
# string 'southwest' — confirm the endpoint accepts this layout.
data_csv = "19,0,27.9,0,1,southwest"

# Send the data to the endpoint and receive the prediction
response = xgb_predictor.predict(data_csv)

# Display the response (prediction)
print(response)


In [58]:
# Display the test split again.
test_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
585,33,female,28.270,1,no,southeast,4779.60230
831,36,female,25.840,0,no,northwest,5266.36560
739,29,male,35.500,2,yes,southwest,44585.45587
696,53,female,32.300,2,no,northeast,29186.48236
174,24,female,33.345,0,no,northwest,2855.43755
...,...,...,...,...,...,...,...
315,52,male,33.250,0,no,northeast,9722.76950
857,25,male,24.130,0,yes,northwest,15817.98570
1091,55,female,29.830,0,no,northeast,11286.53870
1267,24,male,31.065,0,yes,northeast,34254.05335


Deleting The Endpoints

In [None]:
# SageMaker endpoints incur costs for as long as they are active. If you no longer need
# your endpoint, delete it to avoid unnecessary charges. You can do so through the
# AWS SageMaker console or with the AWS SDK:
# `endpoint_name` is the SDK v2 attribute; `Predictor.endpoint` is its deprecated v1 name.
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint_name)

# Remove every object stored in the bucket. The bucket itself is not deleted here.
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()