In [228]:
import sagemaker 
import boto3 
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session

In [229]:
#bucket_name= 'sagemaker-insurance-predict-bill' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET 
bucket_name = 'bucket-david-predict-insurance' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET 
my_region = boto3.session.Session().region_name # set the region of the instance 
print(my_region)
print(bucket_name)

eu-west-3
bucket-david-predict-insurance


In [230]:
import boto3

# Votre nom de bucket et la région souhaitée
#bucket_name = 'bucket-david-predict-insurance'
my_region = 'eu-west-3'

s3 = boto3.resource('s3')
try:
    if my_region == 'eu-west-3':
        s3.create_bucket(Bucket=bucket_name,
                         CreateBucketConfiguration={
                             'LocationConstraint': my_region
                         })
        print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ', e)

S3 bucket created successfully


In [231]:
# set an output path where the trained model will be saved
prefix = 'xgboost-as-a-built-in-algo' 
output_path ='s3://{}/{}/output'.format(bucket_name, prefix) 
print(output_path)

s3://bucket-david-predict-insurance/xgboost-as-a-built-in-algo/output


Downloading The Dataset And Storing in S3

In [232]:
import pandas as pd 
import urllib 
import os
try: 
    #urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv") 
    from pycaret.datasets import get_data
    data = get_data('insurance')
    print('Success: downloaded data.csv.') 
    print(os.getcwd())
except Exception as e: 
    print('Data load error: ',e) 
try: 
    model_data = pd.read_csv('insurance.csv') 
    print('Success: Data loaded into dataframe.') 
except Exception as e: 
    print('Data load error: ',e)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Success: downloaded data.csv.
/home/sagemaker-user
Success: Data loaded into dataframe.


In [233]:
!ls

data.csv		 logs.log	 test.csv
insurance-predict.ipynb  model.tar.gz	 train.csv
insurance.csv		 test-sgm.ipynb  xgboost-model


In [234]:
mdata=model_data.copy()
mdata

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [235]:
mdata['sex'] = mdata['sex'].map({'female': 0, 'male': 1})
mdata['smoker'] = mdata['smoker'].map({'no': 0, 'yes': 1})
mdata['region'] = mdata['region'].map({'southeast': 0, 'northwest': 1, 'southwest': 2, 'northeast': 3})
mdata

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,2,16884.92400
1,18,1,33.770,1,0,0,1725.55230
2,28,1,33.000,3,0,0,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,3,2205.98080
1335,18,0,36.850,0,0,0,1629.83350
1336,21,0,25.800,0,0,2,2007.94500


In [236]:
### Train Test split 
import numpy as np 
train_data, test_data = np.split(mdata.sample(frac=1, random_state=1729), [int(0.7 * len(mdata))]) 
print(train_data.shape, test_data.shape)
# Transformer les données avant d'utiliser un modèle

test_data

(936, 7) (402, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
585,33,0,28.270,1,0,0,4779.60230
831,36,0,25.840,0,0,1,5266.36560
739,29,1,35.500,2,1,2,44585.45587
696,53,0,32.300,2,0,3,29186.48236
174,24,0,33.345,0,0,1,2855.43755
...,...,...,...,...,...,...,...
315,52,1,33.250,0,0,3,9722.76950
857,25,1,24.130,0,1,1,15817.98570
1091,55,0,29.830,0,0,3,11286.53870
1267,24,1,31.065,0,1,3,34254.05335


In [237]:
### Saving Train And Test Into Buckets 
## We start with Train Data
import os 
train_data.to_csv('train.csv', index=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv') 
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [238]:
# Test Data Into Buckets 
test_data.to_csv('test.csv', index=False) 
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv') 
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

Building Models Xgboot- Inbuilt Algorithm

In [239]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container. 
# specify the repo_version depending on your preference. 
container = get_image_uri(boto3.Session().region_name, 'xgboost', repo_version='1.7-1')

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [240]:
# initialize hyperparameters 
hyperparameters = { "max_depth":"5",
                   "eta":"0.2",
                   "gamma":"4",
                   "min_child_weight":"6",
                   "subsample":"0.7",
                   "objective":"reg:squarederror",
                   "num_round":50 }

In [250]:
# construct a SageMaker estimator that calls the xgboost-container 
estimator = sagemaker.estimator.Estimator(image_uri=container,
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1,
                                          instance_type='ml.m5.2xlarge',
                                          #volume_size=3,  # 3 GB
                                          output_path=output_path,
                                          #framework_version="1.7-1",  # Note: framework_version is mandatory
                                          use_spot_instances=True,  # Utilisation d'instances Spot pour réduire le coût
                                          max_run=300,
                                          max_wait=600
                                         )



In [251]:
estimator

<sagemaker.estimator.Estimator at 0x7f2911538700>

In [252]:
estimator.fit({'train': s3_input_train,'validation': s3_input_test})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-11-08-35-39-783


2024-02-11 08:35:39 Starting - Starting the training job...
2024-02-11 08:35:57 Starting - Preparing the instances for training.........
2024-02-11 08:37:28 Downloading - Downloading input data...
2024-02-11 08:37:53 Downloading - Downloading the training image...
2024-02-11 08:38:39 Training - Training image download completed. Training in progress....
2024-02-11 08:39:09 Uploading - Uploading generated training model[34m[2024-02-11 08:39:00.754 ip-10-0-185-86.eu-west-3.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-02-11 08:39:00.777 ip-10-0-185-86.eu-west-3.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-02-11:08:39:01:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-02-11:08:39:01:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-02-11:08:39:01:INFO] No GPUs detected (normal if no gpus

In [253]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m5.2xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-02-11-08-44-08-065
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-02-11-08-44-08-065
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-02-11-08-44-08-065


-----!

In [254]:
xgb_predictor

<sagemaker.base_predictor.Predictor at 0x7f28f22b4d00>

Prediction of the Test Data

In [255]:
#train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))]) 
test_data
#train_data
#test_data.to_csv('test.csv', index=False) 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
585,33,0,28.270,1,0,0,4779.60230
831,36,0,25.840,0,0,1,5266.36560
739,29,1,35.500,2,1,2,44585.45587
696,53,0,32.300,2,0,3,29186.48236
174,24,0,33.345,0,0,1,2855.43755
...,...,...,...,...,...,...,...
315,52,1,33.250,0,0,3,9722.76950
857,25,1,24.130,0,1,1,15817.98570
1091,55,0,29.830,0,0,3,11286.53870
1267,24,1,31.065,0,1,3,34254.05335


In [258]:
import sagemaker
from sagemaker.serializers import CSVSerializer

# Configurez votre endpoint si nécessaire
endpoint_name = "sagemaker-xgboost-2024-02-11-08-44-08-065"  # Remplacez par le nom de votre endpoint
sagemaker_session = sagemaker.Session()

# Créez un objet Predictor pour votre modèle
xgb_predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=CSVSerializer(),  # Spécifiez le serializer pour le type de contenu CSV
)

# Nouvelles données pour la prédiction, déjà converties si nécessaire
#data_csv = "19,female,27.9,0,yes,southwest"
data_csv = "24,0,27.9,1,1,3"  # Assurez-vous que cette chaîne est correctement formatée

# Utilisez l'endpoint pour faire une prédiction avec le bon ContentType
response = xgb_predictor.predict(data_csv)

# Affichez la réponse de la prédiction
print(response)


b'11.769454002380371\n'


In [165]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.


In [182]:
# Conversion des variables catégorielles
dt=test_data.copy()
dt['sex'] = dt['sex'].map({'female': 0, 'male': 1})
dt['smoker'] = dt['smoker'].map({'no': 0, 'yes': 1})
dt['region'] = dt['region'].map({'southeast': 0, 'northwest': 1, 'southwest': 2, 'northeast': 3})

# Supprimer la colonne 'charges'
dt = dt.drop(columns=['charges'])
print(dt)

      age  sex     bmi  children  smoker  region
585    33  NaN  28.270         1     NaN     NaN
831    36  NaN  25.840         0     NaN     NaN
739    29  NaN  35.500         2     NaN     NaN
696    53  NaN  32.300         2     NaN     NaN
174    24  NaN  33.345         0     NaN     NaN
...   ...  ...     ...       ...     ...     ...
315    52  NaN  33.250         0     NaN     NaN
857    25  NaN  24.130         0     NaN     NaN
1091   55  NaN  29.830         0     NaN     NaN
1267   24  NaN  31.065         0     NaN     NaN
1126   55  NaN  29.900         0     NaN     NaN

[402 rows x 6 columns]


In [261]:
import pickle as pkl
import tarfile
import xgboost as xgb

with tarfile.open('model.tar.gz', 'r:gz') as t:
    t.list()  # Affiche la liste des fichiers contenus dans l'archive tar

# Chemin vers le fichier modèle XGBoost extrait
model_file_path = 'xgboost-model'

# Chargement du modèle
with open(model_file_path, 'rb') as file:
    model = pkl.load(file)

# Vérification du modèle chargé
print(model)

# Supprimer la colonne 'charges'
dt=test_data.copy()
dt = dt.drop(columns=['charges'])
#dt = dt.drop(columns=['region'])
print(dt)

# Assurez-vous d'utiliser le DataFrame encodé pour la prédiction
dtest = xgb.DMatrix(dt)
pred = model.predict(dtest)

print(pred)


-rw-r--r-- root/root      71397 2024-02-11 07:40:24 xgboost-model 
<xgboost.core.Booster object at 0x7f28eb6ad2e0>
      age  sex     bmi  children  smoker  region
585    33    0  28.270         1       0       0
831    36    0  25.840         0       0       1
739    29    1  35.500         2       1       2
696    53    0  32.300         2       0       3
174    24    0  33.345         0       0       1
...   ...  ...     ...       ...     ...     ...
315    52    1  33.250         0       0       3
857    25    1  24.130         0       1       1
1091   55    0  29.830         0       0       3
1267   24    1  31.065         0       1       3
1126   55    1  29.900         0       0       2

[402 rows x 6 columns]
[ 8.755165  9.540667  9.720635  8.755165  9.540667  9.540667  8.755165
  9.540667  9.720635 10.506137  8.755165  9.540667  9.540667 10.506137
  9.720635  8.755165 10.506137  9.540667  9.540667  8.755165  8.755165
  8.755165  9.540667  9.720635  9.720635  8.755165  9.720635

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
585,33,female,28.270,1,no,southeast,4779.60230
831,36,female,25.840,0,no,northwest,5266.36560
739,29,male,35.500,2,yes,southwest,44585.45587
696,53,female,32.300,2,no,northeast,29186.48236
174,24,female,33.345,0,no,northwest,2855.43755
...,...,...,...,...,...,...,...
315,52,male,33.250,0,no,northeast,9722.76950
857,25,male,24.130,0,yes,northwest,15817.98570
1091,55,female,29.830,0,no,northeast,11286.53870
1267,24,male,31.065,0,yes,northeast,34254.05335


Deleting The Endpoints

In [None]:
#Les points de terminaison SageMaker engendrent des coûts tant qu'ils sont actifs. Si vous n'avez plus 
#besoin de votre point de terminaison, pensez à le supprimer pour éviter des frais inutiles. 
#Vous pouvez le faire via la console AWS SageMaker ou en utilisant le SDK AWS :
#bucket_name = 'bucket-david-predict-insurance'
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint) 
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name) 
bucket_to_delete.objects.all().delete()