In [38]:
import os

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# S3 bucket and object key under which the processed dataset is stored
bucket_name = 'test-cloud-gackou'
processed_key = 'processed/titanic.csv'

# Create the S3 client and resolve the SageMaker execution role
s3 = boto3.client('s3')
role = get_execution_role()

# Fetch the processed CSV from S3 to local disk, then load it into a DataFrame
s3.download_file(Bucket=bucket_name, Key=processed_key, Filename='titanic.csv')
df = pd.read_csv('titanic.csv')

# The first rows are displayed in the following cell to verify the load


In [39]:
 # Preview the first five rows of the loaded frame
 df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,1,0,1
1,1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,1,26.0,0,0,7.925,1,0,1
3,1,1,1,35.0,1,0,53.1,0,0,1
4,0,3,0,35.0,0,0,8.05,1,0,1


In [40]:
# Separate the features from the target: 'Survived' is the target column,
# every other column is kept as a feature
X = df.drop(columns=['Survived'])
y = df['Survived']

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Display the shapes of the four resulting splits
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((712, 9), (179, 9), (712,), (179,))

In [41]:
# Count how many rows fall into each class of the target column 'Survived'
df.loc[:, 'Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [42]:
# Show the dtype of every column in the loaded frame
df.dtypes

Survived        int64
Pclass          int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Cabin           int64
Embarked_Q      int64
Embarked_S      int64
dtype: object

In [43]:
# Persist the train/test splits locally and upload them to S3 for training.
#
# SageMaker's built-in XGBoost algorithm expects CSV input with the label in
# the FIRST column and WITHOUT a header row. The files uploaded to S3 are
# therefore written label-first and headerless; otherwise the first feature
# column would be used as the target.
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)

# Local copies keep their header row so they can be reloaded by column name
# (e.g. pd.read_csv('test_data.csv') followed by a lookup of 'Survived').
train_file = 'train_data.csv'
test_file = 'test_data.csv'
train_data.to_csv(train_file, index=False)
test_data.to_csv(test_file, index=False)

# Headerless, label-first copies: these are the ones the training job reads
train_upload_file = 'train_data_xgboost.csv'
test_upload_file = 'test_data_xgboost.csv'
train_data.to_csv(train_upload_file, index=False, header=False)
test_data.to_csv(test_upload_file, index=False, header=False)

# Upload to the same S3 keys as before so downstream S3 paths stay valid
s3.upload_file(train_upload_file, bucket_name, 'processed/train_data.csv')
s3.upload_file(test_upload_file, bucket_name, 'processed/test_data.csv')


In [44]:
# Preview the first five rows of the full frame
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,1,0,1
1,1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,1,26.0,0,0,7.925,1,0,1
3,1,1,1,35.0,1,0,53.1,0,0,1
4,0,3,0,35.0,0,0,8.05,1,0,1


In [45]:
# Inspect the target vector: print the series, then display its class counts.
# A bare `type(y)` used to precede these lines; its value was discarded
# (only the last expression of a cell is displayed), so it was dead code.
print(y)
y.value_counts()

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


0    549
1    342
Name: Survived, dtype: int64

In [36]:
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

# Use a single SageMaker session for both the image lookup and the estimator
# so that the container image is always resolved in the region the training
# job actually runs in (a hard-coded region can silently mismatch).
sagemaker_session = sagemaker.Session()

# Resolve the built-in XGBoost container image for the session's region
xgboost_image_uri = sagemaker.image_uris.retrieve(
    "xgboost",
    region=sagemaker_session.boto_region_name,
    version="1.2-1",
)

# S3 locations of the training and validation CSV files
train_data_s3_path = f"s3://{bucket_name}/processed/train_data.csv"
test_data_s3_path = f"s3://{bucket_name}/processed/test_data.csv"

# Configure the XGBoost estimator: one ml.m5.xlarge instance, model artifacts
# written under the bucket's 'output' prefix
xgb_estimator = Estimator(
    image_uri=xgboost_image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket_name}/output',
    sagemaker_session=sagemaker_session
)

# Training hyperparameters for XGBoost
xgb_estimator.set_hyperparameters(
    objective='binary:logistic',  # binary classification
    num_round=100,  # number of boosting rounds (can be tuned)
    max_depth=5,
    eta=0.1,
    subsample=0.8,  # row subsampling per tree
    colsample_bytree=0.8  # column subsampling per tree
)

# Training and validation channels.
# NOTE: the built-in XGBoost algorithm expects CSV input without a header row
# and with the label in the first column - make sure the uploaded files follow
# that layout.
train_input = TrainingInput(train_data_s3_path, content_type='csv')
test_input = TrainingInput(test_data_s3_path, content_type='csv')

# Launch the training job
xgb_estimator.fit({'train': train_input, 'validation': test_input})


2024-12-28 00:16:36 Starting - Starting the training job...
2024-12-28 00:17:04 Starting - Preparing the instances for training...
2024-12-28 00:17:25 Downloading - Downloading input data...
2024-12-28 00:17:45 Downloading - Downloading the training image...
2024-12-28 00:18:30 Training - Training image download completed. Training in progress..[34m[2024-12-28 00:18:39.409 ip-10-0-126-40.eu-west-3.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input i

In [None]:
# Deploy the trained model to a real-time endpoint.
# A CSV serializer is set explicitly: without it the feature matrix passed to
# predict() is not converted to a payload the XGBoost container can read.
# NOTE: the endpoint keeps running (and billing) until it is deleted; call
# xgb_predictor.delete_endpoint() once it is no longer needed.
xgb_predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=CSVSerializer()
)

# Reload the held-out test split and separate features from labels
test_data = pd.read_csv('test_data.csv')
X_test = test_data.drop(columns=['Survived'])
y_test = test_data['Survived'].values

# Score the test rows; the endpoint answers with raw text holding one
# probability per row, separated by commas and/or newlines
raw_response = xgb_predictor.predict(X_test.values)
if isinstance(raw_response, bytes):
    response_text = raw_response.decode('utf-8')
else:
    response_text = str(raw_response)

probabilities = np.array(
    [float(value) for value in response_text.replace('\n', ',').split(',') if value.strip()]
)

# 'binary:logistic' yields probabilities: threshold them to get class labels
predictions = (probabilities > 0.5).astype(int)

# Guard against a malformed response before comparing element-wise
assert len(predictions) == len(y_test), (
    f"expected {len(y_test)} predictions, got {len(predictions)}"
)

# Compute the accuracy
accuracy = np.mean(predictions == y_test)
print(f"Accuracy: {accuracy}")


In [None]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter

# Search space explored by the tuning job
hyperparameter_ranges = dict(
    max_depth=IntegerParameter(3, 10),
    eta=ContinuousParameter(0.01, 0.2),
    num_round=IntegerParameter(50, 200),
)

# Tuner built around the existing XGBoost estimator, optimizing the
# 'validation:auc' metric with at most 20 jobs, 3 of them in parallel
tuner = HyperparameterTuner(
    estimator=xgb_estimator,
    objective_metric_name='validation:auc',
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=3,
)

# Launch the tuning job on the same train/validation channels
tuner.fit(inputs={'train': train_input, 'validation': test_input})
