In [1]:
from zenml import step, pipeline, Model, get_step_context
from zenml.client import Client
from zenml.logger import get_logger
from uuid import UUID
from typing import Optional, List
from typing_extensions import Annotated


from zenml import pipeline, log_artifact_metadata, step, ArtifactConfig


[1;35mNumExpr defaulting to 8 threads.[0m


In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
import joblib
import datetime
from typing import Tuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, f1_score

In [3]:
@step
def data_load() -> Annotated[pd.DataFrame, "data"]:
    data = pd.read_csv("data/train.csv")
    return data

@step
def data_val_load() -> Annotated[pd.DataFrame, "data_val"]:
    data_val = pd.read_csv('data/val.csv')
    return data_val
    

In [4]:
# Charger vos données depuis un fichier CSV, par exemple
@step
def data_clean(data: pd.DataFrame = None) -> Tuple[Annotated[pd.DataFrame, "features"], Annotated[pd.Series, "target"]]:

    correspondance = {'très bonne':4, 'insuffisante':1, 'bonne':3, 'moyenne':2, 0:0, 'appartement':1, 'maison':2, 'immeuble':3, 'A':6, 'B':5,
                    'C':4, 'D':3, 'E':2, 'F':1, 'G':0, 'Électricité':1, 'Bois – Bûches':2, 'GPL':3, 'Gaz naturel':4,'Fioul domestique':5,
                    'Réseau de Chauffage urbain':6,'Bois – Granulés (pellets) ou briquettes':7,"Électricité d'origine renouvelable utilisée dans le bâtiment":8,
                    'Bois – Plaquettes d’industrie':9, 'Bois – Plaquettes forestières':10,'Charbon':11, 'Propane':12, 'Butane':13, 'Réseau de Froid Urbain':14}

    colonnes_a_transformer = ['Qualité_isolation_plancher_bas', 'Qualité_isolation_enveloppe', 'Qualité_isolation_menuiseries',
                             'Qualité_isolation_murs', 'Qualité_isolation_plancher_haut_comble_aménagé','Qualité_isolation_plancher_haut_comble_perdu',
                             'Qualité_isolation_plancher_haut_toit_terrase', 'Type_bâtiment', 'Etiquette_GES', 'Type_énergie_n°3', 'Etiquette_DPE']

    data = data.fillna(0)

    data[colonnes_a_transformer] = data[colonnes_a_transformer].replace(correspondance)

    data = data.drop(columns= ['Unnamed: 0','N°DPE', 'Configuration_installation_chauffage_n°2', 'Type_générateur_froid', 'Type_émetteur_installation_chauffage_n°2',
                              'Classe_altitude', 'Code_postal_(brut)', 'Type_générateur_n°1_installation_n°2', 'Nom__commune_(Brut)',
                              "Cage_d'escalier", 'Code_INSEE_(BAN)', 'Description_générateur_chauffage_n°2_installation_n°2', 'N°_département_(BAN)',
                              'Surface_totale_capteurs_photovoltaïque', 'Facteur_couverture_solaire_saisi', 'Facteur_couverture_solaire'])

    # Séparer les caractéristiques (X) de la cible (y)
    X = data.drop(columns=['Etiquette_DPE'])  # Supprimez la colonne cible
    y = data['Etiquette_DPE']
    return X, y

In [5]:
@step
def data_prep(X: pd.DataFrame = None, y: pd.Series = None) -> Tuple[Annotated[pd.DataFrame, "features_train"], Annotated[pd.DataFrame, "features_test"],
                                                                   Annotated[pd.Series, "target_train"], Annotated[pd.Series, "target_test"]]:
    # Séparez les données en ensembles d'apprentissage et de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
    

In [10]:
@step
def train_model(X_test: pd.DataFrame = None, X_train: pd.DataFrame = None, y_train: pd.Series = None, y_test: pd.Series = None) -> Annotated[xgb.XGBClassifier,  ArtifactConfig(name="my_model", tags=["XGBoost", "trained"])]:
    # Créez un modèle XGBoost
    model = xgb.XGBClassifier(objective='multi:softmax', num_class=7)

    # Entraînez le modèle sur l'ensemble d'apprentissage
    model.fit(X_train, y_train)
    
    #Prédictions sur l'ensemble de test
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    # Mesurez la performance du modèle
    accuracy = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    conf_matrix = confusion_matrix(y_test, y_pred)
    conf_matrix = conf_matrix.tolist()
    f1 = f1_score(y_test, y_pred, average='weighted')

    log_artifact_metadata(
        # Artifact name can be omitted if step returns only one output
        artifact_name="my_model",
        # Metadata should be a dictionary of JSON-serializable values
        metadata={"metrics" : {"accuracy": float(accuracy), "roc": float(roc), "conf_matrix": conf_matrix, "f1": float(f1)}}
        # A dictionary of dictionaries can also be passed to group metadata
        #  in the dashboard
        # metadata = {"metrics": {"accuracy": accuracy}}
    )
    
    joblib.dump(model, "XGBoost_trained.pkl")

    return model

In [7]:
# @step
# def test_model(X_test: pd.DataFrame = None, y_test: pd.Series = None, 
#                model: xgb.XGBClassifier = None) -> Tuple[Annotated[np.float64, "précision"],
#                                                          Annotated[np.float64, "F1-score"], Annotated[np.float64, "AUC-ROC"],
#                                                         Annotated[np.ndarray, "conf_matrice"]]:
#     #Prédictions sur l'ensemble de test
#     y_pred = model.predict(X_test)
#     y_pred_proba = model.predict_proba(X_test)
#     # Mesurez la performance du modèle
#     accuracy = accuracy_score(y_test, y_pred)
#     roc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
#     conf_matrix = confusion_matrix(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred, average='weighted')
#     return accuracy, roc, f1, conf_matrix

In [8]:
@pipeline
def dpe_xgboost_train():
    data = data_load()
    X, y = data_clean(data)
    X_train, X_test, y_train, y_test = data_prep(X, y)
    model = train_model(X_test, X_train, y_train, y_test)
#     test_model(X_test, y_test, model)
    

In [11]:
dpe_xgboost_train()

[1;35mInitiating a new run for the pipeline: [0m[1;36mdpe_xgboost_train[1;35m.[0m
[1;35mRegistered new version: [0m[1;36m(version 2)[1;35m.[0m
[1;35mExecuting a new run.[0m
[1;35mUsing user: [0m[1;36mdefault[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35mUsing cached version of [0m[1;36mdata_load[1;35m.[0m
[1;35mStep [0m[1;36mdata_load[1;35m has started.[0m
[1;35mUsing cached version of [0m[1;36mdata_clean[1;35m.[0m
[1;35mStep [0m[1;36mdata_clean[1;35m has started.[0m
[1;35mUsing cached version of [0m[1;36mdata_prep[1;35m.[0m
[1;35mStep [0m[1;36mdata_prep[1;35m has started.[0m
[1;35mStep [0m[1;36mtrain_model[1;35m has started.[0m
[1;35mStep [0m[1;36mtrain_model[1;35m has finished in [0m[1;36m2m14s[1;35m.[0m
[1;35mPipeline run has finished in [0m[1;36m2m15s[1;35m.[0m
[1;35mYou can visualize your pi

PipelineRunResponse(body=PipelineRunResponseBody(created=datetime.datetime(2024, 3, 22, 13, 24, 56, 886404), updated=datetime.datetime(2024, 3, 22, 13, 24, 56, 886404), user=UserResponse(body=UserResponseBody(created=datetime.datetime(2024, 3, 13, 11, 20, 21, 474388), updated=datetime.datetime(2024, 3, 13, 11, 20, 21, 474388), active=True, activation_token=None, full_name='', email_opted_in=None, is_service_account=False), metadata=None, resources=None, id=UUID('d6b6f061-b8c6-462b-b6ad-7018f003f258'), permission_denied=False, name='default'), status=<ExecutionStatus.INITIALIZING: 'initializing'>, stack=StackResponse(body=StackResponseBody(created=datetime.datetime(2024, 3, 13, 11, 20, 20, 712936), updated=datetime.datetime(2024, 3, 13, 11, 20, 20, 712936), user=None), metadata=None, resources=None, id=UUID('12159a6b-75ff-4da9-abaf-25808b794a76'), permission_denied=False, name='default'), pipeline=PipelineResponse(body=PipelineResponseBody(created=datetime.datetime(2024, 3, 22, 13, 24, 

In [12]:

@step
def data_clean_pred(data_val: pd.DataFrame = None) -> Tuple[Annotated[pd.DataFrame, "data_val"], Annotated[pd.Series, "data_val_Id"]]:


    colonnes_a_transformer_val = ['Qualité_isolation_plancher_bas', 'Qualité_isolation_enveloppe', 'Qualité_isolation_menuiseries',
                             'Qualité_isolation_murs', 'Qualité_isolation_plancher_haut_comble_aménagé','Qualité_isolation_plancher_haut_comble_perdu',
                             'Qualité_isolation_plancher_haut_toit_terrase', 'Type_bâtiment', 'Etiquette_GES', 'Type_énergie_n°3']


    data_val = data_val.fillna(0)

    data_val_Id = data_val['N°DPE']



    data_val[colonnes_a_transformer_val] = data_val[colonnes_a_transformer_val].replace(correspondance)

    data_val = data_val.drop(columns= ['N°DPE', 'Facteur_couverture_solaire_saisi', 'Surface_totale_capteurs_photovoltaïque', 'Facteur_couverture_solaire', 'Configuration_installation_chauffage_n°2', 'Type_générateur_froid', 'Type_émetteur_installation_chauffage_n°2',
                              'Classe_altitude', 'Code_postal_(brut)', 'Type_générateur_n°1_installation_n°2', 'Nom__commune_(Brut)',
                              "Cage_d'escalier", 'Code_INSEE_(BAN)', 'Description_générateur_chauffage_n°2_installation_n°2', 'N°_département_(BAN)'])

    return data_val, data_val_Id


In [13]:

@step
def predict(data_val: pd.DataFrame = None) -> Annotated[np.ndarray, "data_val"]:
    model = joblib.load('XGBoost_trained.pkl')
    pred = model.predict(data_val)
    return pred

In [14]:
@step
def pred_save(pred: np.ndarray = None, data_val_Id: pd.Series = None) -> Annotated[pd.DataFrame, "data_pred"]:
    data_pred = pd.DataFrame()
    data_pred['N°DPE'] = data_val_Id
    data_pred['Etiquette_DPE'] = pred

    correspondance_val = {0:'G', 1:'F', 2:'E', 3:'D', 4:'C', 5:'B', 6:'A'}

    data_pred['Etiquette_DPE'] = data_pred['Etiquette_DPE'].replace(correspondance_val)
    data_pred.to_csv('Dpe_val.csv', index=False)
    
    return data_pred

In [15]:
@pipeline
def dpe_xgboost_pred():
    data_val = data_val_load()
    data_val, data_val_Id = data_clean_pred(data_val)
    pred = predict(data_val) 
    data_pred = pred_save(pred, data_val_Id)

In [16]:
dpe_xgboost_pred()

[1;35mInitiating a new run for the pipeline: [0m[1;36mdpe_xgboost_pred[1;35m.[0m
[1;35mRegistered new version: [0m[1;36m(version 2)[1;35m.[0m
[1;35mExecuting a new run.[0m
[1;35mUsing user: [0m[1;36mdefault[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35mUsing cached version of [0m[1;36mdata_val_load[1;35m.[0m
[1;35mStep [0m[1;36mdata_val_load[1;35m has started.[0m
[1;35mUsing cached version of [0m[1;36mdata_clean_pred[1;35m.[0m
[1;35mStep [0m[1;36mdata_clean_pred[1;35m has started.[0m
[1;35mUsing cached version of [0m[1;36mpredict[1;35m.[0m
[1;35mStep [0m[1;36mpredict[1;35m has started.[0m
[1;35mUsing cached version of [0m[1;36mpred_save[1;35m.[0m
[1;35mStep [0m[1;36mpred_save[1;35m has started.[0m
[1;35mPipeline run has finished in [0m[1;36m1.235s[1;35m.[0m
[1;35mYou can visualize your pipeline runs

PipelineRunResponse(body=PipelineRunResponseBody(created=datetime.datetime(2024, 3, 22, 13, 28, 11, 998710), updated=datetime.datetime(2024, 3, 22, 13, 28, 11, 998710), user=UserResponse(body=UserResponseBody(created=datetime.datetime(2024, 3, 13, 11, 20, 21, 474388), updated=datetime.datetime(2024, 3, 13, 11, 20, 21, 474388), active=True, activation_token=None, full_name='', email_opted_in=None, is_service_account=False), metadata=None, resources=None, id=UUID('d6b6f061-b8c6-462b-b6ad-7018f003f258'), permission_denied=False, name='default'), status=<ExecutionStatus.INITIALIZING: 'initializing'>, stack=StackResponse(body=StackResponseBody(created=datetime.datetime(2024, 3, 13, 11, 20, 20, 712936), updated=datetime.datetime(2024, 3, 13, 11, 20, 20, 712936), user=None), metadata=None, resources=None, id=UUID('12159a6b-75ff-4da9-abaf-25808b794a76'), permission_denied=False, name='default'), pipeline=PipelineResponse(body=PipelineResponseBody(created=datetime.datetime(2024, 3, 22, 13, 28, 

In [None]:
@pipeline
def Full_train_and_predict():
    step_1 = dpe_xgboost_train()
    step_2 = dpe_xgboost_pred()
    

In [95]:
Full_train_and_predict()

[1;35mInitiating a new run for the pipeline: [0m[1;36mFull_train_and_predict[1;35m.[0m
[1;35mRegistered new version: [0m[1;36m(version 3)[1;35m.[0m
[1;35mExecuting a new run.[0m
[1;35mUsing user: [0m[1;36mdefault[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35mStep [0m[1;36mdata_load[1;35m has started.[0m
[1;35mStep [0m[1;36mdata_load[1;35m has finished in [0m[1;36m1m10s[1;35m.[0m
[1;35mStep [0m[1;36mdata_val_load[1;35m has started.[0m
[1;35mStep [0m[1;36mdata_val_load[1;35m has finished in [0m[1;36m7.175s[1;35m.[0m
[1;35mStep [0m[1;36mdata_clean[1;35m has started.[0m
[33mFailed to extract metadata for output artifact 'target': 'float' object has no attribute 'item'[0m
[1;35mStep [0m[1;36mdata_clean[1;35m has finished in [0m[1;36m1m25s[1;35m.[0m
[1;35mStep [0m[1;36mdata_clean_pred[1;35m has started.[0

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;35mStep [0m[1;36mdata_clean_pred[1;35m has finished in [0m[1;36m5m20s[1;35m.[0m
[1;35mStep [0m[1;36mdata_prep[1;35m has started.[0m
[33mFailed to extract metadata for output artifact 'target_train': 'float' object has no attribute 'item'[0m
[33mFailed to extract metadata for output artifact 'target_test': 'float' object has no attribute 'item'[0m
[1;35mStep [0m[1;36mdata_prep[1;35m has finished in [0m[1;36m38.948s[1;35m.[0m
[1;35mStep [0m[1;36mpredict[1;35m has started.[0m
[1;35mStep [0m[1;36mpredict[1;35m has finished in [0m[1;36m14.548s[1;35m.[0m
[1;35mStep [0m[1;36mpred_save[1;35m has started.[0m
[1;35mStep [0m[1;36mpred_save[1;35m has finished in [0m[1;36m5.511s[1;35m.[0m
[1;35mStep [0m[1;36mtrain_model[1;35m has started.[0m
[1;35mStep [0m[1;36mtrain_model[1;35m has finished in [0m[1;36m2m57s[1;35m.[0m
[1;35mPipeline run has finished in [0m[1;36m12m5s[1;35m.[0m
[1;35mYou can visualize your pipeline runs in the

PipelineRunResponse(body=PipelineRunResponseBody(created=datetime.datetime(2024, 3, 22, 11, 32, 12, 686396), updated=datetime.datetime(2024, 3, 22, 11, 32, 12, 686396), user=UserResponse(body=UserResponseBody(created=datetime.datetime(2024, 3, 13, 11, 20, 21, 474388), updated=datetime.datetime(2024, 3, 13, 11, 20, 21, 474388), active=True, activation_token=None, full_name='', email_opted_in=None, is_service_account=False), metadata=None, resources=None, id=UUID('d6b6f061-b8c6-462b-b6ad-7018f003f258'), permission_denied=False, name='default'), status=<ExecutionStatus.INITIALIZING: 'initializing'>, stack=StackResponse(body=StackResponseBody(created=datetime.datetime(2024, 3, 13, 11, 20, 20, 712936), updated=datetime.datetime(2024, 3, 13, 11, 20, 20, 712936), user=None), metadata=None, resources=None, id=UUID('12159a6b-75ff-4da9-abaf-25808b794a76'), permission_denied=False, name='default'), pipeline=PipelineResponse(body=PipelineResponseBody(created=datetime.datetime(2024, 3, 22, 11, 32, 