# Initialisation

In [1]:
import sys
import notebook
import re
import subprocess
import os
import time
import random
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier


In [2]:
packages = {
    "Python": sys,
    "Jupyter Notebook": "notebook",
    "NumPy": "numpy",
    "Pandas": "pandas",
    "Matplotlib": "matplotlib",
    "Seaborn": "seaborn",
    "MLflow" : "mlflow",
    "Scikit-Learn": "sklearn",
    "LightGMB": "lightgbm"
}

In [3]:
errorMsg = (
    "non disponible - vérifiez que le package existe, qu'il est correctement installé, importé "
    "et qu'il dispose d'un attribut '__version__'."
)
for name, module in packages.items():
    if isinstance(module, str):
        version = getattr(sys.modules.get(module, None), '__version__', errorMsg)
    else:
        if module is sys:
            version = sys.version
        else:
            version = getattr(module, '__version__', errorMsg)

    print(f"Version de {name} : {version}")

Version de Python : 3.12.8 (tags/v3.12.8:2dc476b, Dec  3 2024, 19:30:04) [MSC v.1942 64 bit (AMD64)]
Version de Jupyter Notebook : 7.3.2
Version de NumPy : 2.2.2
Version de Pandas : 2.2.3
Version de Matplotlib : 3.10.0
Version de Seaborn : 0.13.2
Version de MLflow : 2.19.0
Version de Scikit-Learn : 1.6.1
Version de LightGMB : 4.5.0


In [4]:
def read_csv_with_fallback(filepath, encodings=['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']):
    """
    Lecture d'un fichier CSV avec tentative sur plusieurs encodages.

    Parameters:
    - filepath (str): Chemin du fichier à lire.
    - encodings (list): Liste des encodages à tester.

    Returns:
    - pd.DataFrame: Le DataFrame pandas chargé.

    Raises:
    - UnicodeDecodeError: Si aucun des encodages ne fonctionne.
    """
    for encoding in encodings:
        try:
            print(f"Tentative de lecture avec l'encodage : {encoding}")
            return pd.read_csv(filepath, encoding=encoding)
        except UnicodeDecodeError:
            print(f"Erreur avec l'encodage : {encoding}")

    raise UnicodeDecodeError(f"Impossible de lire le fichier {filepath} avec les encodages {encodings}")

# Chemin vers le nouveau dossier "data"
data_path = "D://Pro//OpenClassrooms//Projet_7//data//"

# Liste des fichiers et noms de DataFrame correspondants
files = {
    "application_test": "application_test.csv",
    "application_train": "application_train.csv",
    "bureau": "bureau.csv",
    "bureau_balance": "bureau_balance.csv",
    "credit_card_balance": "credit_card_balance.csv",
    "homecredit_columns_description": "HomeCredit_columns_description.csv",
    "installments_payments": "installments_payments.csv",
    "pos_cash_balance": "POS_CASH_balance.csv",
    "previous_application": "previous_application.csv"
}

# Chargement des fichiers dans des DataFrames
loaded_data = {}
for name, file in files.items():
    filepath = data_path + file
    loaded_data[name] = read_csv_with_fallback(filepath)
    print(f"{name} chargé avec succès !\n")

# Affichage des informations pour chaque DataFrame
for name, df in loaded_data.items():
    print(f"{name.capitalize()} Info:")
    print(df.info(), "\n")

Tentative de lecture avec l'encodage : utf-8
application_test chargé avec succès !

Tentative de lecture avec l'encodage : utf-8
application_train chargé avec succès !

Tentative de lecture avec l'encodage : utf-8
bureau chargé avec succès !

Tentative de lecture avec l'encodage : utf-8
bureau_balance chargé avec succès !

Tentative de lecture avec l'encodage : utf-8
credit_card_balance chargé avec succès !

Tentative de lecture avec l'encodage : utf-8
Erreur avec l'encodage : utf-8
Tentative de lecture avec l'encodage : latin1
homecredit_columns_description chargé avec succès !

Tentative de lecture avec l'encodage : utf-8
installments_payments chargé avec succès !

Tentative de lecture avec l'encodage : utf-8
pos_cash_balance chargé avec succès !

Tentative de lecture avec l'encodage : utf-8
previous_application chargé avec succès !

Application_test Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Columns: 121 entries, SK_ID_CURR to AMT_REQ_CREDIT_BU

# Préparation de l'environement

## Test des modèles

In [5]:
# Données fictives pour le modèle Random Forest
X, y = make_classification(n_samples=1000, n_features=10, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [None]:
mlflow.set_tracking_uri("sqlite:///mlruns.db")  # Stockage local MLflow

# Expérimentation
experiment_name = "RandomForest_Model_Tracking"
mlflow.set_experiment(experiment_name)

print(f"Experiment '{experiment_name}' configured with tracking URI: sqlite:///mlruns.db")

# Lancement d'expérimentation pour Random Forest
with mlflow.start_run(run_name="RandomForest_Baseline_Experiment") as run:
    # Tags de l’expérimentation
    mlflow.set_tag("version_data", "v1.0")
    mlflow.set_tag("description", "Baseline Random Forest model")

    # 1. Logging des paramètres
    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("max_depth", None)
    mlflow.log_param("random_state", 123)

    # 2. Entraînement du modèle Random Forest
    rf_model = RandomForestClassifier(n_estimators=200, random_state=123)
    rf_model.fit(X_train, y_train)

    # 3. Évaluation
    auc_score_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
    mlflow.log_metric("AUC", auc_score_rf)

    # 4. Enregistrement du modèle
    registered_model_name_rf = "RandomForest_Model"
    mlflow.sklearn.log_model(
        rf_model,
        "model",
        registered_model_name=registered_model_name_rf
    )

    # Récupération et affichage de la dernière version enregistrée
    client = MlflowClient()
    latest_version_rf = client.get_latest_versions(registered_model_name_rf, stages=["None"])
    print(f"Latest version of '{registered_model_name_rf}': {latest_version_rf[0].version}")

    print(f"Run '{run.info.run_id}' completed and logged to experiment 'RandomForest_Experiment'.")

Experiment 'RandomForest_Model_Tracking' configured with tracking URI: sqlite:///mlruns.db


Registered model 'RandomForest_Model' already exists. Creating a new version of this model...
Created version '4' of model 'RandomForest_Model'.
  latest_version_rf = client.get_latest_versions(registered_model_name_rf, stages=["None"])


Latest version of 'RandomForest_Model': 4
Run 'a1c9690cf713419d9dd07ba6be75d6f0' completed and logged to experiment 'RandomForest_Experiment'.


In [7]:
# Donnée fictives pour le modèle LightGBM
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
mlflow.set_tracking_uri("sqlite:///mlruns.db")  # Stockage local MLflow

# Expérimentation
experiment_name = "LightGBM_Model_Tracking"
mlflow.set_experiment(experiment_name)

print(f"Experiment '{experiment_name}' configured with tracking URI: sqlite:///mlruns.db")

with mlflow.start_run(run_name="LightGBM_Baseline_Experiment") as run:
    # Tags pour l'expérimentation
    mlflow.set_tag("version_data", "v1.0")
    mlflow.set_tag("description", "Baseline model with default LightGBM parameters")

    # 1. Logging de paramètres basiques
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("random_state", 42)

    # 2. Entraînement de LightGBM (sans hyperparamètres particuliers)
    model = LGBMClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # 3. Évaluation
    auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    mlflow.log_metric("AUC", auc_score)

    registered_model_name_lgbm = "LightGBM_Model"
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name=registered_model_name_lgbm
    )

    # Récupérer et affichage de la dernière version enregistrée
    client = MlflowClient()
    latest_version_lgbm = client.get_latest_versions(registered_model_name_lgbm, stages=["None"])
    print(f"Latest version of '{registered_model_name_lgbm}': {latest_version_lgbm[0].version}")

    print(f"Run '{run.info.run_id}' completed and logged to experiment '{experiment_name}'.")

Experiment 'LightGBM_Model_Tracking' configured with tracking URI: sqlite:///mlruns.db




[LightGBM] [Info] Number of positive: 362, number of negative: 388
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000793 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2502
[LightGBM] [Info] Number of data points in the train set: 750, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.482667 -> initscore=-0.069361
[LightGBM] [Info] Start training from score -0.069361


Registered model 'LightGBM_Model' already exists. Creating a new version of this model...
Created version '5' of model 'LightGBM_Model'.
  latest_version_lgbm = client.get_latest_versions(registered_model_name_lgbm, stages=["None"])


Latest version of 'LightGBM_Model': 5
Run 'c4b62b1c110b4ab0aa49a319c0a25973' completed and logged to experiment 'LightGBM_Model_Tracking'.


## Génération du fichier requirement

In [9]:
# Chemin vers les requirements
raw_requirements_file = "D://Pro//OpenClassrooms//Projet_7//3_dossier_code_012025/raw_requirements.txt"
cleaned_requirements_file = "D://Pro//OpenClassrooms//Projet_7//3_dossier_code_012025/requirements.txt"

# Chemin absolu de pip
pip_path = "D://Pro//OpenClassrooms//Projet_7//.venv//Scripts//pip.exe"

# Repo
os.makedirs(os.path.dirname(raw_requirements_file), exist_ok=True)

# Générer le fichier raw_requirements.txt
try:
    with open(raw_requirements_file, "w") as f:
        result = subprocess.run([pip_path, "freeze"], stdout=f, text=True, check=True)
    print(f"Fichier raw_requirements.txt généré à {raw_requirements_file}")
except subprocess.CalledProcessError as e:
    print(f"Erreur lors de l'exécution de pip freeze : {e}")
    exit(1)
except Exception as e:
    print(f"Une erreur inattendue s'est produite : {e}")
    exit(1)

# Nettoyage du fichier requirements
try:
    with open(raw_requirements_file, "r") as raw_f, open(cleaned_requirements_file, "w") as cleaned_f:
        for line in raw_f:
            # Nettoyage des lignes
            if "@ file://" in line:
                line = re.sub(r"@ file://.*", "", line)
            cleaned_f.write(line)
    print(f"Fichier requirements.txt nettoyé généré à {cleaned_requirements_file}")
except Exception as e:
    print(f"Une erreur inattendue s'est produite lors du nettoyage : {e}")
    exit(1)

Fichier raw_requirements.txt généré à D://Pro//OpenClassrooms//Projet_7//3_dossier_code_012025/raw_requirements.txt
Fichier requirements.txt nettoyé généré à D://Pro//OpenClassrooms//Projet_7//3_dossier_code_012025/requirements.txt


In [10]:
# Procédure d'installation sur nouvelle machine
# cd /D D:\Pro\OpenClassrooms\Projet_7\3_dossier_code_012025
# Remote
# git remote -v
# git remote add origin https://github.com/davfgh/Projet7_OpenClassrooms.git
# git push -u origin main
# davfgh
# token à la place du password git