# Create and run model

## Load libraries

In [52]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix, f1_score

import pandas as pd
import numpy as np

## Set global parameters

In [56]:
data_path = "../data/04_feature/features.csv"
artifact_path = "../data/06_models/"
test_size = 0.2
random_state = 18
cost_fn = 10
cost_fp = 1

## Load data

Pourquoi j'ai plein de NaN dans la target?

In [85]:
# Load data
raw_data = pd.read_csv(data_path)

# Remove lines with TARGET = NaN
data = raw_data.dropna(subset=["TARGET"])

X = data.drop("TARGET", axis=1)
y = data["TARGET"]

## Split data

In [86]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

## Create model

### Set model parameters

In [87]:
params = {
    "n_estimators": 100,
    "random_state": random_state,
}

### Set grid parameters

In [88]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 8],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion' :['gini', 'entropy']
}

### Cross validate the model to assess efficiently the mean score on different fold

Models to test:
- RandomForestClassifier
- XGBoost
- LightGBM
- Logistic Regression avec pondération des classes

In [89]:
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb

In [90]:
def evaluate_model_with_cross_val(X_train, y_train, params):
    # Initialisation du modèle
    model = RandomForestClassifier(**params)

    # Évaluation du modèle avec validation croisée
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')

    # Moyenne et écart-type des scores de validation croisée
    mean_cv_score = np.mean(cv_scores)
    std_cv_score = np.std(cv_scores)

    return mean_cv_score, std_cv_score

### Train model

In [91]:
def train_model(X_train, y_train, params, param_grid):
    # Initialisation du modèle
    model = RandomForestClassifier(**params)

    # Initialisation de GridSearchCV
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

    # Entraînement du modèle
    grid_search.fit(X_train, y_train)

    return grid_search

## Evaluate model

In [92]:
# Define a work cost function
def cost_metric(y_true, y_pred, cost_fn=cost_fn, cost_fp=cost_fp):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    cost = (fn * cost_fn) + (fp * cost_fp)
    return cost

In [93]:
def optimize_threshold(y_true, y_proba, cost_fn=cost_fn, cost_fp=cost_fp):
    fpr, tpr, thresholds = roc_curve(y_true, y_proba)
    costs = []
    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        cost = cost_metric(y_true, y_pred, cost_fn, cost_fp)
        costs.append(cost)
    optimal_idx = np.argmin(costs)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold


In [94]:
def assess_model(best_model, X_test, y_test):
    # Prédictions sur l'ensemble de test
    y_proba = best_model.predict_proba(X_test)[:, 1]
    optimal_threshold = optimize_threshold(y_test, y_proba)
    y_pred = (y_proba >= optimal_threshold).astype(int)

    # Calcul des métriques
    auc_roc = roc_auc_score(y_test, y_proba)
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    auc_pr = auc(recall, precision) # Area Under the Precision-Recall Curve
    cost = cost_metric(y_test, y_pred)
    f1 = f1_score(y_test, y_pred) # F1 score
    accuracy = accuracy_score(y_test, y_pred)

    

    # return y_proba, optimal_threshold, y_pred

    return accuracy, auc_roc, cost, auc_pr, f1

# MLFlow

## Import libraries

In [95]:
import mlflow
from mlflow.models import infer_signature
# import mlflow.xgboost

## Set experiment and tracking URI

In [96]:
mlflow.set_tracking_uri(uri="../mlruns")

mlflow.set_experiment("Credit_Scoring_Model")


<Experiment: artifact_location='file:///c:/Users/Z478SG/Desktop/Ecole/OpenClassrooms-Projet-7/modeling/notebooks/../mlruns/129028389153367336', creation_time=1725033902660, experiment_id='129028389153367336', last_update_time=1725033902660, lifecycle_stage='active', name='Credit_Scoring_Model', tags={}>

## Log experiment data into MLFlow

### /!\ Before continuing, run the MLflow server in the background with a terminal window

$ ./run_mlflow_server.sh

Then open the web browser in [http://127.0.0.1:8080](http://127.0.0.1:8080)

### Log data into MLFlow

In [98]:
# Start an MLflow run
with mlflow.start_run():
    ## Run experiment
    # Evaluate model
    mean_cv_score, std_cv_score = evaluate_model_with_cross_val(X_train, y_train, params)
    print(f"Mean CV AUC-ROC: {mean_cv_score}, Std CV AUC-ROC: {std_cv_score}")

    # Train model
    grid_search = train_model(X_train, y_train, params, param_grid)

    best_model = grid_search.best_estimator_

    # Assess model
    accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold = assess_model(best_model, X_test, y_test)

    ## Log data
    # Set a tag that (key, value) we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic LR model for iris data, 30% test split")

    # Log parameters
    mlflow.log_params({
        "test_size": test_size,
        "random_state": random_state,
        "cost_fn": cost_fn
        })
    mlflow.log_params(params)
    mlflow.log_params(param_grid)
    mlflow.log_params(grid_search.best_params_)

    # Log metrics (accuracy and AUC-ROC)
    mlflow.log_metrics({
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Cost": cost,
        "AUC-PR": auc_pr,
        "F1": f1,
        "Optimal Threshold": optimal_threshold,
        "Mean CV Score": mean_cv_score,
        "Std CV Score": std_cv_score,
        })

    # Infer the model signature (input and output schema) from the training data
    signature = infer_signature(X_train, best_model.predict(X_train))
    
    # Log model
    model_info = mlflow.sklearn.log_model(
        sk_model=best_model, 
        artifact_path=artifact_path, 
        signature=signature,
        input_example=X_train,
        )

    # Log data path
    mlflow.log_artifact(data_path)
    

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Z478SG\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Z478SG\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Z478SG\AppData\Roaming\Python\Python312\site-packages\sklearn\ensemble\_forest.py", line 377, in fit
    estimator._compute_missing_values_in_feature_mask(
  File "C:\Users\Z478SG\AppData\Roaming\Python\Python312\site-packages\sklearn\tree\_classes.py", line 222, in _compute_missing_values_in_feature_mask
    _assert_all_finite_element_wise(X, xp=np, allow_nan=True, **common_kwargs)
  File "C:\Users\Z478SG\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\validation.py", line 172, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains infinity or a value too large for dtype('float32').


## View experiments in MLFlow UI

In a background terminal, run:  

./run_mlflow_ui.py

## If result is good : Register the model as an official model

In [None]:
# If not registered during the run, officially register the model artifact with a specific name

result = mlflow.register_model(
    model_info.model_uri, "name-of-the-working-model"
)