In [26]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import pickle 

from catboost import CatBoostClassifier


In [27]:
with open("datasets/dataset_withoutBadValues.pkl", "rb") as file:
    df = pickle.load(file)
file.close()
df = df.dropna()


In [28]:
# Définir X et y
X = df.drop(['MIS_Status', 'Name', 'ApprovalDate'], axis=1)
y = df["MIS_Status"]


object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns.drop(["ApprovalFY"]))
num_cols = list(X.select_dtypes(include=["int64"]).columns)
year_col = ["ApprovalFY"]

for col in object_cols:
    frequency_encoding = X[col].value_counts(normalize=True)
    X[col] = X[col].map(frequency_encoding)


unique_years = sorted(df["ApprovalFY"].unique())
preprocessor = ColumnTransformer([
        ("onehot", OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"), cat_cols),
        ("scaler", StandardScaler(), num_cols),
        ("ordinal", OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=-1), year_col),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False
)


# Création du pipeline avec le preprocessor et le modèle XGBoost
pipeline_cb = Pipeline([
    ("preprocessor", preprocessor),
    ("model", CatBoostClassifier(one_hot_max_size = 10, verbose = 0))
])


X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

# Utiliser GridSearchCV pour trouver les meilleurs hyperparamètres
pipeline_cb.fit(X_train, y_train)

# Prédictions sur le jeu de données de test avec le modèle CatBoost optimal
y_pred_cb = pipeline_cb.predict(X_test)

# Afficher le rapport de classification
print(classification_report(y_test, y_pred_cb, digits=4))

              precision    recall  f1-score   support

           0     0.8996    0.8775    0.8884      7037
           1     0.9691    0.9751    0.9721     27700

    accuracy                         0.9554     34737
   macro avg     0.9343    0.9263    0.9303     34737
weighted avg     0.9550    0.9554    0.9551     34737



In [29]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict

y_probas_rf = cross_val_predict(pipeline_cb, X_test, y_test, method="predict_proba")
y_scores_rf = y_probas_rf[:, 1]

precisions_rf, recalls_rf, thresholds_rf = precision_recall_curve(y_test, y_scores_rf)

plt.plot(recalls_rf[:-1], precisions_rf[:-1])
plt.title("Evolution de la precision en fonction du recall pour le modèle Random Forest")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid()
plt.show()

# print(thresholds_rf)

In [None]:
print(classification_report(y_test, y_pred_cb))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      7037
           1       0.97      0.98      0.97     27700

    accuracy                           0.96     34737
   macro avg       0.93      0.93      0.93     34737
weighted avg       0.96      0.96      0.96     34737



In [None]:
# from sklearn.model_selection import learning_curve

# train_sizes, train_scores, validation_scores = learning_curve(
#     estimator = pipeline_cb,
#     X = X_train,
#     y = y_train,
#     train_sizes = np.linspace(0.1, 1.0, 10),
#     cv = 5,
#     scoring = 'f1_macro'
# )

# #recuperation des moyennes et des écarts types(facultatif) à chaque cv
# train_scores_mean = train_scores.mean(axis = 1)
# train_scores_std = train_scores.std(axis = 1)
# validation_scores_mean = validation_scores.mean(axis = 1)
# validation_scores_std = validation_scores.std(axis = 1)


# plt.figure()
# plt.title("Learning Curve (Random Forest)")
# plt.xlabel("Training examples")
# plt.ylabel("f1_macro")
# # plt.gca().invert_yaxis() # -> inverser le y si la métrique est une perte

# # afficher les points
# plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
# plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")

# # relier les points
# plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
# plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std, validation_scores_mean + validation_scores_std, alpha=0.1, color="g")


# plt.legend()
# plt.show()