In [65]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import pickle 
from catboost import CatBoostClassifier


In [74]:
with open("datasets/dataset_withoutBadValues.pkl", "rb") as file:
    df = pickle.load(file)
file.close()
df = df.dropna()

row_1 = df.iloc[1]
print(row_1)

display(df)

Name             LANDMARK BAR & GRILLE (THE)
City                               NEW PARIS
State                                     IN
Zip                                    46526
Bank                         1ST SOURCE BANK
BankState                                 IN
NAICS                                 722410
ApprovalDate             1997-02-28 00:00:00
ApprovalFY                              1997
Term                                      60
NoEmp                                      2
NewExist                                 2.0
CreateJob                                  0
RetainedJob                                0
FranchiseCode                              1
UrbanRural                                 0
RevLineCr                                  N
LowDoc                                     Y
MIS_Status                                 1
GrAppv                                 40000
SBA_Appv                               32000
Zip_2                                     46
NAICS_2   

Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,Term,...,UrbanRural,RevLineCr,LowDoc,MIS_Status,GrAppv,SBA_Appv,Zip_2,NAICS_2,Franchised,RealEstate
0,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,1997-02-28,1997,84,...,0,N,Y,1,60000,48000,47,45,0,0
1,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,1997-02-28,1997,60,...,0,N,Y,1,40000,32000,46,72,0,0
2,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,1997-02-28,1997,180,...,0,N,N,1,287000,215250,47,62,0,0
5,"B&T SCREW MACHINE COMPANY, INC",PLAINVILLE,CT,6062,"TD BANK, NATIONAL ASSOCIATION",DE,332721,1997-02-28,1997,120,...,0,N,N,1,517000,387750,60,33,0,0
7,WEAVER PRODUCTS,SUMMERFIELD,FL,34491,REGIONS BANK,AL,811118,1997-02-28,1997,84,...,0,N,Y,1,45000,36000,34,81,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899156,"FABRICATORS STEEL, INC.",BALTIMORE,MD,21224,BANK OF AMERICA NATL ASSOC,MD,332431,1997-02-27,1997,60,...,0,0,N,1,50000,25000,21,33,0,0
899157,PULLTARPS MFG.,EL CAJON,CA,92020,U.S. BANK NATIONAL ASSOCIATION,CA,314912,1997-02-27,1997,36,...,0,N,N,1,200000,150000,92,31,0,0
899159,FABRIC FARMS,UPPER ARLINGTON,OH,43221,JPMORGAN CHASE BANK NATL ASSOC,IL,451120,1997-02-27,1997,60,...,0,0,N,1,70000,56000,43,45,0,0
899160,FABRIC FARMS,COLUMBUS,OH,43221,JPMORGAN CHASE BANK NATL ASSOC,IL,451130,1997-02-27,1997,60,...,0,Y,N,1,85000,42500,43,45,0,0


In [77]:

# Définir X et y

X = df.drop(['MIS_Status', 'ApprovalDate'], axis=1)
y = df["MIS_Status"]


object_cols = list(X.select_dtypes(include=["object"]).columns)
cat_cols = list(X.select_dtypes(include=["category"]).columns.drop(["ApprovalFY"]))
num_cols = list(X.select_dtypes(include=["int64"]).columns)
year_col = ["ApprovalFY"]

for col in object_cols:
    frequency_encoding = X[col].value_counts(normalize=True)
    X[col] = X[col].map(frequency_encoding)


unique_years = sorted(df["ApprovalFY"].unique())
preprocessor = ColumnTransformer([
        ("onehot", OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"), cat_cols),
        ("scaler", StandardScaler(), num_cols),
        ("ordinal", OrdinalEncoder(categories=[unique_years], handle_unknown="use_encoded_value", unknown_value=-1), year_col),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False
)


# Création du pipeline avec le preprocessor et le modèle XGBoost
pipeline_cb = Pipeline([
    ("preprocessor", preprocessor),
    ("model", CatBoostClassifier(random_state=42, verbose=0))
])

param_grid = {
    'model__iterations': [50, 100, 150],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__depth': [2, 4, 6],
    'model__one_hot_max_size': [5, 10, 15],
    'model__verbose': [0]
}


X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)


# # Utiliser GridSearchCV pour trouver les meilleurs hyperparamètres. Best: 0.9194
# grid_search = GridSearchCV(pipeline_cb, param_grid, cv=5, scoring='f1_macro')
# grid_search.fit(X_train, y_train)

# # Afficher les meilleurs hyperparamètres
# print("Meilleurs hyperparamètres :", grid_search.best_params_)

# # Entraîner le modèle CatBoost avec les meilleurs hyperparamètres sur l'ensemble d'entraînement complet
# best_cb_model = grid_search.best_estimator_
# best_cb_model.fit(X_train, y_train)

# # Prédictions sur le jeu de données de test avec le modèle CatBoost optimal
# y_pred_best_cb = best_cb_model.predict(X_test)

# # Afficher le rapport de classification
# print(classification_report(y_test, y_pred_best_cb, digits=4))





# Utiliser GridSearchCV pour trouver les meilleurs hyperparamètres. Best: 0.9311
pipeline_cb.fit(X_train, y_train)

display(X_test)

# Prédictions sur le jeu de données de test avec le modèle CatBoost optimal
y_pred_cb = pipeline_cb.predict(X_test)


# Afficher le rapport de classification
print(classification_report(y_test, y_pred_cb, digits=4))


Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalFY,Term,NoEmp,...,FranchiseCode,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv,Zip_2,NAICS_2,Franchised,RealEstate
180730,0.000001,0.000184,NJ,0.000186,0.032070,DE,0.000691,2007,84,2,...,0.272715,1,N,N,50000,25000,70,42,0,0
327139,0.000003,0.000013,IL,0.000207,0.009201,NY,0.000900,2009,57,1,...,0.272715,1,N,N,25000,12500,60,81,0,0
457870,0.000001,0.000258,FL,0.000078,0.062054,SD,0.011323,1991,122,3,...,0.677084,0,N,N,79000,71100,33,81,0,0
238588,0.000003,0.003659,TX,0.000122,0.000321,TX,0.003527,2007,90,5,...,0.272715,1,N,N,103600,88060,78,54,0,0
439436,0.000001,0.003941,CO,0.000137,0.059250,IL,0.003868,2010,84,35,...,0.272715,1,Y,N,250000,125000,80,54,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22551,0.000001,0.000092,CA,0.000102,0.031342,VA,0.040209,2005,71,7,...,0.677084,1,N,N,50000,25000,95,72,0,0
653425,0.000001,0.000048,MA,0.000158,0.046419,RI,0.004043,2003,84,3,...,0.677084,1,Y,N,25000,12500,27,81,0,0
42838,0.000001,0.000186,NH,0.000196,0.046419,RI,0.000223,2005,84,25,...,0.677084,1,N,N,100000,50000,32,33,0,0
219626,0.000001,0.000050,CO,0.000020,0.000960,CO,0.027969,2007,240,1,...,0.000045,2,0,N,625000,625000,80,72,1,1


              precision    recall  f1-score   support

           0     0.8982    0.8818    0.8899      7037
           1     0.9701    0.9746    0.9724     27700

    accuracy                         0.9558     34737
   macro avg     0.9342    0.9282    0.9311     34737
weighted avg     0.9555    0.9558    0.9557     34737



In [68]:
print (X_test)
print(X_test.info())

            Name      City State       Zip      Bank BankState     NAICS  \
180730  0.000001  0.000184    NJ  0.000186  0.032070        DE  0.000691   
327139  0.000003  0.000013    IL  0.000207  0.009201        NY  0.000900   
457870  0.000001  0.000258    FL  0.000078  0.062054        SD  0.011323   
238588  0.000003  0.003659    TX  0.000122  0.000321        TX  0.003527   
439436  0.000001  0.003941    CO  0.000137  0.059250        IL  0.003868   
...          ...       ...   ...       ...       ...       ...       ...   
22551   0.000001  0.000092    CA  0.000102  0.031342        VA  0.040209   
653425  0.000001  0.000048    MA  0.000158  0.046419        RI  0.004043   
42838   0.000001  0.000186    NH  0.000196  0.046419        RI  0.000223   
219626  0.000001  0.000050    CO  0.000020  0.000960        CO  0.027969   
814251  0.000001  0.000081    NY  0.000089  0.108134        NC  0.013211   

       ApprovalFY  Term  NoEmp  ... FranchiseCode  UrbanRural  RevLineCr  \
180730     

In [73]:
# Ajuster le préprocesseur avec les données d'entraînement
pipeline_cb.named_steps['preprocessor'].fit(X_train)


# Export du model sous pickel

with open("datasets/model_catboost.pkl", "wb") as file:
    pickle.dump(pipeline_cb, file)
file.close()

In [70]:
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import confusion_matrix, precision_recall_curve
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, cross_val_predict

# y_probas_rf = cross_val_predict(pipeline_cb, X_test, y_test, method="predict_proba")
# y_scores_rf = y_probas_rf[:, 1]

# precisions_rf, recalls_rf, thresholds_rf = precision_recall_curve(y_test, y_scores_rf)

# plt.plot(recalls_rf[:-1], precisions_rf[:-1])
# plt.title("Evolution de la precision en fonction du recall pour le modèle Random Forest")
# plt.xlabel("Recall")
# plt.ylabel("Precision")
# plt.grid()
# plt.show()

# # print(thresholds_rf)

In [71]:
print(classification_report(y_test, y_pred_cb))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      7037
           1       0.97      0.97      0.97     27700

    accuracy                           0.96     34737
   macro avg       0.93      0.93      0.93     34737
weighted avg       0.96      0.96      0.96     34737



In [72]:
# from sklearn.model_selection import learning_curve

# train_sizes, train_scores, validation_scores = learning_curve(
#     estimator = pipeline_cb,
#     X = X_train,
#     y = y_train,
#     train_sizes = np.linspace(0.1, 1.0, 10),
#     cv = 5,
#     scoring = 'f1_macro'
# )

# #recuperation des moyennes et des écarts types(facultatif) à chaque cv
# train_scores_mean = train_scores.mean(axis = 1)
# train_scores_std = train_scores.std(axis = 1)
# validation_scores_mean = validation_scores.mean(axis = 1)
# validation_scores_std = validation_scores.std(axis = 1)


# plt.figure()
# plt.title("Learning Curve (Random Forest)")
# plt.xlabel("Training examples")
# plt.ylabel("f1_macro")
# # plt.gca().invert_yaxis() # -> inverser le y si la métrique est une perte

# # afficher les points
# plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
# plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")

# # relier les points
# plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
# plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std, validation_scores_mean + validation_scores_std, alpha=0.1, color="g")


# plt.legend()
# plt.show()