In [204]:
#!pip install xgboost
import pandas as pd

# Load the preprocessed credit-risk dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../data/processed/credit_risk_dataset_processed.csv')
# NOTE(review): the rendered preview shows an 'Unnamed: 0' column — presumably
# a row index written by an earlier to_csv call; confirm and consider
# reading with index_col=0.
# Display the first rows as the cell output.
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,...,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_grade_encoded,cb_person_default_on_file_encoded
0,0.017241,0.002751,OWN,0.121951,EDUCATION,B,0.014493,0.321348,0,0.120482,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
1,0.086207,0.002751,MORTGAGE,0.02439,MEDICAL,C,0.144928,0.418539,1,0.686747,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0
2,0.051724,0.030209,RENT,0.097561,MEDICAL,C,1.0,0.551124,1,0.638554,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0
3,0.068966,0.024757,RENT,0.195122,MEDICAL,C,1.0,0.497191,1,0.662651,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1
4,0.017241,0.002898,OWN,0.04878,VENTURE,A,0.057971,0.096629,1,0.301205,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0


In [205]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import time
from tabulate import tabulate

# Feature columns used by every model below.
X = df[['person_age', 'person_income', 'person_emp_length', 'loan_amnt',  'loan_int_rate', 
 'loan_percent_income', 'cb_person_cred_hist_length',  'loan_intent_DEBTCONSOLIDATION',
 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
 'loan_intent_PERSONAL',  'loan_intent_VENTURE', 'person_home_ownership_MORTGAGE',
 'person_home_ownership_OTHER', 'person_home_ownership_OWN', 'person_home_ownership_RENT',
 'loan_grade_encoded', 'cb_person_default_on_file_encoded']]

# Binary target column.
y = df['loan_status']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One dict per evaluated model, in evaluation order.
resultats = []

# Key/header of the timing column. Keeping it in a variable avoids a backslash
# escape inside an f-string expression, which is a SyntaxError before Python 3.12.
CLE_TEMPS = "Temps d'exécution"


def evaluer_modele(nom, modele):
    """Fit a model on the training split, score it on the test split and record the result.

    A dict holding the label, precision, recall, F1-score and elapsed time is
    appended to the module-level ``resultats`` list.

    Args:
        nom: Label shown for this model in the results table.
        modele: Unfitted estimator exposing ``fit`` and ``predict``.

    Returns:
        The predictions of the fitted model on ``X_test``.
    """
    # The timer is restarted for every model, so each timing covers only that
    # model's own fit + predict (the original random-forest timing silently
    # included the decision-tree section because its timer was never reset).
    debut = time.time()
    modele.fit(X_train, y_train)
    predictions = modele.predict(X_test)
    fin = time.time()
    resultats.append({
        'Modèle': nom,
        'Précision': precision_score(y_test, predictions),
        'Rappel': recall_score(y_test, predictions),
        'F1-score': f1_score(y_test, predictions),
        CLE_TEMPS: fin - debut
    })
    return predictions


# Logistic regression
modele_lr = LogisticRegression(max_iter=1000)
predictions_lr = evaluer_modele('Régression logistique', modele_lr)

# Decision tree (seeded so that reruns give the same tree)
modele_dt = DecisionTreeClassifier(max_depth=10, random_state=42)
predictions_dt = evaluer_modele('Arbre de décision', modele_dt)

# Random forest (seeded: bootstrap sampling and feature selection are random)
modele_rf = RandomForestClassifier(max_depth=20, random_state=42)
predictions_rf = evaluer_modele('Forêt aléatoire', modele_rf)

# KNN
modele_knn = KNeighborsClassifier(n_neighbors=10)
predictions_knn = evaluer_modele('KNN', modele_knn)

# Naive Bayes
modele_nb = GaussianNB()
predictions_nb = evaluer_modele('Naive Bayes', modele_nb)

# XGBoost
modele_xgb = xgb.XGBClassifier(
    colsample_bytree=0.8,
    gamma=0.1,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    subsample=1,
    random_state=42
)
predictions_xgb = evaluer_modele('XGBoost', modele_xgb)

# Print the results as a table: metrics with 3 decimals, time in seconds with 2.
tableau = [
    [
        resultat['Modèle'],
        f"{resultat['Précision']:.3f}",
        f"{resultat['Rappel']:.3f}",
        f"{resultat['F1-score']:.3f}",
        f"{resultat[CLE_TEMPS]:.2f}",
    ]
    for resultat in resultats
]
print(tabulate(tableau, headers=['Modèle', 'Précision', 'Rappel', 'F1-score', CLE_TEMPS], tablefmt='orgtbl'))


| Modèle                |   Précision |   Rappel |   F1-score |   Temps d'exécution |
|-----------------------+-------------+----------+------------+---------------------|
| Régression logistique |       0.746 |    0.491 |      0.592 |                0.08 |
| Arbre de décision     |       0.962 |    0.701 |      0.811 |                0.08 |
| Forêt aléatoire       |       0.976 |    0.702 |      0.817 |                2.5  |
| KNN                   |       0.896 |    0.564 |      0.692 |                0.5  |
| Naive Bayes           |       0.537 |    0.703 |      0.609 |                0.01 |
| XGBoost               |       0.973 |    0.717 |      0.826 |                0.37 |


In [206]:
from sklearn.model_selection import GridSearchCV
def tester_parametres_xgbooster(scoring=None, cv=5, n_jobs=None):
    """Run an exhaustive grid search over XGBoost hyper-parameters.

    The grid holds 3 values for each of 6 parameters, i.e. 729 combinations,
    each fitted ``cv`` times on the module-level ``X_train`` / ``y_train``.
    The best parameters and the best cross-validated score are printed.

    Args:
        scoring: Passed to ``GridSearchCV``; ``None`` uses the estimator's
            default ``score`` method.
        cv: Passed to ``GridSearchCV`` as the cross-validation setting.
        n_jobs: Passed to ``GridSearchCV``; ``None`` keeps its default.

    Returns:
        The fitted ``GridSearchCV`` object, so the outcome of the (expensive)
        search can be reused instead of being lost after printing.
    """
    parametres = {
        'max_depth': [3, 5, 10],
        'learning_rate': [0.1, 0.5, 1],
        'n_estimators': [50, 100, 200],
        'gamma': [0, 0.1, 0.5],
        'subsample': [0.5, 0.8, 1],
        'colsample_bytree': [0.5, 0.8, 1]
    }

    grid_search = GridSearchCV(
        # Seeded so that subsample / colsample_bytree < 1 give repeatable scores.
        xgb.XGBClassifier(random_state=42),
        parametres,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
    )
    grid_search.fit(X_train, y_train)
    print("Les meilleurs paramètres :", grid_search.best_params_)
    print("Meilleur score :", grid_search.best_score_)
    return grid_search



def demande_confirmation():
    """Ask the user to confirm before launching the hyper-parameter search.

    The prompt is repeated until the answer is "oui" or "non"; the comparison
    ignores case and surrounding whitespace.

    Returns:
        Whatever ``tester_parametres_xgbooster()`` returns when the user
        answers "oui"; ``None`` when the user answers "non".
    """
    # A loop instead of self-recursion: repeated invalid answers no longer
    # grow the call stack.
    while True:
        reponse = input(
            "Ce processus prendra beaucoup de temps. Êtes-vous sûr de vouloir le lancer ? (oui/non) "
        ).strip().lower()
        if reponse == "oui":
            return tester_parametres_xgbooster()
        if reponse == "non":
            print("Processus annulé")
            return None
        print("Réponse invalide. Veuillez répondre par 'oui' ou 'non'.")

# Prompt for confirmation before starting the grid search. This call waits on
# input(), so the cell needs a typed answer every time it is executed.
demande_confirmation()



Ce processus prendra beaucoup de temps. Êtes-vous sûr de vouloir le lancer ? (oui/non)  non


Processus annulé


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'