In [13]:
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow, os
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

from data_prep import reduce_mem_usage
from mlFlow.MLflowTracker import MLflowTracker
from model_utils import get_pipeline, get_gridsearch
from model_utils import run_training_with_mlflow


In [14]:
# Name of the label column in the training frame.
target = "TARGET"

# Raw application tables. The enriched variants
# (../model/application_train_enriched.csv and
# ../model/application_test_enriched.csv) can be swapped in here instead.
train_csv = "../home-credit-default-risk/application_train.csv"
test_csv = "../home-credit-default-risk/application_test.csv"

# Read each CSV and pass it through reduce_mem_usage (train first, then test,
# so the memory report is printed in that order).
train, test = (
    reduce_mem_usage(pd.read_csv(csv_path)) for csv_path in (train_csv, test_csv)
)

Usage mémoire initial du DataFrame: 286.23 MB
Usage mémoire final du DataFrame: 92.38 MB
Mémoire réduite de 67.7 %
Usage mémoire initial du DataFrame: 45.00 MB
Usage mémoire final du DataFrame: 14.60 MB
Mémoire réduite de 67.6 %


In [15]:
# Shape of the training frame before resampling.
print(f"Train size: {train.shape}")

# Down-sample the majority class: keep every TARGET == 1 row and draw a fixed
# number of TARGET == 0 rows per positive, with a fixed seed so the draw is
# repeatable.
negatives_per_positive = 3
labels = train[target]
ones = train.loc[labels == 1]
zeros = train.loc[labels == 0].sample(
    n=negatives_per_positive * len(ones), random_state=42
)

# Stack positives then sampled negatives and renumber the rows from 0.
# NOTE(review): this overwrites `train`, so the full frame is only recoverable
# by re-running the loading cell.
train = pd.concat([ones, zeros], ignore_index=True)
print(f"Train size after balancing: {train.shape}")

Train size: (307511, 122)
Train size after balancing: (99300, 122)


In [16]:
# Separate the feature columns from the label column.
X = train.drop(columns=[target])
y = train[target]

# Hold out 20% of the rows for the final evaluation. stratify=y keeps the class
# proportions identical in both partitions, matching the stratified folds used
# for cross-validation below.
# NOTE(review): the hold-out set is cut from the down-sampled frame, so its
# class ratio is the resampled one rather than the ratio in the original data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Three shuffled, seeded stratified folds, passed to get_gridsearch by every
# model section of the notebook.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [17]:
# Partition the training columns by dtype: numeric columns versus
# object (string) columns. Columns of any other dtype land in neither list.
# NOTE(review): the numeric list includes SK_ID_CURR, which is also used as the
# row identifier of the submission file — confirm it should be a model feature.
number_col = X_train.select_dtypes(include=np.number).columns
categorical_col = X_train.select_dtypes(include=['object']).columns

print(f"{number_col=}")
print(f"{categorical_col=}")

number_col=Index(['SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE',
       'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=105)
categorical_col=Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
       'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
       'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
      dtype='object')


In [18]:
# Random-forest search space, written with plain estimator parameter names.
# 2 x 2 x 2 x 2 x 1 = 16 candidate combinations.
rf_search_space = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20],
    "min_samples_split": [2, 5],
    "class_weight": ['balanced', {0: 1, 1: 10}],
    "random_state": [42],
}

# Prefix every name with "model__" so the grid addresses the pipeline step
# named "model".
param_grid = {f"model__{name}": values for name, values in rf_search_space.items()}

In [19]:
# Pipeline around a default random forest; the helper receives the numeric and
# categorical column lists computed above.
pipeline = get_pipeline(
    RandomForestClassifier(),
    numeric_features=number_col,
    categorical_features=categorical_col,
)

# Grid search over param_grid using the stratified folds.
# Presumably equivalent to GridSearchCV(pipeline, param_grid, scoring="f1",
# cv=skf, n_jobs=-1, verbose=2) — confirm in model_utils.get_gridsearch.
grid = get_gridsearch(pipeline, param_grid, skf)

In [20]:
# One tracker shared by every training run in this notebook; the argument is
# the MLflow experiment name.
experiment_name = "home_credit_experiment"
tracker = MLflowTracker(experiment_name)

2025/12/15 14:00:12 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/15 14:00:12 INFO mlflow.store.db.utils: Updating database tables
2025-12-15 14:00:12 INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
2025-12-15 14:00:12 INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2025-12-15 14:00:12 INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
2025-12-15 14:00:12 INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


[MLflow] L'expérience existante 'home_credit_experiment' a un artifact_location différent.
Création et basculement vers la nouvelle expérience 'home_credit_experiment_20251215_140012' avec artifact_location=file:///C:/Users/corentin/Nextcloud/Onedrive-Esaip/1%20Cours/S7/Majeur%20Projets/Project%20DEA/model.
[MLflow] Tracking sur: sqlite:///C:/Users/corentin/Nextcloud/Onedrive-Esaip/1 Cours/S7/Majeur Projets/Project DEA/mlruns/mlflow.db ; artefacts -> file:///C:/Users/corentin/Nextcloud/Onedrive-Esaip/1%20Cours/S7/Majeur%20Projets/Project%20DEA/model


In [None]:
# Run the random-forest grid search through the MLflow helper.
# The helper returns the selected model and its metrics; presumably it fits
# `grid` in place and logs the run to `tracker` — confirm in model_utils.
split_kwargs = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
best_model, metrics = run_training_with_mlflow(
    grid=grid,
    pipeline=pipeline,
    tracker=tracker,
    run_name="rf_gridsearch_v1",
    model_name="home_credit_rf",
    **split_kwargs,
)


Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [None]:
# Predict the hold-out rows with the fitted grid search.
y_pred = grid.predict(X_test)

print("Best parameters:")
print(grid.best_params_)

# Hold-out metrics. The F1 reported here is the support-weighted average over
# both classes.
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy :", accuracy)
print("F1 Score :", weighted_f1)

# Per-class precision / recall / F1 table.
print(classification_report(y_test, y_pred))

In [None]:
# Raw confusion-matrix counts (rows: true class, columns: predicted class),
# shown as the cell output.
cm_counts = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_counts

In [None]:
# Confusion matrix of the hold-out predictions as an annotated heatmap.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
plt.title("Matrice de confusion")
plt.show()

# Feature importances of the best estimator found by the grid search.
model = grid.best_estimator_.named_steps["model"]
preprocessor = grid.best_estimator_.named_steps["preprocessing"]
feature_names = preprocessor.get_feature_names_out()
# Keep only the text after the last "__" of each generated name
# (presumably this strips the transformer prefix — confirm against get_pipeline).
clean_names = [name.split("__")[-1] for name in feature_names]
importances = pd.Series(model.feature_importances_, index=clean_names)

# Single source of truth for how many features are plotted AND announced in the
# title (the title previously said 15 while 25 bars were drawn).
top_n = 25
importances.sort_values(ascending=False).head(top_n).plot(kind="barh", figsize=(12, 8))
plt.title(f"Top {top_n} Features importantes")
plt.show()

# Logistic Regression Model

In [None]:
# Logistic-regression search space, written with plain estimator parameter
# names. 4 x 2 x 1 x 2 x 1 = 16 candidate combinations.
logreg_search_space = {
    "C": [0.01, 0.1, 1, 10],
    "solver": ['liblinear', 'saga'],
    "max_iter": [200],
    "class_weight": ['balanced', {0: 1, 1: 10}],
    "random_state": [42],
}
# Prefix every name with "model__" so the grid addresses the pipeline step
# named "model".
param_grid = {
    f"model__{name}": values for name, values in logreg_search_space.items()
}

pipeline = get_pipeline(
    LogisticRegression(),
    numeric_features=number_col,
    categorical_features=categorical_col,
)
# Presumably equivalent to GridSearchCV(pipeline, param_grid, scoring="f1",
# cv=skf, n_jobs=-1, verbose=2) — confirm in model_utils.get_gridsearch.
grid = get_gridsearch(pipeline, param_grid, skf)

# Run the search through the MLflow helper; this rebinds best_model / metrics
# from the previous model section.
split_kwargs = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
best_model, metrics = run_training_with_mlflow(
    grid=grid,
    pipeline=pipeline,
    tracker=tracker,
    run_name="logreg_gridsearch_v1",
    model_name="home_credit_logreg",
    **split_kwargs,
)

# LightGBM

In [None]:
# LightGBM search space: 2 x 2 x 1 x 2 x 3 x 2 x 1 = 48 candidate combinations.
# Keys carry the "model__" prefix so they address the pipeline step named "model".
param_grid = {
    "model__class_weight": ['balanced', {0: 1, 1: 10}],
    "model__n_estimators": [400, 500],
    "model__learning_rate": [0.1],
    "model__num_leaves": [31, 50],
    "model__max_depth": [-1, 10, 20],
    "model__min_child_samples": [20, 50],
    "model__random_state": [42],
}

# LGBMClassifier comes from the import cell at the top of the notebook, so this
# cell no longer carries its own import.
pipeline = get_pipeline(
    LGBMClassifier(),
    numeric_features=number_col,
    categorical_features=categorical_col,
)
# Presumably equivalent to GridSearchCV(pipeline, param_grid, scoring="f1",
# cv=skf, n_jobs=-1, verbose=2) — confirm in model_utils.get_gridsearch.
grid = get_gridsearch(pipeline, param_grid, skf)

In [None]:
# Run the LightGBM grid search through the MLflow helper; this rebinds
# best_model / metrics from the previous model section.
split_kwargs = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
best_model, metrics = run_training_with_mlflow(
    grid=grid,
    pipeline=pipeline,
    tracker=tracker,
    run_name="lgbm_gridsearch_v1",
    model_name="home_credit_lgbm",
    **split_kwargs,
)

In [None]:
# Predict the hold-out rows with the fitted LightGBM grid search.
y_pred = grid.predict(X_test)

print("Best parameters:")
print(grid.best_params_)

# Hold-out metrics. The F1 reported here is the support-weighted average over
# both classes.
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy :", accuracy)
print("F1 Score :", weighted_f1)
print(classification_report(y_test, y_pred))

# Annotated heatmap of the confusion-matrix counts.
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
ax.set_title("Matrice de confusion")
plt.show()

# MLP Classifier

In [None]:
# MLP search space: only the hidden-layer width varies (2 candidates).
# Keys carry the "model__" prefix so they address the pipeline step named "model".
param_grid = {
    "model__hidden_layer_sizes": [(100,), (200,)],
    "model__activation": ['relu'],
    "model__solver": ['adam'],
    # Currently not searched:
    #"model__alpha": [0.0001, 0.001],
    #"model__learning_rate": ['constant', 'adaptive'],
    "model__max_iter": [200],
    "model__random_state": [42],
}

# MLPClassifier comes from the import cell at the top of the notebook, so this
# cell no longer carries its own import.
# NOTE(review): MLPs are sensitive to feature scale — confirm that get_pipeline
# standardises the numeric features.
pipeline = get_pipeline(
    MLPClassifier(),
    numeric_features=number_col,
    categorical_features=categorical_col,
)
# Presumably equivalent to GridSearchCV(pipeline, param_grid, scoring="f1",
# cv=skf, n_jobs=-1, verbose=2) — confirm in model_utils.get_gridsearch.
grid = get_gridsearch(pipeline, param_grid, skf)

In [None]:
# Run the MLP grid search through the MLflow helper; this rebinds
# best_model / metrics from the previous model section.
split_kwargs = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
best_model, metrics = run_training_with_mlflow(
    grid=grid,
    pipeline=pipeline,
    tracker=tracker,
    run_name="mlp_gridsearch_v1",
    model_name="home_credit_mlp",
    **split_kwargs,
)

In [None]:
# Predict the hold-out rows with the fitted MLP grid search.
y_pred = grid.predict(X_test)

print("Best parameters:")
print(grid.best_params_)

# Hold-out metrics. The F1 reported here is the support-weighted average over
# both classes.
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy :", accuracy)
print("F1 Score :", weighted_f1)

# Per-class precision / recall / F1 table.
print(classification_report(y_test, y_pred))

In [None]:
# Annotated heatmap of the confusion-matrix counts for the current y_pred.
cm_counts = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm_counts, annot=True, fmt="d", cmap="Blues")
ax.set_title("Matrice de confusion")
plt.show()

# Submit predictions

In [None]:
# Score the Kaggle test frame with `best_model`, i.e. whatever the most recent
# run_training_with_mlflow call returned (the MLP when the notebook is run top
# to bottom).
#
# The Home Credit competition is scored by ROC AUC on the predicted probability
# of TARGET == 1, so write probabilities rather than hard 0/1 labels whenever
# the model exposes them. Column 1 is the positive class, assuming the fitted
# classes are [0, 1] as in the training label.
if hasattr(best_model, "predict_proba"):
    pred = best_model.predict_proba(test)[:, 1]
else:
    # Fallback for model wrappers that only expose predict().
    pred = best_model.predict(test)

# One row per application: the identifier and its score.
submission = pd.DataFrame({
    "SK_ID_CURR": test["SK_ID_CURR"],
    "TARGET": pred
})
submission.to_csv("../model/submission.csv", index=False)