In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow, os

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

from model_utils import get_pipeline
from mlFlow.MLflowTracker import MLflowTracker
from model_utils import run_training_with_mlflow
from data_prep import reduce_mem_usage


In [None]:
# Name of the label column in the training table.
target = "TARGET"

# Read the enriched training table, pass it through the project helper
# reduce_mem_usage (presumably downcasts dtypes to save memory -- confirm in
# data_prep), then keep a seeded 50% random sample of the rows.
train = (
    pd.read_csv("../model/application_train_enriched.csv")
    .pipe(reduce_mem_usage)
    .sample(frac=0.5, random_state=42)
)

# The enriched test table goes through the same helper but is not sampled.
test = pd.read_csv("../model/application_test_enriched.csv").pipe(reduce_mem_usage)

In [None]:
# Separate the feature matrix from the label column.
X = train.drop(columns=[target])
y = train[target]
# NOTE(review): X keeps every non-target column. If the training table carries
# the SK_ID_CURR identifier (the test table does, see the submission cell), it
# is being fed to the models as a feature -- confirm this is intended.

# Hold out 20% of the sampled rows for evaluation. The split is stratified on
# the label so both partitions keep the same class proportions; the class
# weights tuned further down indicate the target is imbalanced, and an
# unstratified split can skew the minority-class share of the hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Partition the training columns by dtype; both index objects are handed to
# get_pipeline later as the numeric / categorical feature lists.
# NOTE(review): only `object` columns count as categorical here. If
# reduce_mem_usage converts strings to the pandas `category` dtype, those
# columns would land in neither list -- confirm against data_prep.
number_col = X_train.select_dtypes(include=np.number).columns
categorical_col = X_train.select_dtypes(include=["object"]).columns

# Echo both selections for a quick visual check.
print(f"{number_col=}")
print(f"{categorical_col=}")

In [None]:
# Random-forest search space. Every key uses the "model__" prefix so the value
# is routed to the pipeline step named "model". Two candidates per
# hyper-parameter gives 2**6 = 64 combinations.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[10, 20],
    model__min_samples_split=[2, 5],
    model__min_samples_leaf=[1, 2],
    model__max_features=["sqrt", "log2"],
    # Either let sklearn balance the classes or weight class 1 ten times more
    # than class 0.
    model__class_weight=["balanced", {0: 1, 1: 10}],
)

In [None]:
# Wrap a random forest in the project pipeline (get_pipeline is a project
# helper; the later cells access its "preprocessing" and "model" steps).
# The forest is seeded so repeated runs of the notebook produce the same trees
# and therefore the same search result.
pipeline = get_pipeline(
    RandomForestClassifier(random_state=42),
    numeric_features=number_col,
    categorical_features=categorical_col,
)

# Exhaustive 3-fold search over param_grid using all available cores.
# NOTE(review): no `scoring` is given, so candidates are ranked with the
# estimator's default scorer (accuracy for sklearn classifiers). With an
# imbalanced target a metric such as "roc_auc" may be more appropriate --
# confirm what run_training_with_mlflow expects before changing it.
grid = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)

In [None]:
# Tracker for the shared "home_credit_experiment" experiment.
tracker = MLflowTracker("home_credit_experiment")

# Everything the project training helper needs for the random-forest run,
# collected in one place so the call itself stays short.
rf_run_kwargs = dict(
    grid=grid,
    pipeline=pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    tracker=tracker,
    run_name="rf_gridsearch_v1",
    model_name="home_credit_rf",
)

# The helper returns the selected model together with its metrics
# (presumably it fits `grid` and logs the run through `tracker` -- see
# model_utils for the exact behaviour).
best_model, metrics = run_training_with_mlflow(**rf_run_kwargs)


In [None]:
# Predict the hold-out labels with the search object (this assumes `grid` was
# fitted by run_training_with_mlflow in the previous cell -- confirm).
y_pred = grid.predict(X_test)

# Winning hyper-parameter combination.
print("Best parameters:", grid.best_params_, sep="\n")

# Headline hold-out metrics, followed by the per-class breakdown.
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print("Accuracy :", holdout_accuracy)
print("F1 Score :", holdout_f1_weighted)
print(classification_report(y_test, y_pred))

In [None]:
# Raw confusion matrix of the hold-out predictions (true labels vs. predicted
# labels), shown as the cell output.
holdout_confusion = confusion_matrix(y_test, y_pred)
holdout_confusion

In [None]:
# --- Confusion matrix heatmap for the hold-out predictions ---
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
plt.title("Matrice de confusion")
plt.show()

# --- Feature importances of the best estimator found by the search ---
# Pull the fitted model and the fitted preprocessing step out of the pipeline.
model = grid.best_estimator_.named_steps["model"]
preprocessor = grid.best_estimator_.named_steps["preprocessing"]

# Transformed feature names look like "<transformer>__<feature>"; keep only the
# part after the last "__" so the axis labels stay readable.
feature_names = preprocessor.get_feature_names_out()
clean_names = [name.split("__")[-1] for name in feature_names]
importances = pd.Series(model.feature_importances_, index=clean_names)

# One constant drives both the number of bars and the title, so they cannot
# drift apart (the title previously said "Top 15" while 25 bars were drawn).
top_n = 25
importances.sort_values(ascending=False).head(top_n).plot(kind="barh", figsize=(12, 8))
plt.title(f"Top {top_n} Features importantes")
plt.show()

# Logistic Regression Model

In [None]:
# Logistic-regression search space: regularisation strength, solver, iteration
# budget and class weighting, all routed to the pipeline's "model" step.
param_grid = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__solver": ['liblinear', 'saga'],
    "model__max_iter": [200],
    "model__class_weight": ['balanced', {0: 1, 1: 10}]
}

# Both solvers in the grid shuffle the data internally, so the estimator is
# seeded to make the search reproducible across notebook runs.
pipeline = get_pipeline(
    LogisticRegression(random_state=42),
    numeric_features=number_col,
    categorical_features=categorical_col,
)

# 3-fold exhaustive search on all cores (default scorer, as no `scoring` is set).
grid = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)
tracker = MLflowTracker("home_credit_experiment")

# Train and record the run through the project helper; this rebinds
# `best_model` and `metrics` to the logistic-regression results.
best_model, metrics = run_training_with_mlflow(
    grid=grid,
    pipeline=pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    tracker=tracker,
    run_name="logreg_gridsearch_v1",
    model_name="home_credit_logreg"
)

# LightGBM

In [None]:
# LightGBM search space: only the class weighting is tuned; every other
# hyper-parameter keeps the library default.
param_grid = {
    "model__class_weight": ['balanced', {0: 1, 1: 10}]
}

# LGBMClassifier is imported in the notebook's top import cell. The estimator
# is seeded so repeated runs give the same model.
pipeline = get_pipeline(
    LGBMClassifier(random_state=42),
    numeric_features=number_col,
    categorical_features=categorical_col,
)
grid = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)
tracker = MLflowTracker("home_credit_experiment")

# Actually run the search. Previously this cell only built `grid` and `tracker`
# and never trained anything, unlike the random-forest and logistic-regression
# sections. The run/model names follow the same naming pattern as those runs.
best_model, metrics = run_training_with_mlflow(
    grid=grid,
    pipeline=pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    tracker=tracker,
    run_name="lgbm_gridsearch_v1",
    model_name="home_credit_lgbm",
)

# Submit predictions

In [None]:
# Predict with the model returned by the most recent run_training_with_mlflow
# call. `pipeline` itself is only the template handed to GridSearchCV (which
# fits clones of it), and no cell in this notebook fits it directly, so calling
# predict on it would fail.
# NOTE(review): assumes `best_model` is a fitted estimator exposing .predict --
# confirm against model_utils.run_training_with_mlflow.
pred = best_model.predict(test)
# NOTE(review): this writes hard class labels. If the target leaderboard scores
# probabilities (e.g. ROC AUC), use best_model.predict_proba(test)[:, 1] instead.

# One row per test application: identifier plus predicted label.
submission = pd.DataFrame({
    "SK_ID_CURR": test["SK_ID_CURR"],
    "TARGET": pred
})
submission.to_csv("../model/submission.csv", index=False)