In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Name of the label column, followed by the enriched train and test tables
# (read in that order from the model directory).
target = "TARGET"
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../model/application_train_enriched.csv",
        "../model/application_test_enriched.csv",
    )
)

In [3]:
# Separate the label column from the feature columns.
# NOTE(review): identifier columns such as SK_ID_CURR stay in X and are picked
# up as numeric features by the dtype-based column selection — confirm that
# this is intended.
X = train.drop(columns=[target])
y = train[target]

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions identical in both partitions, so the validation metrics are not
# skewed by an unlucky draw of the rarer class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [4]:
# Partition the feature columns by dtype: numeric columns go to the numeric
# branch of the preprocessor, object (string) columns to the categorical one.
number_col = X_train.select_dtypes(include="number").columns
categorical_col = X_train.select_dtypes(include="object").columns

print(f"{number_col=}")
print(f"{categorical_col=}")

number_col=Index(['SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE',
       'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
       ...
       'SK_DPD_DEF_sum', 'SK_DPD_DEF_mean_x', 'SK_DPD_DEF_max_x',
       'SK_DPD_DEF_min', 'CNT_INSTALMENT_sum', 'CNT_INSTALMENT_mean',
       'SK_DPD_max_y', 'SK_DPD_mean_y', 'SK_DPD_DEF_max_y',
       'SK_DPD_DEF_mean_y'],
      dtype='object', length=321)
categorical_col=Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
       'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'HOUSETYPE_MODE',
       'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
      dtype='object')


In [5]:
# Numeric columns: fill missing values with the column mean.
numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'))

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode. handle_unknown="ignore" makes the encoder emit an all-zero
# row for a level it did not see during fit instead of raising, which would
# otherwise break a CV fold or the test-set scoring whenever a rare category
# is absent from the fitting rows.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown="ignore"),
)

# Route each column group through its own branch; columns in neither group
# are dropped (ColumnTransformer default).
preprocessor = make_column_transformer((numerical_pipeline, number_col),
                                       (categorical_pipeline, categorical_col))

In [6]:
# Hyper-parameter candidates for the grid search. Every key is prefixed with
# the pipeline step name of the forest ("randomforestclassifier__").
param_grid = {
    "randomforestclassifier__class_weight": ["balanced", "balanced_subsample"],
    "randomforestclassifier__max_depth": [10, 20],
    "randomforestclassifier__n_estimators": [100, 200],
    # Further candidates, currently disabled:
    #   "randomforestclassifier__min_samples_split": [2, 5],
    #   "randomforestclassifier__min_samples_leaf": [1, 2],
    #   "randomforestclassifier__max_features": ["sqrt", "log2", None],
}

In [None]:
# Full model: the shared preprocessing followed by a random forest. The forest
# is seeded (same seed as the train/validation split) so that repeated runs of
# the notebook produce the same trees and therefore comparable scores.
pipeline = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))

# Exhaustive search over `param_grid` with 3-fold cross-validation on all
# available cores. No `scoring` is given, so candidates are ranked with the
# estimator's default score.
grid = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)

# NOTE(review): GridSearchCV works on clones of `pipeline`, so this fit has no
# influence on the search. It fits `pipeline` (and the `preprocessor` step it
# wraps) in place with the forest's default hyper-parameters, for any later
# cell that reads those objects directly.
pipeline.fit(X_train, y_train)
grid.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the search's refitted best estimator.
y_pred = grid.predict(X_test)

print("Best parameters:", grid.best_params_, sep="\n")

# Headline metrics first, then the per-class precision/recall breakdown.
print("Accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred))
print("F1 Score :", f1_score(y_true=y_test, y_pred=y_pred, average="weighted"))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Raw confusion matrix of the held-out predictions (rows: true class,
# columns: predicted class); displayed as the cell's output.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of the held-out predictions as an annotated heatmap.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
plt.title("Matrice de confusion")
plt.show()

# Take both the forest and the feature names from the refitted best pipeline,
# so the importances and their labels are guaranteed to come from the same
# fitted object.
best_pipeline = grid.best_estimator_
model = best_pipeline.named_steps["randomforestclassifier"]
feature_names = best_pipeline.named_steps["columntransformer"].get_feature_names_out()

# Strip only the leading "<transformer>__" prefix; splitting once keeps column
# names that themselves contain a double underscore intact.
clean_names = [name.split("__", 1)[-1] for name in feature_names]
importances = pd.Series(model.feature_importances_, index=clean_names)

# A single constant drives both the number of bars and the title so the two
# cannot disagree.
top_n = 25
importances.sort_values(ascending=False).head(top_n).plot(kind="barh", figsize=(12, 8))
plt.title(f"Top {top_n} Features importantes")
plt.show()

# Submit predictions

In [None]:
# Score the competition file with the tuned model chosen by the grid search
# (the same model that was evaluated on the held-out split), not with the
# default-parameter pipeline.
# NOTE(review): these are hard class labels. If the submission is scored on a
# ranking metric, positive-class probabilities from
# grid.predict_proba(test)[:, 1] would be needed instead — confirm.
pred = grid.predict(test)

# One row per application: its identifier and the predicted label.
submission = pd.DataFrame({
    "SK_ID_CURR": test["SK_ID_CURR"],
    "TARGET": pred
})
submission.to_csv("../model/submission.csv", index=False)