In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report, precision_recall_curve
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [2]:
# Load additional files
bureau = pd.read_csv("/content/drive/MyDrive/home-credit-default-risk/bureau.csv")
bureau_balance = pd.read_csv("/content/drive/MyDrive/home-credit-default-risk/bureau_balance.csv")
previous_application = pd.read_csv("/content/drive/MyDrive/home-credit-default-risk/previous_application.csv")
pos_cash = pd.read_csv("/content/drive/MyDrive/home-credit-default-risk/POS_CASH_balance.csv")
installments_payments = pd.read_csv("/content/drive/MyDrive/home-credit-default-risk/installments_payments.csv")
credit_card_balance = pd.read_csv("/content/drive/MyDrive/home-credit-default-risk/credit_card_balance.csv")

In [3]:
# Feature Engineering - Agréger les données

# ---- Bureau Balance: details of previous credits (détails des crédits précédents) ----
bureau_balance_agg = bureau_balance.groupby("SK_ID_BUREAU")["STATUS"].value_counts().unstack().fillna(0)
bureau_balance_agg.columns = ["STATUS_" + str(col) for col in bureau_balance_agg.columns]
bureau = bureau.merge(bureau_balance_agg, on="SK_ID_BUREAU", how="left")

# ---- Bureau: credit history of customers in other banks (historique de crédit des clients dans d'autres banques) ----
bureau_agg = bureau.groupby("SK_ID_CURR").agg({
    "DAYS_CREDIT": ["mean", "min", "max"],  # Seniority of credits (Ancienneté des crédits)
    "CREDIT_DAY_OVERDUE": ["sum"],  # Total number of days late (Nombre total de jours en retard)
    "AMT_CREDIT_SUM": ["sum", "mean"],  # Total amount of credit (Montant total du crédit)
    "AMT_CREDIT_SUM_DEBT": ["sum", "mean"],  # Total amount of debt (Montant total de la dette)
}).reset_index()
bureau_agg.columns = ['_'.join(col) for col in bureau_agg.columns]  # Rename columns (Renommer les colonnes)
bureau_agg.rename(columns={"SK_ID_CURR_": "SK_ID_CURR"}, inplace=True)

# ---- Previous Application: past credit requests (demandes de crédits passées) ----
previous_application_agg = previous_application.groupby("SK_ID_CURR").agg({
    "AMT_APPLICATION": ["mean", "max"],  # Average/maximum amount requested (Montant moyen/maximal demandé)
    "AMT_CREDIT": ["mean", "max"],  # Average/maximum amount granted (Montant moyen/maximal accordé)
    "NAME_CONTRACT_STATUS": ["nunique"]  # Number of different status (Nombre de statuts différents)
}).reset_index()
previous_application_agg.columns = ['_'.join(col) for col in previous_application_agg.columns]
previous_application_agg.rename(columns={"SK_ID_CURR_": "SK_ID_CURR"}, inplace=True)

# ---- POS_CASH_balance: credit cards and revolving loans (cartes de crédit et prêts renouvelables) ----
pos_cash_agg = pos_cash.groupby("SK_ID_CURR").agg({
    "MONTHS_BALANCE": ["min", "max"],  # Age of transactions (Ancienneté des transactions)
    "CNT_INSTALMENT": ["sum", "mean"],  # Total/average number of installments (Nombre total/moyen d'échéances)
    "SK_DPD": ["sum"]  # Cumulative delays (Retards cumulés)
}).reset_index()
pos_cash_agg.columns = ['_'.join(col) for col in pos_cash_agg.columns]
pos_cash_agg.rename(columns={"SK_ID_CURR_": "SK_ID_CURR"}, inplace=True)

# ---- Installments Payments (historique des paiements) ----
installments_payments["PAYMENT_DIFF"] = installments_payments["DAYS_ENTRY_PAYMENT"] - installments_payments["DAYS_INSTALMENT"]
installments_agg = installments_payments.groupby("SK_ID_CURR").agg({
    "PAYMENT_DIFF": ["mean", "sum"],  # Average and cumulative delays (Retards moyens et cumulés)
    "AMT_PAYMENT": ["sum", "mean"]  # Sum of payments (Somme des paiements)
}).reset_index()
installments_agg.columns = ['_'.join(col) for col in installments_agg.columns]
installments_agg.rename(columns={"SK_ID_CURR_": "SK_ID_CURR"}, inplace=True)

# ---- Credit Card Balance (historique des cartes de crédit) ----
credit_card_agg = credit_card_balance.groupby("SK_ID_CURR").agg({
    "AMT_BALANCE": ["mean", "max"],  # Average/maximum balance (Solde moyen/maximal)
    "AMT_CREDIT_LIMIT_ACTUAL": ["mean"],  # Average credit limit (Limite de crédit moyenne)
    "SK_DPD": ["sum"],  # Cumulative delays (Retards cumulés)
}).reset_index()
credit_card_agg.columns = ['_'.join(col) for col in credit_card_agg.columns]
credit_card_agg.rename(columns={"SK_ID_CURR_": "SK_ID_CURR"}, inplace=True)

In [4]:
# Merger of new features with the main base
train_df = pd.read_csv("/content/drive/MyDrive/home-credit-default-risk/application_train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/home-credit-default-risk/application_test.csv")

# Merging with aggregated datasets
for dataset in [bureau_agg, previous_application_agg, pos_cash_agg, installments_agg, credit_card_agg]:
    train_df = train_df.merge(dataset, on="SK_ID_CURR", how="left")
    test_df = test_df.merge(dataset, on="SK_ID_CURR", how="left")

In [6]:
#Preprocessing and normalization
from sklearn.preprocessing import LabelEncoder
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

X = train_df.drop(columns=["SK_ID_CURR", "TARGET"])
y = train_df["TARGET"]
# Ensure all categorical columns are in string
for col in X.select_dtypes(include=['object']).columns:
   X[col] = X[col].astype(str)

# Encode all categorical columns
label_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

# Normalization of numerical values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
# Managing class imbalance with SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [8]:
# Model training (LightGBM)
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

model = lgb.LGBMClassifier(objective='binary', metric='auc', random_state=42, n_estimators=1000)
model.fit(X_train, y_train)



[LightGBM] [Info] Number of positive: 113044, number of negative: 226179
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.331811 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31119
[LightGBM] [Info] Number of data points in the train set: 339223, number of used features: 143
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333244 -> initscore=-0.693550
[LightGBM] [Info] Start training from score -0.693550


In [9]:
# Prédictions and évaluation
y_pred_proba = model.predict_proba(X_val)[:, 1]
optimal_threshold = 0.155  # Threshold based on F1-score
y_pred = (y_pred_proba >= optimal_threshold).astype(int)

print("AUC:", roc_auc_score(y_val, y_pred_proba))
print("F1-score:", f1_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))



AUC: 0.9602628903321362
F1-score: 0.8487752750830676
Confusion matrix:
 [[50332  6175]
 [ 2882 25417]]


In [11]:
# Final Predictions
# Check remaining categorical columns
categorical_cols = test_df.select_dtypes(include=['object']).columns
# Apply encoding avoiding errors due to new values
for col in categorical_cols:
    if col in label_encoders:  # Check if encoding was done on train_df
        # Replace unknown values ​​with -1 before transformation
        test_df[col] = test_df[col].apply(lambda x: x if x in label_encoders[col].classes_ else "UNKNOWN")
        # Add "UNKNOWN" to known classes and transform
        label_encoders[col].classes_ = np.append(label_encoders[col].classes_, "UNKNOWN")
        test_df[col] = label_encoders[col].transform(test_df[col])
    else:
        test_df[col] = label_encoders.fit_transform(test_df[col])  # If it is a new column, we encode it

# Transform data with the same scaler
test_scaled = scaler.transform(test_df.drop(columns=["SK_ID_CURR"]))

test_predictions = model.predict_proba(test_scaled)[:, 1]

submission = pd.DataFrame({"SK_ID_CURR": test_df["SK_ID_CURR"], "TARGET": test_predictions})
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv file ✅")



Saved submission.csv file ✅
