In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Load the data: read the main application table and the bureau table from CSV.
# NOTE(review): the data directory is a hard-coded, user-specific absolute path that is
# repeated in every load cell; a single configurable base path would make the notebook portable.
application_data = pd.read_csv('/home/machou/openclassroom/mlflow_project/data/application_train.csv')
bureau_data = pd.read_csv('/home/machou/openclassroom/mlflow_project/data/bureau.csv')


In [19]:
# Read the bureau-balance and POS-cash-balance tables from CSV.
# NOTE(review): same hard-coded absolute data directory as the other load cells.
bureau_balance_data = pd.read_csv('/home/machou/openclassroom/mlflow_project/data/bureau_balance.csv')
pos_cash_balance_data = pd.read_csv('/home/machou/openclassroom/mlflow_project/data/POS_CASH_balance.csv')


In [20]:
# Read the credit-card-balance table from CSV (hard-coded absolute path).
credit_card_balance_data = pd.read_csv('/home/machou/openclassroom/mlflow_project/data/credit_card_balance.csv')


In [21]:
# Read the previous-application table from CSV (hard-coded absolute path).
previous_application_data = pd.read_csv('/home/machou/openclassroom/mlflow_project/data/previous_application.csv')


In [22]:
# Read the installments-payments table from CSV (hard-coded absolute path).
installments_payments_data = pd.read_csv('/home/machou/openclassroom/mlflow_project/data/installments_payments.csv')


In [23]:
# Remove fully duplicated rows from every table (the original row index is kept).
# Explicit reassignment is used instead of inplace=True: it yields the same frames,
# has no performance cost, and avoids mutating objects that other cells may have displayed.
application_data = application_data.drop_duplicates()
bureau_data = bureau_data.drop_duplicates()
bureau_balance_data = bureau_balance_data.drop_duplicates()
pos_cash_balance_data = pos_cash_balance_data.drop_duplicates()
credit_card_balance_data = credit_card_balance_data.drop_duplicates()
previous_application_data = previous_application_data.drop_duplicates()
installments_payments_data = installments_payments_data.drop_duplicates()


In [16]:
import pandas as pd

# Identify and one-hot encode the categorical columns of a DataFrame.
def one_hot_encode(df, drop_first=True, dummy_na=False):
    """One-hot encode every ``object``/``category`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    drop_first : bool, default True
        Drop the first level of each encoded column, producing k-1 dummy
        columns for k levels.
    dummy_na : bool, default False
        When True, add a ``<column>_nan`` indicator for each encoded column.
        With the default (False) a missing value produces all-zero dummies,
        which under ``drop_first=True`` cannot be told apart from the dropped
        first level.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the categorical columns are replaced by dummy
        columns (appended after the untouched columns), or ``df`` itself when
        it contains no ``object``/``category`` column.
    """
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    # Nothing to encode: hand the input back untouched.
    if categorical_cols.empty:
        return df
    return pd.get_dummies(
        df,
        columns=list(categorical_cols),
        drop_first=drop_first,
        dummy_na=dummy_na,
    )

# Encode the categorical columns of each table with one_hot_encode.
# NOTE(review): this cell's execution count (16) is lower than that of the load cells (18-23),
# and the recorded data.head() output further down still shows string columns such as
# NAME_CONTRACT_TYPE, so the encoding was apparently not applied in the saved run.
# Restart the kernel and run all cells to confirm the intended state.
# NOTE(review): bureau_balance_data is encoded here but is not used by the aggregation/merge
# cell visible below.
application_data = one_hot_encode(application_data)
bureau_data = one_hot_encode(bureau_data)
bureau_balance_data = one_hot_encode(bureau_balance_data)
pos_cash_balance_data = one_hot_encode(pos_cash_balance_data)
credit_card_balance_data = one_hot_encode(credit_card_balance_data)
previous_application_data = one_hot_encode(previous_application_data)
installments_payments_data = one_hot_encode(installments_payments_data)


In [None]:
Étapes pour la fusion avec groupby :

    Identifier les clés communes :
        Les tables secondaires sont regroupées (groupby) sur la clé client SK_ID_CURR. La table bureau_balance, indexée par SK_ID_BUREAU, n'est pas agrégée dans les cellules qui suivent.

    Appliquer des agrégations :
        Résumer les colonnes des tables secondaires avec des métriques comme mean, sum, count, min, et max.

    Fusionner les résultats agrégés avec la table principale :
        Joindre les tables agrégées à la table principale (application_data) sur les clés communes.

In [24]:
# 1. Aggregate the secondary tables per client (groupby on SK_ID_CURR)

def _aggregate_per_client(table, spec, prefix, key='SK_ID_CURR'):
    """Group ``table`` by ``key``, aggregate it with ``spec`` and prefix the result columns.

    Parameters
    ----------
    table : pandas.DataFrame
        Table containing ``key`` and every column named in ``spec``.
    spec : dict
        Mapping passed to ``DataFrameGroupBy.agg``: column -> function name, or
        column -> list of function names.
    prefix : str
        Prefix for the aggregated column names.
    key : str, default 'SK_ID_CURR'
        Grouping column; it is returned as the first, un-prefixed column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``key`` value. Columns are named
        ``{prefix}_{column}_{FUNC}`` when ``spec`` lists several functions per
        column, and ``{prefix}_{column}`` when it gives a single function.
    """
    aggregated = table.groupby(key).agg(spec)
    if isinstance(aggregated.columns, pd.MultiIndex):
        # Lists of functions produce (column, function) pairs: flatten them.
        aggregated.columns = [
            f"{prefix}_{column}_{func.upper()}" for column, func in aggregated.columns
        ]
    else:
        aggregated.columns = [f"{prefix}_{column}" for column in aggregated.columns]
    return aggregated.reset_index()


# NOTE: pandas' 'sum' skips missing values, so a client whose values are all missing
# gets 0 for that sum rather than NaN.

# Bureau data (several statistics per column, hence the function suffix in the names)
bureau_agg = _aggregate_per_client(
    bureau_data,
    {
        'AMT_CREDIT_SUM': ['sum', 'mean', 'max'],
        'AMT_CREDIT_SUM_DEBT': ['sum', 'mean', 'max'],
        'DAYS_CREDIT': ['mean', 'min'],
    },
    prefix='BUREAU',
)

# POS cash balance
pos_cash_agg = _aggregate_per_client(
    pos_cash_balance_data,
    {
        'SK_DPD': 'sum',
        'SK_DPD_DEF': 'sum',
        'CNT_INSTALMENT': 'mean',
        'CNT_INSTALMENT_FUTURE': 'mean',
    },
    prefix='POS',
)

# Credit card balance
credit_card_agg = _aggregate_per_client(
    credit_card_balance_data,
    {
        'AMT_BALANCE': 'sum',
        'AMT_CREDIT_LIMIT_ACTUAL': 'mean',
        'AMT_DRAWINGS_ATM_CURRENT': 'sum',
        'AMT_PAYMENT_CURRENT': 'sum',
    },
    prefix='CREDIT_CARD',
)

# Previous applications
previous_application_agg = _aggregate_per_client(
    previous_application_data,
    {
        'AMT_APPLICATION': 'sum',
        'AMT_CREDIT': 'sum',
        'CNT_PAYMENT': 'mean',
    },
    prefix='PREVIOUS',
)

# Installments payments
installments_agg = _aggregate_per_client(
    installments_payments_data,
    {
        'AMT_INSTALMENT': 'sum',
        'AMT_PAYMENT': 'sum',
        'NUM_INSTALMENT_NUMBER': 'count',
    },
    prefix='INSTALLMENTS',
)

# 2. Merge every aggregated table into the main table
# merge() returns a new frame, so application_data itself is never modified and no
# explicit copy is needed.
data = application_data
for aggregated_table in (
    bureau_agg,
    pos_cash_agg,
    credit_card_agg,
    previous_application_agg,
    installments_agg,
):
    # validate='many_to_one' makes pandas raise if an aggregated table ever holds
    # duplicate SK_ID_CURR values, which would silently multiply rows.
    data = data.merge(aggregated_table, on='SK_ID_CURR', how='left', validate='many_to_one')

# A left join against unique keys must keep exactly the rows of the main table.
assert len(data) == len(application_data), "merge changed the number of rows"

# 3. Check the final dimensions
print(f"Final dataset shape: {data.shape}")


Final dataset shape: (307511, 144)


In [25]:
# Preview the first rows with the notebook's rich DataFrame display instead of print().
data.head()

   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    135000.0       6750.0   
3               Y             0          135000.0    312682.5      29686.5   
4               Y             0          121500.0    513000.0      21865.5   

   ...  CREDIT_CARD_AMT_BALANCE CREDIT_CARD_AMT_CREDIT_LIMIT_ACTUAL  \
0  ...                      NaN              

In [26]:
# 4. Save the final dataset to a CSV file (the row index is not written).
# NOTE(review): the output location is a hard-coded, user-specific absolute path.
output_path = "/home/machou/openclassroom/mlflow_project/data/data_preprocessed_train.csv"
data.to_csv(output_path, index=False)
print(f"Dataset saved to {output_path}")

Dataset saved to /home/machou/openclassroom/mlflow_project/data/data_preprocessed_train.csv
