In [1]:
import pandas as pd
import numpy as np

# Load datasets
bureau = pd.read_csv('bureau.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')
pos_cash_balance = pd.read_csv('POS_CASH_balance.csv')
credit_card_balance = pd.read_csv('credit_card_balance.csv')
previous_application = pd.read_csv('previous_application.csv')
installments_payments = pd.read_csv('installments_payments.csv')

In [3]:
# Aggregate bureau_balance to summarize SK_BUREAU_ID
bureau_balance_agg = bureau_balance.groupby('SK_ID_BUREAU').agg({
    'MONTHS_BALANCE': ['min', 'max', 'mean'],
    'STATUS': lambda x: x.mode()[0] if not x.mode().empty else np.nan
}).reset_index()

bureau_balance_agg.columns = ['SK_ID_BUREAU'] + ['BUREAU_BAL_' + col[1].upper() for col in bureau_balance_agg.columns[1:]]

# Merge with bureau
bureau = bureau.merge(bureau_balance_agg, on='SK_ID_BUREAU', how='left')

# Aggregate bureau by SK_ID_CURR
bureau_agg = bureau.groupby('SK_ID_CURR').agg({
    'CREDIT_ACTIVE': lambda x: x.mode()[0] if not x.mode().empty else np.nan,
    'AMT_CREDIT_SUM': ['sum', 'mean'],
    'AMT_CREDIT_SUM_DEBT': ['sum', 'mean'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean']
}).reset_index()

bureau_agg.columns = ['SK_ID_CURR'] + ['BUREAU_' + col[1].upper() for col in bureau_agg.columns[1:]]

In [4]:
pos_cash_agg = pos_cash_balance.groupby('SK_ID_CURR').agg({
    'CNT_INSTALMENT_FUTURE': ['min', 'max', 'mean'],
    'SK_DPD': ['max', 'mean']
}).reset_index()

pos_cash_agg.columns = ['SK_ID_CURR'] + ['POS_' + col[1].upper() for col in pos_cash_agg.columns[1:]]

In [5]:
credit_card_agg = credit_card_balance.groupby('SK_ID_CURR').agg({
    'AMT_BALANCE': ['mean', 'max'],
    'AMT_PAYMENT_CURRENT': ['sum', 'mean'],
    'SK_DPD': ['max', 'mean']
}).reset_index()

credit_card_agg.columns = ['SK_ID_CURR'] + ['CC_' + col[1].upper() for col in credit_card_agg.columns[1:]]

In [6]:
prev_app_agg = previous_application.groupby('SK_ID_CURR').agg({
    'AMT_ANNUITY': ['mean', 'max'],
    'AMT_CREDIT': ['mean', 'max'],
    'AMT_DOWN_PAYMENT': ['sum', 'mean'],
    'RATE_DOWN_PAYMENT': ['mean'],
    'DAYS_DECISION': ['max', 'min']
}).reset_index()

prev_app_agg.columns = ['SK_ID_CURR'] + ['PREV_' + col[1].upper() for col in prev_app_agg.columns[1:]]

In [7]:
installments_agg = installments_payments.groupby('SK_ID_CURR').agg({
    'AMT_PAYMENT': ['sum', 'mean'],
    'AMT_INSTALMENT': ['sum', 'mean'],
    'DAYS_ENTRY_PAYMENT': ['max', 'min']
}).reset_index()

installments_agg.columns = ['SK_ID_CURR'] + ['INST_' + col[1].upper() for col in installments_agg.columns[1:]]

In [8]:
# Start with bureau_agg and progressively merge other datasets
supporting_master_data = bureau_agg.merge(
    pos_cash_agg, on='SK_ID_CURR', how='left'
).merge(
    credit_card_agg, on='SK_ID_CURR', how='left'
).merge(
    prev_app_agg, on='SK_ID_CURR', how='left'
).merge(
    installments_agg, on='SK_ID_CURR', how='left'
)

In [9]:
# Save the supporting master data
supporting_master_data.to_csv('supporting_master_data.csv', index=False)

In [17]:
# Check for duplicates
num_duplicates = supporting_master_data.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

# Check for missing values (exact count and percentage)
missing_values = supporting_master_data.isnull().sum()
missing_percent = (missing_values / len(supporting_master_data)) * 100

missing_df = pd.DataFrame({
    'Feature': supporting_master_data.columns,
    'Missing Count': missing_values,
    'Missing Percentage': missing_percent
}).reset_index(drop=True)

print("\nMissing Values Summary:")
print(missing_df.sort_values(by='Missing Percentage', ascending=False))

Number of duplicate rows: 0

Missing Values Summary:
            Feature  Missing Count  Missing Percentage
16          CC_MEAN         242922           79.435338
17           CC_MAX         215881           70.592948
18          CC_MEAN         215881           70.592948
15           CC_SUM         215881           70.592948
14           CC_MAX         215881           70.592948
13          CC_MEAN         215881           70.592948
25        PREV_MEAN          31577           10.325659
24        PREV_MEAN          31577           10.325659
8           POS_MIN          16005            5.233625
9           POS_MAX          16005            5.233625
10         POS_MEAN          16005            5.233625
12         POS_MEAN          15983            5.226431
11          POS_MAX          15983            5.226431
19        PREV_MEAN          15160            4.957310
20         PREV_MAX          15160            4.957310
22         PREV_MAX          14720            4.813431
26         P