In [1]:
import pandas as pd

# Load application_train
df = pd.read_csv('application_train.csv')

# Load installments_payments
installments = pd.read_csv('installments_payments.csv')

print("Ukuran data installments:", installments.shape)
print(installments[['SK_ID_CURR', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'AMT_INSTALMENT', 'AMT_PAYMENT']].head())

Ukuran data installments: (13605401, 8)
   SK_ID_CURR  NUM_INSTALMENT_NUMBER  DAYS_INSTALMENT  AMT_INSTALMENT  \
0      161674                      6          -1180.0        6948.360   
1      151639                     34          -2156.0        1716.525   
2      193053                      1            -63.0       25425.000   
3      199697                      3          -2418.0       24350.130   
4      167756                      2          -1383.0        2165.040   

   AMT_PAYMENT  
0     6948.360  
1     1716.525  
2    25425.000  
3    24350.130  
4     2160.585  


In [3]:
# Hitung keterlambatan
installments['PAYMENT_DELAY'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']

# Selisih pembayaran (positif jika underpayment)
installments['PAYMENT_DIFF'] = installments['AMT_INSTALMENT'] - installments['AMT_PAYMENT']

In [4]:
inst_agg = installments.groupby('SK_ID_CURR').agg({
    'PAYMENT_DELAY': ['mean', 'max'],
    'PAYMENT_DIFF': ['mean', 'sum'],
    'AMT_PAYMENT': ['mean', 'sum'],
    'AMT_INSTALMENT': ['mean', 'sum']
})

# Rename kolom multi-index
inst_agg.columns = ['INST_' + '_'.join(col).upper() for col in inst_agg.columns]
inst_agg = inst_agg.reset_index()

In [5]:
df = df.merge(inst_agg, on='SK_ID_CURR', how='left')

In [6]:
correlation = df.corr(numeric_only=True)['TARGET'].sort_values(ascending=False)
print("Fitur dari installments_payments.csv yang paling berkorelasi dengan TARGET:")
print(correlation.head(10))

Fitur dari installments_payments.csv yang paling berkorelasi dengan TARGET:
TARGET                         1.000000
DAYS_BIRTH                     0.078239
REGION_RATING_CLIENT_W_CITY    0.060893
REGION_RATING_CLIENT           0.058899
DAYS_LAST_PHONE_CHANGE         0.055218
DAYS_ID_PUBLISH                0.051457
REG_CITY_NOT_WORK_CITY         0.050994
FLAG_EMP_PHONE                 0.045982
REG_CITY_NOT_LIVE_CITY         0.044395
FLAG_DOCUMENT_3                0.044346
Name: TARGET, dtype: float64


In [7]:
df.to_csv('df_merged_with_installments.csv', index=False)