In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import warnings
# Silence every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this blanket filter also hides deprecation notices raised by the
# imported libraries; narrow it to specific categories if those matter.
warnings.filterwarnings('ignore')

In [2]:
# Load the training applications table from the local homecredit/ folder.
df_ap_train = pd.read_csv('homecredit/application_train.csv')
# Bare expression as the cell's last line, so the notebook renders the frame.
df_ap_train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Count the distinct non-null values of every object-dtype column in the training frame.
unique = (
    df_ap_train
    .select_dtypes('object')
    .nunique()
    .to_frame('Jumlah Unique Kategori')
)

def get_unique_values(col, df=None):
    """Return the distinct values of one column as a plain Python list.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    df : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df_ap_train`` is used, which is what the one-argument form of this
        helper has always done.

    Returns
    -------
    list
        Values in order of first appearance. Missing values are kept
        (``Series.unique`` does not drop NaN), so the list can be one element
        longer than a ``nunique()`` count of the same column.
    """
    # Resolve the global lazily so the helper can also be pointed at another
    # frame instead of being copy-pasted once per DataFrame.
    frame = df_ap_train if df is None else df
    return frame[col].unique().tolist()

# Attach, next to each count, the list of values actually found in that column.
unique = unique.assign(Kategori=unique.index.map(get_unique_values))

# Leave the summary as the cell's last expression so it is rendered.
unique

Unnamed: 0,Jumlah Unique Kategori,Kategori
NAME_CONTRACT_TYPE,2,"[Cash loans, Revolving loans]"
CODE_GENDER,3,"[M, F, XNA]"
FLAG_OWN_CAR,2,"[N, Y]"
FLAG_OWN_REALTY,2,"[Y, N]"
NAME_TYPE_SUITE,7,"[Unaccompanied, Family, Spouse, partner, Child..."
NAME_INCOME_TYPE,8,"[Working, State servant, Commercial associate,..."
NAME_EDUCATION_TYPE,5,"[Secondary / secondary special, Higher educati..."
NAME_FAMILY_STATUS,6,"[Single / not married, Married, Civil marriage..."
NAME_HOUSING_TYPE,6,"[House / apartment, Rented apartment, With par..."
OCCUPATION_TYPE,18,"[Laborers, Core staff, Accountants, Managers, ..."


In [4]:
# Treat the 'XNA' marker as a missing value (np.nan) in the two columns below.
for xna_col in ('CODE_GENDER', 'ORGANIZATION_TYPE'):
    df_ap_train[xna_col] = df_ap_train[xna_col].replace('XNA', np.nan)

# **Data Test**
---
* This is the main table, broken into two files for Train (with TARGET) and Test (without TARGET).
* Static data for all applications. One row represents one loan in our data sample.
---
**Semua proses preprocessing yang dilakukan di data train akan dilakukan juga di data test**

In [5]:
# Load the test applications table (same layout as train, but without TARGET).
df_ap_test = pd.read_csv('homecredit/application_test.csv')
# Bare expression as the cell's last line, so the notebook renders the frame.
df_ap_test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,0,,,,,,
48741,456223,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


In [6]:
# Count the distinct non-null values of every object-dtype column in the test frame.
unique = (
    df_ap_test
    .select_dtypes('object')
    .nunique()
    .to_frame('Jumlah Unique Kategori')
)

def get_unique_values(col, df=None):
    """Return the distinct values of one column as a plain Python list.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    df : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df_ap_test`` is used, which is what the one-argument form defined in
        this cell has always done.

    Returns
    -------
    list
        Values in order of first appearance. Missing values are kept
        (``Series.unique`` does not drop NaN), so the list can be one element
        longer than a ``nunique()`` count of the same column.
    """
    # Resolve the global lazily so the helper can be reused on any frame rather
    # than being redefined for each DataFrame.
    frame = df_ap_test if df is None else df
    return frame[col].unique().tolist()

# Attach, next to each count, the list of values actually found in that column.
unique = unique.assign(Kategori=unique.index.map(get_unique_values))

# Leave the summary as the cell's last expression so it is rendered.
unique

Unnamed: 0,Jumlah Unique Kategori,Kategori
NAME_CONTRACT_TYPE,2,"[Cash loans, Revolving loans]"
CODE_GENDER,2,"[F, M]"
FLAG_OWN_CAR,2,"[N, Y]"
FLAG_OWN_REALTY,2,"[Y, N]"
NAME_TYPE_SUITE,7,"[Unaccompanied, nan, Family, Spouse, partner, ..."
NAME_INCOME_TYPE,7,"[Working, State servant, Pensioner, Commercial..."
NAME_EDUCATION_TYPE,5,"[Higher education, Secondary / secondary speci..."
NAME_FAMILY_STATUS,5,"[Married, Single / not married, Civil marriage..."
NAME_HOUSING_TYPE,6,"[House / apartment, With parents, Rented apart..."
OCCUPATION_TYPE,18,"[nan, Low-skill Laborers, Drivers, Sales staff..."


In [7]:
# Treat the 'XNA' marker as missing (np.nan) in the same two columns that are
# cleaned for the training data, so both frames go through identical
# preprocessing. Where a column contains no 'XNA', replace() leaves it unchanged.
for xna_col in ('CODE_GENDER', 'ORGANIZATION_TYPE'):
    df_ap_test[xna_col] = df_ap_test[xna_col].replace('XNA', np.nan)

# Merge Train & Test

In [8]:
# Label-encode every object column of the training frame that has at most two
# distinct values. Missing values count as a value here, so a two-category
# column that also contains NaN is not encoded by this cell.
label_encoders = {}  # column name -> fitted LabelEncoder, kept so codes can be decoded later
le = LabelEncoder()  # stays bound (unfitted) even when no column qualifies
le_count = 0

for col in df_ap_train.columns:
    is_object = df_ap_train[col].dtype == 'object'
    if not is_object or df_ap_train[col].nunique(dropna=False) > 2:
        continue

    # Use a fresh encoder per column: refitting one shared instance would
    # discard the category -> code mapping of every earlier column.
    le = LabelEncoder()
    # Learn the mapping from the training data only ...
    le.fit(df_ap_train[col])
    # ... and apply that same mapping to both frames so the codes agree.
    df_ap_train[col] = le.transform(df_ap_train[col])
    df_ap_test[col] = le.transform(df_ap_test[col])

    # Keep track of the fitted encoder and of how many columns were encoded.
    label_encoders[col] = le
    le_count += 1

print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [9]:
# One-hot encode the categorical columns of each frame independently; the two
# resulting column sets are therefore not guaranteed to match.
df_ap_train, df_ap_test = pd.get_dummies(df_ap_train), pd.get_dummies(df_ap_test)

# Report the (rows, columns) of each encoded frame.
print('Training Features shape: ', df_ap_train.shape)
print('Testing Features shape: ', df_ap_test.shape)

Training Features shape:  (307511, 241)
Testing Features shape:  (48744, 238)


In [10]:
# Set the target aside first: an inner alignment keeps only columns present in
# both frames, so any column the test frame lacks is dropped from train.
train_labels = df_ap_train['TARGET']

# Keep only the columns that exist in both frames.
df_ap_train, df_ap_test = df_ap_train.align(df_ap_test, axis=1, join='inner')

# Put the target back on the training frame.
df_ap_train = df_ap_train.assign(TARGET=train_labels)

print('Training Features shape: ', df_ap_train.shape)
print('Testing Features shape: ', df_ap_test.shape)

Training Features shape:  (307511, 239)
Testing Features shape:  (48744, 238)


In [11]:
# Stack the training rows on top of the test rows under one fresh 0..n-1 index.
# A column present in only one frame is filled with NaN for the other frame's rows.
df_ap_combined = pd.concat(objs=[df_ap_train, df_ap_test], axis=0, ignore_index=True)

In [12]:
# Render the combined frame (bare expression as the cell's last line).
df_ap_combined

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0,0,0,0,0,1,0,1,0,1.0
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,1,0,0,0,0,0,0,1,0,0.0
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0,0,0,0,0,0,0,0,0,0.0
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0,0,0,0,0,0,0,0,0,0.0
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356250,456221,0,0,1,0,121500.0,412560.0,17473.5,270000.0,0.002042,...,0,0,0,0,0,0,0,0,0,
356251,456222,0,0,0,2,157500.0,622413.0,31909.5,495000.0,0.035792,...,0,0,0,0,0,0,0,0,0,
356252,456223,0,1,1,1,202500.0,315000.0,33205.5,315000.0,0.026392,...,0,0,0,0,0,1,0,1,0,
356253,456224,0,0,0,0,225000.0,450000.0,25128.0,450000.0,0.018850,...,0,0,0,0,1,0,0,1,0,


In [13]:
# Write the combined frame to train_test.csv (relative path); index=False
# leaves the row index out of the file.
df_ap_combined.to_csv('train_test.csv', index=False)

In [14]:
# Read the exported file back in, presumably as a check that the export round-trips.
read = pd.read_csv('train_test.csv')
# Bare expression as the cell's last line, so the notebook renders the frame.
read

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET
0,100002,0,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0,0,0,0,0,1,0,1,0,1.0
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,1,0,0,0,0,0,0,1,0,0.0
2,100004,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0,0,0,0,0,0,0,0,0,0.0
3,100006,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0,0,0,0,0,0,0,0,0,0.0
4,100007,0,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356250,456221,0,0,1,0,121500.0,412560.0,17473.5,270000.0,0.002042,...,0,0,0,0,0,0,0,0,0,
356251,456222,0,0,0,2,157500.0,622413.0,31909.5,495000.0,0.035792,...,0,0,0,0,0,0,0,0,0,
356252,456223,0,1,1,1,202500.0,315000.0,33205.5,315000.0,0.026392,...,0,0,0,0,0,1,0,1,0,
356253,456224,0,0,0,0,225000.0,450000.0,25128.0,450000.0,0.018850,...,0,0,0,0,1,0,0,1,0,
