Below is a summary of the data schema:
![image](https://storage.googleapis.com/kaggle-media/competitions/home-credit/home_credit.png)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the main application table; every later table is joined onto this frame.
train_data_df = pd.read_csv(filepath_or_buffer='../input/application_train.csv')

# Label Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
def label_encoder(dfname):
    """Label-encode, in place, each object column with at most two distinct values.

    The matching columns are overwritten inside ``dfname`` itself and nothing
    is returned. Distinct values are taken from ``Series.unique()``, so a
    missing value counts towards the limit of two. Object columns with more
    distinct values, and all non-object columns, are left untouched. The
    number of encoded columns is printed once the pass is finished.
    """
    encoder = LabelEncoder()
    encoded_count = 0
    for name in dfname:
        series = dfname[name]
        # Only object-typed columns are candidates.
        if series.dtype != 'object':
            continue
        # Columns with more than two distinct values are skipped here.
        if len(series.unique()) > 2:
            continue
        dfname[name] = encoder.fit(series).transform(series)
        encoded_count += 1
    print('{} Columns encoded'.format(encoded_count))

In [6]:
# Object columns with at most two distinct values are label-encoded in place;
# whatever categorical columns remain are then one-hot encoded.
label_encoder(dfname=train_data_df)
train_data_df = pd.get_dummies(data=train_data_df)

3 Columns encoded


# For each table, count the number of entries per loan and average all entries to create one row per loan
1. Count the number of entries in the new table (whose primary key is `SK_ID_PREV`) for each value of `SK_ID_CURR`, the foreign key referring to a loan.
2. Take the remaining columns and average them over all entries for that loan.
3. Prepend a table identifier to the column names to prevent name clashes.
4. Join the aggregated table with the main table.

# Previous loan applications data

In [7]:
# Load the previous-applications table, label-encode its object columns that
# have at most two distinct values, then one-hot encode the remaining ones.
prev_applications_df = pd.read_csv(filepath_or_buffer='../input/previous_application.csv')
label_encoder(dfname=prev_applications_df)
prev_applications_df = pd.get_dummies(data=prev_applications_df)

1 Columns encoded


In [8]:
# Non-null SK_ID_PREV entries per SK_ID_CURR in the previous-applications table.
previous_apps_count = (
    prev_applications_df[['SK_ID_CURR', 'SK_ID_PREV']]
    .groupby('SK_ID_CURR')
    .count()
)
# Overwrite SK_ID_PREV with that per-loan count; it is constant within each
# group, so the mean below passes it through unchanged.
prev_applications_df['SK_ID_PREV'] = prev_applications_df['SK_ID_CURR'].map(
    previous_apps_count['SK_ID_PREV']
)
# Collapse to one row per SK_ID_CURR and tag the columns with the table name.
# NOTE(review): this prefix has no trailing underscore, unlike the prefixes
# used for the other tables; it is kept as-is so column names do not change.
previous_apps_mean = (
    prev_applications_df
    .groupby('SK_ID_CURR')
    .mean()
    .add_prefix('prev_apps')
)
# Left join so every row of the main table is kept.
train_data_df = pd.merge(
    train_data_df,
    previous_apps_mean.reset_index(),
    how='left',
    on='SK_ID_CURR',
)

# Previous loan payments data

In [9]:
# Load the installment-payments table, label-encode its object columns that
# have at most two distinct values, then one-hot encode the remaining ones.
installments_df = pd.read_csv(filepath_or_buffer='../input/installments_payments.csv')
label_encoder(dfname=installments_df)
installments_df = pd.get_dummies(data=installments_df)

0 Columns encoded


In [10]:
# Non-null SK_ID_PREV entries per SK_ID_CURR in the installments table.
installments_count = (
    installments_df[['SK_ID_CURR', 'SK_ID_PREV']]
    .groupby('SK_ID_CURR')
    .count()
)
# Overwrite SK_ID_PREV with that per-loan count; it is constant within each
# group, so the mean below passes it through unchanged.
installments_df['SK_ID_PREV'] = installments_df['SK_ID_CURR'].map(
    installments_count['SK_ID_PREV']
)
# Collapse to one row per SK_ID_CURR and tag the columns with the table name.
installments_mean = (
    installments_df
    .groupby('SK_ID_CURR')
    .mean()
    .add_prefix('installment_')
)
# Left join so every row of the main table is kept.
train_data_df = pd.merge(
    train_data_df,
    installments_mean.reset_index(),
    how='left',
    on='SK_ID_CURR',
)

# Monthly balance data of loans previously held

In [11]:
# Load the POS_CASH monthly-balance table, label-encode its object columns that
# have at most two distinct values, then one-hot encode the remaining ones.
previous_loans_df = pd.read_csv(filepath_or_buffer='../input/POS_CASH_balance.csv')
label_encoder(dfname=previous_loans_df)
previous_loans_df = pd.get_dummies(data=previous_loans_df)

0 Columns encoded


In [12]:
# Non-null SK_ID_PREV entries per SK_ID_CURR in the POS_CASH balance table.
previous_loans_count = previous_loans_df[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
# Overwrite SK_ID_PREV with this table's own per-loan count so the mean below
# passes it through as a feature. The lookup must use previous_loans_count:
# mapping through previous_apps_count (the previous-applications counts) would
# duplicate that table's count feature and leave previous_loans_count unused.
previous_loans_df['SK_ID_PREV'] = previous_loans_df['SK_ID_CURR'].map(previous_loans_count['SK_ID_PREV'])
# Collapse to one row per SK_ID_CURR by averaging every remaining column.
previous_loans_mean = previous_loans_df.groupby('SK_ID_CURR').mean()
# Tag the columns with the table name to avoid clashes after the join.
previous_loans_mean.columns = ['prev_loans_' + col for col in previous_loans_mean.columns]
# Left join so every row of the main table is kept.
train_data_df = train_data_df.merge(right=previous_loans_mean.reset_index(), how='left', on='SK_ID_CURR')

# Credit card balance data

In [13]:
# Load the credit-card balance table, label-encode its object columns that
# have at most two distinct values, then one-hot encode the remaining ones.
credit_card_df = pd.read_csv(filepath_or_buffer='../input/credit_card_balance.csv')
label_encoder(dfname=credit_card_df)
credit_card_df = pd.get_dummies(data=credit_card_df)

0 Columns encoded


In [14]:
# Non-null SK_ID_PREV entries per SK_ID_CURR in the credit-card balance table.
credit_card_count = (
    credit_card_df[['SK_ID_CURR', 'SK_ID_PREV']]
    .groupby('SK_ID_CURR')
    .count()
)
# Overwrite SK_ID_PREV with that per-loan count; it is constant within each
# group, so the mean below passes it through unchanged.
credit_card_df['SK_ID_PREV'] = credit_card_df['SK_ID_CURR'].map(
    credit_card_count['SK_ID_PREV']
)
# Collapse to one row per SK_ID_CURR and tag the columns with the table name.
credit_card_mean = (
    credit_card_df
    .groupby('SK_ID_CURR')
    .mean()
    .add_prefix('card_')
)
# Left join so every row of the main table is kept.
train_data_df = pd.merge(
    train_data_df,
    credit_card_mean.reset_index(),
    how='left',
    on='SK_ID_CURR',
)

# Applicant's credit bureau history

In [15]:
# Load the credit-bureau table, label-encode its object columns that have at
# most two distinct values, then one-hot encode the remaining ones.
bureau_history_df = pd.read_csv(filepath_or_buffer='../input/bureau.csv')
label_encoder(dfname=bureau_history_df)
bureau_history_df = pd.get_dummies(data=bureau_history_df)

0 Columns encoded


In [16]:
# Non-null SK_ID_BUREAU entries per SK_ID_CURR in the bureau table.
bureau_history_count = (
    bureau_history_df[['SK_ID_CURR', 'SK_ID_BUREAU']]
    .groupby('SK_ID_CURR')
    .count()
)
# Overwrite SK_ID_BUREAU with that per-loan count; it is constant within each
# group, so the mean below passes it through unchanged.
bureau_history_df['SK_ID_BUREAU'] = bureau_history_df['SK_ID_CURR'].map(
    bureau_history_count['SK_ID_BUREAU']
)
# Collapse to one row per SK_ID_CURR and tag the columns with the table name.
bureau_history_mean = (
    bureau_history_df
    .groupby('SK_ID_CURR')
    .mean()
    .add_prefix('bureau_')
)
# Left join so every row of the main table is kept.
train_data_df = pd.merge(
    train_data_df,
    bureau_history_mean.reset_index(),
    how='left',
    on='SK_ID_CURR',
)

# Imputing Missing Values

In [17]:
from sklearn.impute import SimpleImputer

In [18]:
# Split the label column from the feature columns.
Y = train_data_df['TARGET']
X = train_data_df.drop(['TARGET'], axis=1)
# Remember the feature names: the imputer output is wrapped in a fresh
# DataFrame below and the names are re-attached afterwards.
list_of_features = X.columns

# SimpleImputer with its default settings.
imputer = SimpleImputer()

imputed_values = imputer.fit_transform(X)
X = pd.DataFrame(imputed_values)
X.columns = list_of_features

In [19]:
# Persist the TARGET column taken from the merged training frame.
final_target_path = '../input/final_target.csv'
Y.to_csv(final_target_path)

In [20]:
# Persist the imputed feature matrix.
X.to_csv(path_or_buf='../input/final_data.csv')