In [None]:
import pandas as pd

In [None]:
# Load the previously processed training split (path is relative to the
# notebook's working directory).
application_train = pd.read_csv('./home-credit-train.csv')
# Display (rows, columns) as a quick check on what was loaded.
application_train.shape

In [None]:
# Load the validation split that accompanies the training split.
application_valid = pd.read_csv('./home-credit-valid.csv')
# Display (rows, columns) as a quick check on what was loaded.
application_valid.shape

In [None]:
# Stack the training and validation splits row-wise into one application frame.
# Both splits were read with pandas' default 0..n-1 index, so ignore_index=True
# is used to give the combined frame unique row labels; otherwise any later
# label-based alignment on `application` could pair up the wrong rows.
application = pd.concat([application_train, application_valid], axis=0, ignore_index=True)

In [None]:
# Remove the leftover 'Unnamed: 0' column carried over from the CSV files.
# The column must be targeted explicitly (columns=...): a bare `labels=` drop
# defaults to axis=0 and looks for a *row* labelled 'Unnamed: 0'.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
application = application.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Display (rows, columns) of the combined application frame.
application.shape

In [None]:
# Number of rows in the combined application frame; reused below as the
# `nrows` cap when reading the auxiliary tables.
rows = len(application.index)

In [None]:
# Read the previous-application table, order it by applicant then by previous
# loan, and renumber the rows.
# NOTE(review): nrows=rows reads only the first `rows` lines of the file, not
# the whole table — confirm this truncation is intended.
prev_app = (
    pd.read_csv('./previous_application.csv', nrows=rows)
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
# Keep only records whose applicant appears in `application`.
prev_app_known = prev_app['SK_ID_CURR'].isin(application['SK_ID_CURR'])
prev_app = prev_app[prev_app_known]
prev_app.shape

In [None]:
# Read the bureau table, order it by applicant then by bureau record, and
# renumber the rows.
# NOTE(review): nrows=rows reads only the first `rows` lines of the file —
# confirm this truncation is intended.
bureau = (
    pd.read_csv('./bureau.csv', nrows=rows)
    .sort_values(['SK_ID_CURR', 'SK_ID_BUREAU'])
    .reset_index(drop=True)
)
bureau.shape

In [None]:
# Read the bureau balance table, order it by bureau record, and renumber the
# rows.
# NOTE(review): nrows=rows reads only the first `rows` lines of the file —
# confirm this truncation is intended.
bureau_balance = (
    pd.read_csv('./bureau_balance.csv', nrows=rows)
    .sort_values('SK_ID_BUREAU')
    .reset_index(drop=True)
)
bureau_balance.shape

In [None]:
# Attach balance rows to their bureau record; the default inner join keeps
# only SK_ID_BUREAU values present in both tables.
bureau_merged = bureau.merge(bureau_balance, on=['SK_ID_BUREAU'])
bureau_merged.shape

In [None]:
# Keep only bureau rows whose applicant appears in `application`.
bureau_known = bureau_merged['SK_ID_CURR'].isin(application['SK_ID_CURR'])
bureau_merged = bureau_merged.loc[bureau_known]

In [None]:
# Read the credit card balance table (capped at `rows` lines), order it by
# applicant then by previous loan, and renumber the rows.
credit_balance = (
    pd.read_csv('./credit_card_balance.csv', nrows=rows)
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
credit_balance.shape

In [None]:
# Keep only credit card rows whose applicant appears in `application`.
credit_known = credit_balance['SK_ID_CURR'].isin(application['SK_ID_CURR'])
credit_balance = credit_balance.loc[credit_known]

In [None]:
# Read the POS/cash balance table (capped at `rows` lines), order it by
# applicant then by previous loan, and renumber the rows.
pos_cash = (
    pd.read_csv('./POS_CASH_balance.csv', nrows=rows)
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
pos_cash.shape

In [None]:
# Keep only POS/cash rows whose applicant appears in `application`.
pos_known = pos_cash['SK_ID_CURR'].isin(application['SK_ID_CURR'])
pos_cash = pos_cash.loc[pos_known]

In [None]:
# Read the installment payments table (capped at `rows` lines), order it by
# applicant then by previous loan, and renumber the rows.
installments = (
    pd.read_csv('./installments_payments.csv', nrows=rows)
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
installments.shape

In [None]:
# Keep only installment rows whose applicant appears in `application`.
installments_known = installments['SK_ID_CURR'].isin(application['SK_ID_CURR'])
installments = installments.loc[installments_known]

In [None]:
# Auxiliary tables to be joined on SK_ID_CURR; the list order is the order in
# which they are merged below.
dataframes = [prev_app, bureau_merged, installments, pos_cash, credit_balance]

In [None]:
from functools import reduce

def _outer_join_on_applicant(left, right):
    """Outer-join two frames on the shared SK_ID_CURR key."""
    return pd.merge(left, right, on=['SK_ID_CURR'], how='outer')


# Fold the list left to right so every auxiliary table ends up in one wide frame.
# NOTE(review): if SK_ID_CURR repeats within these tables, each join multiplies
# the matching rows — confirm the resulting size is expected.
df_merged = reduce(_outer_join_on_applicant, dataframes)

In [None]:
# Display (rows, columns) of the joined auxiliary tables.
df_merged.shape

In [None]:
# Persist the joined frame to CSV; index=False leaves the row index out of the file.
df_merged.to_csv('home-credit-df-merged.csv', index=False)

In [None]:
# Every column of the joined frame whose dtype is not `object`.
numerical_features = df_merged.select_dtypes(exclude=['object'])

In [None]:
# Every column of the joined frame that is neither 'int' nor 'float'.
# NOTE(review): this is not the exact complement of `numerical_features`
# (which excludes only `object`); a column of some other dtype would land in
# both — confirm none exist.
categorical_features = df_merged.select_dtypes(exclude=['int', 'float'])
# Display (rows, columns) of the selection.
categorical_features.shape

In [None]:
# Convert the categorical columns into indicator (dummy) columns.
tmp_categorical = pd.get_dummies(categorical_features)

In [None]:
# Collect the float columns from the application frame and from the joined
# auxiliary tables, each paired with SK_ID_CURR so they can be joined.
# `assign` returns a new frame, so no column is written into the result of
# select_dtypes (which can trigger pandas' chained-assignment warning).
pt_1_float_df = application.select_dtypes(include=['float']).assign(
    SK_ID_CURR=application['SK_ID_CURR'])
pt_2_float_df = numerical_features.select_dtypes(include=['float']).assign(
    SK_ID_CURR=df_merged['SK_ID_CURR'])

# Default inner join: only applicants present in both frames are kept.
float_merged = pd.merge(pt_1_float_df, pt_2_float_df, on=['SK_ID_CURR'])
float_merged.shape

In [None]:
# Collect the integer columns from the application frame and from the joined
# auxiliary tables, each carrying SK_ID_CURR as the join key.
# `assign` returns a new frame, so no column is written into the result of
# select_dtypes (which can trigger pandas' chained-assignment warning).
pt_1_int_df = application.select_dtypes(include=['int']).assign(
    SK_ID_CURR=application['SK_ID_CURR'])
pt_2_int_df = numerical_features.select_dtypes(include=['int']).assign(
    SK_ID_CURR=df_merged['SK_ID_CURR'])

# Default inner join on the applicant ID; the label column is removed here and
# re-attached later from `application`.
int_merged = pd.merge(pt_1_int_df, pt_2_int_df, on=['SK_ID_CURR'])
int_merged = int_merged.drop(columns=['TARGET'])

In [None]:
# Display (rows, columns) of the merged integer features.
int_merged.shape

In [None]:
# Fill gaps by carrying the previous row's value forward, then back-fill any
# values still missing at the top of a column. ffill()/bfill() replace the
# deprecated fillna(method=...) spelling and give the same result.
float_merged = float_merged.ffill().bfill()

In [None]:
# Columns that still contain at least one missing value after filling.
column_has_nan = float_merged.isnull().any()
float_nans = column_has_nan[column_has_nan].index.tolist()
# Remove those columns entirely.
float_merged = float_merged.drop(columns=float_nans)

In [None]:
# Keep the applicant IDs aside; they are excluded from scaling and re-attached
# to the feature frames later.
IDS = float_merged['SK_ID_CURR']

In [None]:
from sklearn import preprocessing

# Standardize every float feature; the applicant ID is left out of the scaling.
scaler = preprocessing.StandardScaler()
float_values = float_merged.drop(columns=['SK_ID_CURR'])
scaled = scaler.fit_transform(float_values)
# The scaler returns a bare array, so the frame built from it has positional
# column labels at this point.
normalized_float_df = pd.DataFrame(data=scaled)

In [None]:
# Display (rows, columns) of the scaled float features.
normalized_float_df.shape

In [None]:
# Label the scaled values with the names of the columns that were actually
# scaled (every float_merged column except the ID). Appending SK_ID_CURR last
# and then renaming positionally from float_merged.columns would shift the
# labels whenever the ID is not float_merged's final column.
feature_columns = float_merged.columns.drop('SK_ID_CURR')
normalized_float_df = pd.DataFrame(scaled, columns=feature_columns, index=float_merged.index)
# Re-attach the unscaled applicant IDs (aligned on the shared row index).
normalized_float_df['SK_ID_CURR'] = IDS
# Present the columns in the same order as float_merged.
normalized_float_df = normalized_float_df[float_merged.columns]

In [None]:
# Persist the scaled float features to CSV; index=False leaves the row index
# out of the file.
normalized_float_df.to_csv('home-credit-float-features.csv', index=False)

In [None]:
# Bring the automated features back in and put the applicant IDs in front.
autofeature_values = pd.read_csv("../home-credit-float-autofeatures.csv")
# Column-wise concat pairs rows by index label.
# NOTE(review): assumes the file has one row per entry of IDS, in the same
# order — confirm against the step that produced it.
autofeature = pd.concat([IDS, autofeature_values], axis=1)
autofeature.shape

In [None]:
# Bring the manual features back in and put the applicant IDs in front.
manual_feature_values = pd.read_csv("./home-credit-manual-float-features.csv")
# Column-wise concat pairs rows by index label.
# NOTE(review): assumes the file has one row per entry of IDS, in the same
# order — confirm against the step that produced it.
manual_features = pd.concat([IDS, manual_feature_values], axis=1)
manual_features.shape

In [None]:
# Display (rows, columns) of the integer features before they are combined.
int_merged.shape

In [None]:
# Display (rows, columns) of the categorical features before they are combined.
categorical_features.shape

In [None]:
# Autofeatures: automated float features + dummy-encoded categoricals + integer
# features, placed side by side. The ID column is dropped from the integer
# part because `autofeature` already carries it.
# NOTE(review): axis=1 concat pairs rows by index label, not by SK_ID_CURR —
# confirm the three frames have the same row order and length.
int_features = int_merged.drop(columns=['SK_ID_CURR'])
features_A = pd.concat([autofeature, tmp_categorical, int_features], axis=1)

In [None]:
# Manual Features: manual float features + dummy-encoded categoricals + integer
# features, placed side by side. The ID column is dropped from the integer
# part because `manual_features` already carries it.
# NOTE(review): axis=1 concat pairs rows by index label, not by SK_ID_CURR —
# confirm the three frames have the same row order and length.
int_features = int_merged.drop(columns=['SK_ID_CURR'])
features_B = pd.concat([manual_features, tmp_categorical, int_features], axis=1)

In [None]:
# Applicant ID and label only; used to attach TARGET to the feature frames.
app = application[['SK_ID_CURR', 'TARGET']]

In [None]:
# Attach the label to the autofeature set; the left join keeps every feature row.
features_A_final = features_A.merge(app, how='left', on='SK_ID_CURR')

In [None]:
# Attach the label to the manual feature set; the left join keeps every feature row.
features_B_final = features_B.merge(app, how='left', on='SK_ID_CURR')

In [None]:
# Inspect which label values are present after the joins, for both feature
# sets. (`features_final` is never defined in this notebook; the two frames
# that do exist are features_A_final and features_B_final.)
features_A_final['TARGET'].unique(), features_B_final['TARGET'].unique()

In [None]:
from sklearn.model_selection import train_test_split
 
def split_data(df):
    """Split a labelled frame into training and validation frames.

    Args:
        df: DataFrame containing the feature columns and a 'TARGET' column.

    Returns:
        A (train, valid) tuple of DataFrames. Each holds the feature columns
        followed by 'TARGET'; the validation part is 33% of the rows, and the
        split uses a fixed random_state of 42.
    """
    features = df.drop(columns=['TARGET'])
    labels = df['TARGET']
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=42)
    # Glue the label back onto each half so callers get self-contained frames.
    train_part = pd.concat([X_train, y_train], axis=1)
    valid_part = pd.concat([X_test, y_test], axis=1)
    return train_part, valid_part

In [None]:
def write_to_file(df, outfile):
    """Write a frame to CSV without the stray 'Unnamed: 0' column.

    Args:
        df: DataFrame to save. It is not modified.
        outfile: Destination path for the CSV file.

    The 'Unnamed: 0' column is removed only if it is present, so frames that
    never had it are written as they are instead of raising KeyError. The row
    index is not written to the file.
    """
    cleaned = df.drop(columns=['Unnamed: 0'], errors='ignore')
    cleaned.to_csv(outfile, index=False)

In [None]:
# Split each labelled feature set into its training and validation parts.
train_A, valid_A = split_data(features_A_final)
train_B, valid_B = split_data(features_B_final)

In [None]:
# Feature set A is built from the automated features and set B from the manual
# features, so A goes to the "auto" files and B to the "manual" files.
write_to_file(train_A, 'home-credit-auto-processed-train.csv')
write_to_file(valid_A, 'home-credit-auto-processed-valid.csv')
write_to_file(train_B, 'home-credit-manual-processed-train.csv')
write_to_file(valid_B, 'home-credit-manual-processed-valid.csv')