In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 

# Locate the raw CSV files relative to the working directory:
# <parent of cwd>/data/raw_data/<file>
current_dir = os.getcwd()
parent_dir  = os.path.dirname(current_dir)
raw_data_dir = os.path.join(parent_dir, 'data', 'raw_data')
file_path1  = os.path.join(raw_data_dir, 'application_train.csv')
file_path2  = os.path.join(raw_data_dir, 'application_test.csv')

# Read the train table first, then the test table.
appl_train, appl_test = (pd.read_csv(csv_path) for csv_path in (file_path1, file_path2))

# Preview the first rows of the training table.
appl_train.head()

In [None]:
# Show the (rows, columns) dimensions of the test table loaded above.
appl_test.shape

In [None]:
# Keep the labels (together with their IDs) apart from the features.
# The guard makes this cell safe to re-run: once TARGET has been removed from
# appl_train, the previously extracted `y` is kept instead of raising KeyError.
if "TARGET" in appl_train.columns:
    y = appl_train[["SK_ID_CURR", "TARGET"]].copy()
    appl_train = appl_train.drop(columns="TARGET")

# NOTE(review): 'tSK_ID_CURR' looks like a typo. rename() ignores keys that are
# not present, so this is a no-op unless the test file really has such a column
# -- confirm against the CSV header.
appl_test = appl_test.rename(columns={'tSK_ID_CURR': 'SK_ID_CURR'})

# Stack train and test rows so the feature engineering below is applied to both.
# Both frames were read with a default 0..n-1 index; ignore_index=True gives the
# combined frame fresh unique row labels instead of repeating them.
appl_total = pd.concat([appl_train, appl_test], ignore_index=True)


In [None]:
# Preview the first rows of the combined train + test frame.
appl_total.head()

In [None]:
# Disabled code kept as a bare string literal, so running this cell only echoes the text.
# The statements inside would split appl_total back into train/test rows depending on
# whether each SK_ID_CURR appears in y, and then show the shape of the test part.
"""train = appl_total[appl_total["SK_ID_CURR"].isin(y["SK_ID_CURR"]) == True]
test  = appl_total[appl_total["SK_ID_CURR"].isin(y["SK_ID_CURR"]) == False]
test.shape"""

In [None]:
# Helper functions for feature transformations (log transform, day conversion)

def create_logs(data, features):
    """Append a ``LOG_<name>`` column holding ``log1p`` of each listed column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place.
    features : iterable
        Names of the columns in ``data`` to transform.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the new columns added.
    """
    for name in features:
        data[f"LOG_{name}"] = np.log1p(data[name])

    return data

def convert_days(data, features, t = 12, rounding = True):
    """Add ``CONVERTED_<name>`` columns with sign-flipped values scaled by ``t``.

    For every column name in ``features`` the values are negated and divided
    by ``t``. Any result below zero (i.e. a positive input) is replaced by NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place.
    features : iterable
        Names of the columns in ``data`` to convert.
    t : number, default 12
        Divisor applied to the negated values (e.g. 30 to turn days into
        approximate months).
    rounding : bool, default True
        If true, round the scaled values to whole numbers before the
        negative check.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the new columns added.
    """
    for var in features:
        converted = -data[var] / t
        if rounding:
            converted = converted.round()
        # Build the final Series first and assign it once. The previous
        # chained form data[col][mask] = None triggers SettingWithCopyWarning
        # and has no effect when pandas Copy-on-Write is active.
        data["CONVERTED_" + str(var)] = converted.mask(converted < 0)
    return data

In [None]:
# Function for counting missing values per column
def count_missings(data):
    """Summarise the missing values of each column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with the columns
        ``"Total"`` (number of nulls) and ``"Percent"`` (nulls as a
        percentage of the number of rows). Both inputs are sorted in
        descending order before being combined.
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    pct_missing = n_missing / null_mask.count() * 100

    summary = pd.concat(
        [
            n_missing.sort_values(ascending=False),
            pct_missing.sort_values(ascending=False),
        ],
        axis=1,
        keys=["Total", "Percent"],
    )
    # Keep only the columns that actually contain missing values.
    return summary.loc[summary["Total"] > 0]

In [None]:

#feature engineering 


# Amounts expressed relative to the applicant's total income.
for ratio_col, amount_col in (
    ("CREDIT_BY_INCOME", "AMT_CREDIT"),
    ("ANNUITY_BY_INCOME", "AMT_ANNUITY"),
    ("GOODS_PRICE_BY_INCOME", "AMT_GOODS_PRICE"),
):
    appl_total[ratio_col] = appl_total[amount_col].div(appl_total["AMT_INCOME_TOTAL"])

# Total income divided by the number of family members.
appl_total["INCOME_PER_PERSON"] = appl_total["AMT_INCOME_TOTAL"].div(appl_total["CNT_FAM_MEMBERS"])

# Credit amount divided by the annuity amount.
appl_total['ANNUITY LENGTH'] = appl_total['AMT_CREDIT'].div(appl_total['AMT_ANNUITY'])

# Row-wise mean of the three EXT_SOURCE_* columns.
ext_source_cols = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]
appl_total["EXT_SOURCE_MEAN"] = appl_total[ext_source_cols].mean(axis="columns")

# Row-wise sum over every column whose name contains "FLAG_DOCUMENT_".
doc_vars = [name for name in appl_total.columns if "FLAG_DOCUMENT_" in name]
appl_total["NUM_DOCUMENTS"] = appl_total[doc_vars].sum(axis="columns")

# Add LOG_<name> = log1p(value) for each amount column, then discard the raw
# amount columns themselves.
log_vars = ["AMT_CREDIT", "AMT_INCOME_TOTAL", "AMT_GOODS_PRICE", "AMT_ANNUITY"]
appl_total = create_logs(appl_total, log_vars).drop(columns=log_vars)

# Convert day-count columns.
# 365243 in DAYS_EMPLOYED is treated as missing rather than as a real day count
# (presumably a placeholder value -- confirm against the data description).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the selected column is a chained in-place operation, which leaves the frame
# unchanged when pandas Copy-on-Write is active.
appl_total['DAYS_EMPLOYED'] = appl_total['DAYS_EMPLOYED'].replace(365243, np.nan)
day_to_years = ["DAYS_BIRTH", "DAYS_EMPLOYED"] # to years

# Dividing by -365 flips the sign and scales days to years; AGE is rounded to
# whole years, YEARS_EMPLOYED keeps its fractional part.
appl_total["AGE"] = np.round(appl_total['DAYS_BIRTH']/(-365))
appl_total["YEARS_EMPLOYED"] = appl_total['DAYS_EMPLOYED']/(-365)
# Years employed divided by age.
appl_total['EMPLOYMENT_STABILITY'] = appl_total['YEARS_EMPLOYED']/appl_total['AGE']

# These columns get CONVERTED_<name> counterparts in rounded 30-day units.
day_to_months = ["DAYS_REGISTRATION", "DAYS_ID_PUBLISH", "DAYS_LAST_PHONE_CHANGE"]
appl_total = convert_days(appl_total, day_to_months, t = 30, rounding = True)


# Columns to drop - area related columns.
# Twelve building attributes appear with both a _MEDI and a _MODE suffix; the
# remaining names are listed explicitly because the two groups differ.
# NOTE(review): YEARS_BUILD_MEDI is listed but YEARS_BUILD_MODE is not -- confirm
# that this is intentional.
_area_stems = [
    'APARTMENTS', 'BASEMENTAREA', 'COMMONAREA', 'ELEVATORS', 'ENTRANCES',
    'FLOORSMAX', 'FLOORSMIN', 'LANDAREA', 'LIVINGAPARTMENTS', 'LIVINGAREA',
    'NONLIVINGAPARTMENTS', 'NONLIVINGAREA',
]
drops = (
    [stem + '_MEDI' for stem in _area_stems]
    + ['YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI']
    + [stem + '_MODE' for stem in _area_stems]
    + ['TOTALAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE']
)

# Remove every column named in `drops` from the combined frame; with pandas'
# default settings a name that is not present raises KeyError.
appl_total = appl_total.drop(columns = drops)