# Classification Project

# SetUp

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", 300)
pd.set_option("display.max_info_columns", 300)

application_train = pd.read_csv("/Users/drewpeterson/Documents/Programming/Python/LFZ_Python/LFZ_Week12/Classification_Project/home-credit-default-risk/application_train.csv")
application_test = pd.read_csv("/Users/drewpeterson/Documents/Programming/Python/LFZ_Python/LFZ_Week12/Classification_Project/home-credit-default-risk/application_test.csv")
credit_card_balance = pd.read_csv("/Users/drewpeterson/Documents/Programming/Python/LFZ_Python/LFZ_Week12/Classification_Project/home-credit-default-risk/credit_card_balance.csv")
installments_payments = pd.read_csv("/Users/drewpeterson/Documents/Programming/Python/LFZ_Python/LFZ_Week12/Classification_Project/home-credit-default-risk/installments_payments.csv")
previous_application = pd.read_csv("/Users/drewpeterson/Documents/Programming/Python/LFZ_Python/LFZ_Week12/Classification_Project/home-credit-default-risk/previous_application.csv")
POS_CASH_balance = pd.read_csv("/Users/drewpeterson/Documents/Programming/Python/LFZ_Python/LFZ_Week12/Classification_Project/home-credit-default-risk/POS_CASH_balance.csv")
bureau = pd.read_csv("/Users/drewpeterson/Documents/Programming/Python/LFZ_Python/LFZ_Week12/Classification_Project/home-credit-default-risk/bureau.csv")
bureau_balance = pd.read_csv("/Users/drewpeterson/Documents/Programming/Python/LFZ_Python/LFZ_Week12/Classification_Project/home-credit-default-risk/bureau_balance.csv")

# ONE at a Time

## Installments Payments

In [5]:
installments_payments["DAYS_DIFFERENCE"] = installments_payments.DAYS_INSTALMENT - installments_payments.DAYS_ENTRY_PAYMENT
installments_payments["AMT_DIFFERENCE"] = installments_payments.AMT_PAYMENT - installments_payments.AMT_INSTALMENT

installments_payments.drop(columns=["NUM_INSTALMENT_NUMBER", "DAYS_INSTALMENT", "DAYS_ENTRY_PAYMENT", "AMT_INSTALMENT", "AMT_PAYMENT"],inplace= True)
installments_payments.dropna(inplace=True)

installments_payments.NUM_INSTALMENT_VERSION = installments_payments.groupby(["SK_ID_PREV","SK_ID_CURR"]).NUM_INSTALMENT_VERSION.transform(lambda x: x.median())
installments_payments.DAYS_DIFFERENCE = installments_payments.groupby(["SK_ID_PREV","SK_ID_CURR"]).DAYS_DIFFERENCE.transform(lambda x: x.mean())
installments_payments.AMT_DIFFERENCE = installments_payments.groupby(["SK_ID_PREV","SK_ID_CURR"]).AMT_DIFFERENCE.transform(lambda x: x.mean())

installments_payments.drop_duplicates(inplace=True)

## Credit Card Balance

In [6]:
credit_card_balance = credit_card_balance.fillna(0)

credit_card_balance["AMT_DRAWINGS_TOTAL"] = credit_card_balance.AMT_DRAWINGS_ATM_CURRENT + credit_card_balance.AMT_DRAWINGS_OTHER_CURRENT + credit_card_balance.AMT_DRAWINGS_CURRENT + credit_card_balance.AMT_DRAWINGS_POS_CURRENT
credit_card_balance["CNT_DRAWINGS_TOTAL"] = credit_card_balance.CNT_DRAWINGS_ATM_CURRENT + credit_card_balance.CNT_DRAWINGS_OTHER_CURRENT + credit_card_balance.CNT_DRAWINGS_CURRENT + credit_card_balance.CNT_DRAWINGS_POS_CURRENT
credit_card_balance["COUNT_CREDIT_CARD"] = credit_card_balance.groupby(["SK_ID_PREV","SK_ID_CURR"]).AMT_BALANCE.transform(lambda x: x.count())

credit_card_balance = credit_card_balance.drop(columns = ["SK_DPD", "AMT_DRAWINGS_ATM_CURRENT", "AMT_DRAWINGS_OTHER_CURRENT", "AMT_DRAWINGS_CURRENT", "AMT_DRAWINGS_POS_CURRENT", "CNT_DRAWINGS_ATM_CURRENT", "CNT_DRAWINGS_OTHER_CURRENT", "CNT_DRAWINGS_CURRENT", "CNT_DRAWINGS_POS_CURRENT", "AMT_RECEIVABLE_PRINCIPAL", "AMT_RECIVABLE", "AMT_PAYMENT_CURRENT", "MONTHS_BALANCE"])

contract_status_dict = {"Refused":1, "Demand":2, "Sent proposal":3, "Active": 4 , "Signed":5 ,"Completed": 6,"Approved":7}
credit_card_balance.NAME_CONTRACT_STATUS = credit_card_balance.NAME_CONTRACT_STATUS.map(contract_status_dict)

def compress_median(df_example, list_of_columns):
    for each_column in list_of_columns:
        df_example[each_column] = df_example.groupby(["SK_ID_PREV","SK_ID_CURR"])[each_column].transform(lambda x: x.mean())
    return df_example.drop_duplicates()

credit_card_balance = compress_median(credit_card_balance, credit_card_balance.drop(columns = ["SK_ID_PREV","SK_ID_CURR"]).columns)
credit_card_balance.drop_duplicates(inplace=True)

## Previous Application

In [7]:
previous_application.drop(columns =["RATE_INTEREST_PRIMARY", "RATE_INTEREST_PRIVILEGED"], inplace=True)
previous_application.dropna(subset=["AMT_CREDIT", "PRODUCT_COMBINATION"], inplace=True)

previous_application.AMT_DOWN_PAYMENT = previous_application.AMT_DOWN_PAYMENT.interpolate(method='linear', x=previous_application.AMT_APPLICATION)
previous_application.RATE_DOWN_PAYMENT = previous_application.RATE_DOWN_PAYMENT.interpolate(method='linear', x=previous_application.AMT_DOWN_PAYMENT)
previous_application.CNT_PAYMENT = previous_application.CNT_PAYMENT.interpolate(method='linear', x=previous_application.AMT_APPLICATION)
previous_application.AMT_GOODS_PRICE  = previous_application.AMT_GOODS_PRICE.interpolate(method='linear', x=previous_application.AMT_APPLICATION)

previous_application.AMT_ANNUITY =  previous_application.groupby("NAME_CONTRACT_TYPE").AMT_ANNUITY.transform(lambda x: x.fillna(x.median()))
previous_application.NAME_TYPE_SUITE = previous_application.NAME_TYPE_SUITE.fillna("Unaccompanied")
previous_application.drop(columns = ["DAYS_FIRST_DRAWING", "DAYS_FIRST_DUE","DAYS_LAST_DUE_1ST_VERSION","DAYS_LAST_DUE","DAYS_TERMINATION","NFLAG_INSURED_ON_APPROVAL"], inplace=True)

weekday_encoding_map = {'TUESDAY':2, 'WEDNESDAY':3, 'MONDAY':1, 'FRIDAY':5, 'THURSDAY':4, 'SATURDAY':6, 'SUNDAY':7}
previous_application.WEEKDAY_APPR_PROCESS_START = previous_application.WEEKDAY_APPR_PROCESS_START.map(weekday_encoding_map)

yield_encoding_map = {'XNA':0, 'middle':3, 'high':4, 'low_normal':2, 'low_action':1}
previous_application.NAME_YIELD_GROUP = previous_application.NAME_YIELD_GROUP.map(yield_encoding_map)

def categorical_ordinal_encoding(df_example):
    df_example = df_example.copy()
    for each_column in df_example.select_dtypes(include="object").columns:
        map_dict = {}
        categories = list(df_example[each_column].value_counts(ascending=True).index)
        for each_category in categories:
            # for each_index in range(1, len(df_example[each_column].value_counts(ascending=True).index)):
            map_dict[each_category] = categories.index(each_category)
        df_example[each_column] = df_example[each_column].map(map_dict)
    return df_example

previous_application = categorical_ordinal_encoding(previous_application)

# ORDINAL ENCODE THE DATA IN PREVIOUS APPLICATION

Simplify so dont need to one hot encode everything, control the amount of columns


For application_train 

Train encoder on training data

Then use encode for application_test

# (1) HANDLE NULLS and (2) Apply same transformation to each Train and test so have same columns

Handle Nulls: 
First Pass: median and mode
Future: interpolate based on highest correlated column

## FILL NA Simple

In [8]:
for each_column in application_test.select_dtypes(exclude="object").columns:
    application_test[each_column] = application_test[each_column].fillna(application_train[each_column].median())
    
for each_column in application_test.select_dtypes(include="object").columns:
    application_test[each_column] = application_test[each_column].fillna(application_train[each_column].mode()[0])

for each_column in application_train.select_dtypes(exclude="object").columns:
    application_train[each_column] = application_train[each_column].fillna(application_train[each_column].median())
    
for each_column in application_train.select_dtypes(include="object").columns:
    application_train[each_column] = application_train[each_column].fillna(application_train[each_column].mode()[0])

## Apply Same Encoding

In [10]:
from sklearn.preprocessing import OneHotEncoder

combined_data = pd.concat([application_train.drop(columns ="TARGET"), application_test])

# Initialize and fit the encoder on combined data
encoder = OneHotEncoder()
encoder.fit_transform(combined_data).toarray()

# Transform training data
# encoder.transform(application_train.drop(columns ="TARGET"))

# Transform testing data
# encoded_test_data = encoder.transform(application_test)

: 

In [1]:
encoder.fit_transform(combined_data)

NameError: name 'encoder' is not defined

In [None]:
encoded_train_df

In [None]:
encoder.

In [104]:
encoder.transform_(application_test)

<48744x775465 sparse matrix of type '<class 'numpy.float64'>'
	with 5898024 stored elements in Compressed Sparse Row format>

In [None]:
encoded_train_data

In [101]:
pd.DataFrame(encoded_train_data, delimiter)

Unnamed: 0,0
0,"(0, 1)\t1.0\n (0, 356255)\t1.0\n (0, 35625..."
1,"(0, 2)\t1.0\n (0, 356255)\t1.0\n (0, 35625..."
2,"(0, 3)\t1.0\n (0, 356256)\t1.0\n (0, 35625..."
3,"(0, 5)\t1.0\n (0, 356255)\t1.0\n (0, 35625..."
4,"(0, 6)\t1.0\n (0, 356255)\t1.0\n (0, 35625..."
...,...
307506,"(0, 356250)\t1.0\n (0, 356255)\t1.0\n (0, ..."
307507,"(0, 356251)\t1.0\n (0, 356255)\t1.0\n (0, ..."
307508,"(0, 356252)\t1.0\n (0, 356255)\t1.0\n (0, ..."
307509,"(0, 356253)\t1.0\n (0, 356255)\t1.0\n (0, ..."


In [96]:
pd.get_dummies(application_test)

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100001,0,135000.0,568800.0,20560.5,450000.0,0.018850,-19241,-2329,-5170.0,-812,9.0,1,1,0,1,0,1,2.0,2,2,18,0,0,0,0,0,0,0.752614,0.789654,0.159520,0.0660,0.0590,0.9732,0.7552,0.0227,0.00,0.1379,0.1250,0.2083,0.0483,0.0756,0.0505,0.0000,0.0038,0.0672,0.0612,0.9732,0.7583,0.0203,0.0000,0.1379,0.1250,0.2083,0.0462,0.0817,0.0526,0.0000,0.0012,0.0666,0.0590,0.9732,0.7585,0.0223,0.00,0.1379,0.1250,0.2083,0.0488,0.0770,0.0514,0.0000,0.0031,0.0392,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,True,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False
1,100005,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,-4469,-9118.0,-1623,9.0,1,1,0,1,0,0,2.0,2,2,9,0,0,0,0,0,0,0.564990,0.291656,0.432962,0.0928,0.0781,0.9816,0.7552,0.0227,0.00,0.1379,0.1667,0.2083,0.0483,0.0756,0.0770,0.0000,0.0038,0.0851,0.0770,0.9816,0.7583,0.0203,0.0000,0.1379,0.1667,0.2083,0.0462,0.0817,0.0751,0.0000,0.0012,0.0926,0.0778,0.9816,0.7585,0.0223,0.00,0.1379,0.1667,0.2083,0.0488,0.0770,0.0776,0.0000,0.0031,0.0707,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,True,False,False,True,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,True,False
2,100013,0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038,-4458,-2175.0,-3503,5.0,1,1,0,1,0,0,2.0,2,2,14,0,0,0,0,0,0,0.506771,0.699787,0.610991,0.0928,0.0781,0.9816,0.7552,0.0227,0.00,0.1379,0.1667,0.2083,0.0483,0.0756,0.0770,0.0000,0.0038,0.0851,0.0770,0.9816,0.7583,0.0203,0.0000,0.1379,0.1667,0.2083,0.0462,0.0817,0.0751,0.0000,0.0012,0.0926,0.0778,0.9816,0.7585,0.0223,0.00,0.1379,0.1667,0.2083,0.0488,0.0770,0.0776,0.0000,0.0031,0.0707,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0,True,False,False,True,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,True,False
3,100028,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976,-1866,-2000.0,-4208,9.0,1,1,0,1,1,0,4.0,2,2,11,0,0,0,0,0,0,0.525734,0.509677,0.612704,0.3052,0.1974,0.9970,0.9592,0.1165,0.32,0.2759,0.3750,0.0417,0.2042,0.2404,0.3673,0.0386,0.0800,0.3109,0.2049,0.9970,0.9608,0.1176,0.3222,0.2759,0.3750,0.0417,0.2089,0.2626,0.3827,0.0389,0.0847,0.3081,0.1974,0.9970,0.9597,0.1173,0.32,0.2759,0.3750,0.0417,0.2078,0.2446,0.3739,0.0388,0.0817,0.3700,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,True,False,True,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,True,False
4,100038,1,180000.0,625500.0,32067.0,625500.0,0.010032,-13040,-2191,-4000.0,-4262,16.0,1,1,1,1,0,0,3.0,2,2,5,0,0,0,0,1,1,0.202145,0.425687,0.519097,0.0928,0.0781,0.9816,0.7552,0.0227,0.00,0.1379,0.1667,0.2083,0.0483,0.0756,0.0770,0.0000,0.0038,0.0851,0.0770,0.9816,0.7583,0.0203,0.0000,0.1379,0.1667,0.2083,0.0462,0.0817,0.0751,0.0000,0.0012,0.0926,0.0778,0.9816,0.7585,0.0223,0.00,0.1379,0.1667,0.2083,0.0488,0.0770,0.0776,0.0000,0.0031,0.0707,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,True,False,False,True,False,True,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,0,121500.0,412560.0,17473.5,270000.0,0.002042,-19970,-5169,-9094.0,-3399,9.0,1,1,1,1,1,0,1.0,3,3,16,0,0,0,0,0,0,0.506771,0.648575,0.643026,0.0928,0.0781,0.9816,0.7552,0.0227,0.00,0.1379,0.1667,0.2083,0.0483,0.0756,0.0770,0.0000,0.0038,0.0851,0.0770,0.9816,0.7583,0.0203,0.0000,0.1379,0.1667,0.2083,0.0462,0.0817,0.0751,0.0000,0.0012,0.0926,0.0778,0.9816,0.7585,0.0223,0.00,0.1379,0.1667,0.2083,0.0488,0.0770,0.0776,0.0000,0.0031,0.0707,1.0,0.0,1.0,0.0,-684.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,True,False,True,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,True,False
48740,456222,2,157500.0,622413.0,31909.5,495000.0,0.035792,-11186,-1149,-3015.0,-3003,9.0,1,1,0,1,0,0,4.0,2,2,11,0,0,0,0,1,1,0.506771,0.684596,0.519097,0.0928,0.0781,0.9816,0.7552,0.0227,0.00,0.1379,0.1667,0.2083,0.0483,0.0756,0.0770,0.0000,0.0038,0.0851,0.0770,0.9816,0.7583,0.0203,0.0000,0.1379,0.1667,0.2083,0.0462,0.0817,0.0751,0.0000,0.0012,0.0926,0.0778,0.9816,0.7585,0.0223,0.00,0.1379,0.1667,0.2083,0.0488,0.0770,0.0776,0.0000,0.0031,0.0707,2.0,0.0,2.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,True,False,True,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,True,False
48741,456223,1,202500.0,315000.0,33205.5,315000.0,0.026392,-15922,-3037,-2681.0,-1504,4.0,1,1,0,1,1,0,3.0,2,2,12,0,0,0,0,0,0,0.733503,0.632770,0.283712,0.1113,0.1364,0.9955,0.7552,0.0227,0.16,0.1379,0.3333,0.2083,0.0483,0.0756,0.1383,0.0000,0.0542,0.1134,0.1415,0.9955,0.7583,0.0203,0.1611,0.1379,0.3333,0.2083,0.0462,0.0817,0.1441,0.0000,0.0574,0.1124,0.1364,0.9955,0.7585,0.0223,0.16,0.1379,0.3333,0.2083,0.0488,0.0770,0.1408,0.0000,0.0554,0.1663,0.0,0.0,0.0,0.0,-838.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0,True,False,True,False,False,True,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False
48742,456224,0,225000.0,450000.0,25128.0,450000.0,0.018850,-13968,-2731,-1461.0,-1364,9.0,1,1,1,1,1,0,2.0,2,2,10,0,1,1,0,1,1,0.373090,0.445701,0.595456,0.1629,0.0723,0.9896,0.7552,0.0227,0.16,0.0690,0.6250,0.2083,0.0483,0.0756,0.1563,0.0000,0.1490,0.1660,0.0750,0.9896,0.7583,0.0203,0.1611,0.0690,0.6250,0.2083,0.0462,0.0817,0.1204,0.0000,0.1577,0.1645,0.0723,0.9896,0.7585,0.0223,0.16,0.0690,0.6250,0.2083,0.0488,0.0770,0.1591,0.0000,0.1521,0.1974,0.0,0.0,0.0,0.0,-2308.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,True,False,False,True,True,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,True,False


In [94]:
application_train.isnull().sum()

SK_ID_CURR                      0
TARGET                          0
NAME_CONTRACT_TYPE              0
CODE_GENDER                     0
FLAG_OWN_CAR                    0
FLAG_OWN_REALTY                 0
CNT_CHILDREN                    0
AMT_INCOME_TOTAL                0
AMT_CREDIT                      0
AMT_ANNUITY                     0
AMT_GOODS_PRICE                 0
NAME_TYPE_SUITE                 0
NAME_INCOME_TYPE                0
NAME_EDUCATION_TYPE             0
NAME_FAMILY_STATUS              0
NAME_HOUSING_TYPE               0
REGION_POPULATION_RELATIVE      0
DAYS_BIRTH                      0
DAYS_EMPLOYED                   0
DAYS_REGISTRATION               0
DAYS_ID_PUBLISH                 0
OWN_CAR_AGE                     0
FLAG_MOBIL                      0
FLAG_EMP_PHONE                  0
FLAG_WORK_PHONE                 0
FLAG_CONT_MOBILE                0
FLAG_PHONE                      0
FLAG_EMAIL                      0
OCCUPATION_TYPE                 0
CNT_FAM_MEMBER

## Application Test & Train

In [83]:
(application_test.isnull().sum()) > 0

SK_ID_CURR                      False
NAME_CONTRACT_TYPE              False
CODE_GENDER                     False
FLAG_OWN_CAR                    False
FLAG_OWN_REALTY                 False
CNT_CHILDREN                    False
AMT_INCOME_TOTAL                False
AMT_CREDIT                      False
AMT_ANNUITY                      True
AMT_GOODS_PRICE                 False
NAME_TYPE_SUITE                  True
NAME_INCOME_TYPE                False
NAME_EDUCATION_TYPE             False
NAME_FAMILY_STATUS              False
NAME_HOUSING_TYPE               False
REGION_POPULATION_RELATIVE      False
DAYS_BIRTH                      False
DAYS_EMPLOYED                   False
DAYS_REGISTRATION               False
DAYS_ID_PUBLISH                 False
OWN_CAR_AGE                      True
FLAG_MOBIL                      False
FLAG_EMP_PHONE                  False
FLAG_WORK_PHONE                 False
FLAG_CONT_MOBILE                False
FLAG_PHONE                      False
FLAG_EMAIL  

In [84]:
test_df = (application_test.isnull().sum()).reset_index().sort_values(by=0)

In [65]:
pd.set_option("display.max_rows", 200)

AMT CREDIT and AMT_GOODS_PRICE

REGION_RATING_CLIENT	REGION_RATING_CLIENT_W_CITY

APARTMENTS_AVG

In [78]:
for each in list_test:
    print("POOPY POOP " + each)

POOPY POOP EXT_SOURCE_2
POOPY POOP AMT_ANNUITY
POOPY POOP DEF_30_CNT_SOCIAL_CIRCLE
POOPY POOP DEF_60_CNT_SOCIAL_CIRCLE
POOPY POOP OBS_60_CNT_SOCIAL_CIRCLE
POOPY POOP OBS_30_CNT_SOCIAL_CIRCLE
POOPY POOP NAME_TYPE_SUITE
POOPY POOP AMT_REQ_CREDIT_BUREAU_DAY
POOPY POOP AMT_REQ_CREDIT_BUREAU_HOUR
POOPY POOP AMT_REQ_CREDIT_BUREAU_WEEK
POOPY POOP AMT_REQ_CREDIT_BUREAU_MON
POOPY POOP AMT_REQ_CREDIT_BUREAU_YEAR
POOPY POOP AMT_REQ_CREDIT_BUREAU_QRT
POOPY POOP EXT_SOURCE_3
POOPY POOP OCCUPATION_TYPE
POOPY POOP EXT_SOURCE_1
POOPY POOP EMERGENCYSTATE_MODE
POOPY POOP TOTALAREA_MODE
POOPY POOP YEARS_BEGINEXPLUATATION_MODE
POOPY POOP YEARS_BEGINEXPLUATATION_MEDI
POOPY POOP YEARS_BEGINEXPLUATATION_AVG
POOPY POOP FLOORSMAX_AVG
POOPY POOP FLOORSMAX_MEDI
POOPY POOP FLOORSMAX_MODE
POOPY POOP LIVINGAREA_AVG
POOPY POOP LIVINGAREA_MODE
POOPY POOP LIVINGAREA_MEDI
POOPY POOP ENTRANCES_MODE
POOPY POOP ENTRANCES_AVG
POOPY POOP ENTRANCES_MEDI
POOPY POOP HOUSETYPE_MODE
POOPY POOP APARTMENTS_MODE
POOPY POOP APARTMEN

In [59]:
test_df.rename(columns = {"index":"COLUMN"}, inplace=True)

In [63]:
list_test = list(test_df[test_df[0] > 0].COLUMN)

In [64]:
list_test

['EXT_SOURCE_2',
 'AMT_ANNUITY',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'NAME_TYPE_SUITE',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'EXT_SOURCE_3',
 'OCCUPATION_TYPE',
 'EXT_SOURCE_1',
 'EMERGENCYSTATE_MODE',
 'TOTALAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BEGINEXPLUATATION_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMAX_MEDI',
 'FLOORSMAX_MODE',
 'LIVINGAREA_AVG',
 'LIVINGAREA_MODE',
 'LIVINGAREA_MEDI',
 'ENTRANCES_MODE',
 'ENTRANCES_AVG',
 'ENTRANCES_MEDI',
 'HOUSETYPE_MODE',
 'APARTMENTS_MODE',
 'APARTMENTS_MEDI',
 'APARTMENTS_AVG',
 'WALLSMATERIAL_MODE',
 'ELEVATORS_MODE',
 'ELEVATORS_MEDI',
 'ELEVATORS_AVG',
 'NONLIVINGAREA_MEDI',
 'NONLIVINGAREA_AVG',
 'NONLIVINGAREA_MODE',
 'BASEMENTAREA_MEDI',
 'BASEMENTAREA_MODE',
 'BASEMENTARE

In [None]:
for each_incomplete_column in list_test:
    if each_incomplete_column in application_test.select_dtypes(include="object").index:
        
    application_test[each_incomplete_column].fillna()

In [43]:
application_train.shape

(307511, 122)

## Reviewing Data before Merge

In [148]:
first_merge_df = installments_payments.merge(compressed_credit_card, on=["SK_ID_PREV", "SK_ID_CURR"])
first_merge_df.shape

(72459, 16)

In [149]:
second_merge_df = first_merge_df.merge(previous_application, on=["SK_ID_PREV", "SK_ID_CURR"])
second_merge_df.shape

(62267, 43)

In [164]:
different_columns = []
same_columns = []

for each_column in pd.get_dummies(application_train).columns:
    if each_column in pd.get_dummies(application_test).columns:
        same_columns.append(each_column)
    else:
        different_columns.append(each_column)


In [166]:
different_columns_2 = []
same_columns_2 = []

for each_column in pd.get_dummies(application_test).columns:
    if each_column in pd.get_dummies(application_train).columns:
        same_columns_2.append(each_column)
    else:
        different_columns_2.append(each_column)

KeyboardInterrupt: 

Drop Rows with the Following Observations

In [165]:
different_columns

['TARGET',
 'CODE_GENDER_XNA',
 'NAME_INCOME_TYPE_Maternity leave',
 'NAME_FAMILY_STATUS_Unknown']

Inspect rows with the following Observations

In [None]:
different_columns_2

In [162]:
pd.get_dummies(application_train).columns

Index(['SK_ID_CURR', 'TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       ...
       'HOUSETYPE_MODE_terraced house', 'WALLSMATERIAL_MODE_Block',
       'WALLSMATERIAL_MODE_Mixed', 'WALLSMATERIAL_MODE_Monolithic',
       'WALLSMATERIAL_MODE_Others', 'WALLSMATERIAL_MODE_Panel',
       'WALLSMATERIAL_MODE_Stone, brick', 'WALLSMATERIAL_MODE_Wooden',
       'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes'],
      dtype='object', length=246)

In [163]:
pd.get_dummies(application_test).columns

Index(['SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE',
       'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
       ...
       'HOUSETYPE_MODE_terraced house', 'WALLSMATERIAL_MODE_Block',
       'WALLSMATERIAL_MODE_Mixed', 'WALLSMATERIAL_MODE_Monolithic',
       'WALLSMATERIAL_MODE_Others', 'WALLSMATERIAL_MODE_Panel',
       'WALLSMATERIAL_MODE_Stone, brick', 'WALLSMATERIAL_MODE_Wooden',
       'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes'],
      dtype='object', length=242)

In [157]:
application_train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.00,0.0690,0.0833,0.1250,0.0369,0.0202,0.0190,0.0000,0.0000,0.0252,0.0383,0.9722,0.6341,0.0144,0.0000,0.0690,0.0833,0.1250,0.0377,0.0220,0.0198,0.0,0.0000,0.0250,0.0369,0.9722,0.6243,0.0144,0.00,0.0690,0.0833,0.1250,0.0375,0.0205,0.0193,0.0000,0.0000,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.7960,0.0605,0.08,0.0345,0.2917,0.3333,0.0130,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.8040,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.0790,0.0554,0.0,0.0000,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.0100,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,Unaccompanied,Working,Secondary / secondary special,Separated,With parents,0.032561,-9327,-236,-8456.0,-1982,,1,1,0,1,0,0,Sales staff,1.0,1,1,THURSDAY,15,0,0,0,0,0,0,Services,0.145570,0.681632,,0.2021,0.0887,0.9876,0.8300,0.0202,0.22,0.1034,0.6042,0.2708,0.0594,0.1484,0.1965,0.0753,0.1095,0.1008,0.0172,0.9782,0.7125,0.0172,0.0806,0.0345,0.4583,0.0417,0.0094,0.0882,0.0853,0.0,0.0125,0.2040,0.0887,0.9876,0.8323,0.0203,0.22,0.1034,0.6042,0.2708,0.0605,0.1509,0.2001,0.0757,0.1118,reg oper account,block of flats,0.2898,"Stone, brick",No,0.0,0.0,0.0,0.0,-273.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,225000.0,Unaccompanied,Pensioner,Secondary / secondary special,Widow,House / apartment,0.025164,-20775,365243,-4388.0,-4090,,1,0,0,1,1,0,,1.0,2,2,MONDAY,8,0,0,0,0,0,0,XNA,,0.115992,,0.0247,0.0435,0.9727,0.6260,0.0022,0.00,0.1034,0.0833,0.1250,0.0579,0.0202,0.0257,0.0000,0.0000,0.0252,0.0451,0.9727,0.6406,0.0022,0.0000,0.1034,0.0833,0.1250,0.0592,0.0220,0.0267,0.0,0.0000,0.0250,0.0435,0.9727,0.6310,0.0022,0.00,0.1034,0.0833,0.1250,0.0589,0.0205,0.0261,0.0000,0.0000,reg oper account,block of flats,0.0214,"Stone, brick",No,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,Unaccompanied,Working,Higher education,Separated,House / apartment,0.005002,-14966,-7921,-6737.0,-5150,,1,1,0,1,0,1,Managers,1.0,3,3,THURSDAY,9,0,0,0,0,1,1,School,0.744026,0.535722,0.218859,0.1031,0.0862,0.9816,0.7484,0.0123,0.00,0.2069,0.1667,0.2083,,0.0841,0.9279,0.0000,0.0000,0.1050,0.0894,0.9816,0.7583,0.0124,0.0000,0.2069,0.1667,0.2083,,0.0918,0.9667,0.0,0.0000,0.1041,0.0862,0.9816,0.7518,0.0124,0.00,0.2069,0.1667,0.2083,,0.0855,0.9445,0.0000,0.0000,reg oper account,block of flats,0.7970,Panel,No,6.0,0.0,6.0,0.0,-1909.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.005313,-11961,-4786,-2562.0,-931,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,9,0,0,0,1,1,0,Business Entity Type 1,,0.514163,0.661024,0.0124,,0.9771,,,,0.0690,0.0417,,,,0.0061,,,0.0126,,0.9772,,,,0.0690,0.0417,,,,0.0063,,,0.0125,,0.9771,,,,0.0690,0.0417,,,,0.0062,,,,block of flats,0.0086,"Stone, brick",No,0.0,0.0,0.0,0.0,-322.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
train_merge = second_merge_df.merge(application_train, on="SK_ID_CURR")

In [156]:
test_merge = second_merge_df.merge(application_test, on="SK_ID_CURR")
test_merge

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,DAYS_DIFFERENCE,AMT_DIFFERENCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_TOTAL_CURRENT,AMT_TOTAL_RECEIVABLE,CNT_INSTALMENT_MATURE_CUM,SK_DPD_DEF,AMT_DRAWINGS_TOTAL,CNT_DRAWINGS_TOTAL,COUNT,NAME_CONTRACT_STATUS_COUNT,NAME_CONTRACT_TYPE_x,AMT_ANNUITY_x,AMT_APPLICATION,AMT_CREDIT_x,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE_x,WEEKDAY_APPR_PROCESS_START_x,HOUR_APPR_PROCESS_START_x,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE_x,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,NAME_CONTRACT_TYPE_y,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_y,AMT_ANNUITY_y,AMT_GOODS_PRICE_y,NAME_TYPE_SUITE_y,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START_y,HOUR_APPR_PROCESS_START_y,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1669153,112468,0.0,2.055556,-7.250000,34799.426250,180000.000000,1277.025000,16618.233750,34799.355000,2.583333,0.000000,59671.462500,7.166667,12.0,1,Revolving loans,9000.0,180000.0,180000.0,0.000000,180000.00,WEDNESDAY,9,Y,1,0.000000,XAP,Approved,-386,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,AP+ (Cash loan),10,XNA,0.0,XNA,Card X-Sell,Cash loans,F,Y,N,0,112500.0,573408.0,27585.0,495000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.025164,-9991,-969,-1639.0,-2663,15.0,1,1,1,1,0,0,Sales staff,2.0,2,2,WEDNESDAY,14,0,0,0,0,0,0,Business Entity Type 3,0.254623,0.526273,0.483050,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-641.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
1,1249017,112028,0.0,3.652174,-29.250000,216153.336000,796500.000000,12285.810000,100686.420000,214485.033000,6.666667,0.000000,197536.602000,4.266667,30.0,1,Revolving loans,45000.0,0.0,900000.0,13136.538462,351750.00,MONDAY,8,Y,1,0.092154,XAP,Approved,-926,XNA,XAP,Unaccompanied,Refreshed,XNA,Cards,x-sell,AP+ (Cash loan),3,XNA,0.0,XNA,Card X-Sell,Cash loans,M,Y,N,2,135000.0,285264.0,30852.0,252000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.020713,-14297,-6941,-63.0,-4754,9.0,1,1,0,1,0,0,High skill tech staff,4.0,3,3,WEDNESDAY,8,0,0,0,1,1,0,Telecom,,0.478798,0.631355,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-2321.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,1991612,195100,0.0,3.500000,0.000000,358097.761731,450000.000000,18388.986923,20708.653846,357696.541731,12.500000,0.000000,45232.310769,5.769231,26.0,1,Revolving loans,22500.0,0.0,450000.0,302.884615,369900.00,SATURDAY,6,Y,1,0.009586,XAP,Approved,-801,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1,XNA,0.0,XNA,Card X-Sell,Cash loans,M,N,Y,1,112500.0,577147.5,29596.5,459000.0,"Spouse, partner",Working,Secondary / secondary special,Married,House / apartment,0.018029,-12786,-1879,-4663.0,-1436,,1,1,0,1,0,0,Core staff,3.0,3,2,TUESDAY,11,0,0,0,0,0,0,Self-employed,0.247220,0.673838,0.431192,0.2082,0.1757,0.9906,0.8708,0.1098,0.20,0.2414,0.3333,0.2500,0.1325,0.1689,0.2566,0.0039,0.0087,0.2122,0.1823,0.9906,0.8759,0.1108,0.2014,0.2414,0.3333,0.2500,0.1356,0.1846,0.2674,0.0039,0.0092,0.2103,0.1757,0.9906,0.8725,0.1105,0.20,0.2414,0.3333,0.2500,0.1348,0.1719,0.2612,0.0039,0.0089,reg oper account,block of flats,0.2637,Panel,No,2.0,0.0,2.0,0.0,-1076.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,1835377,134812,0.0,5.306122,-91.836735,166561.504773,331363.636364,11184.126136,21237.443182,166521.860284,38.965909,0.011364,34130.200909,1.204545,88.0,1,Revolving loans,13500.0,0.0,112500.0,10237.500000,102372.75,TUESDAY,13,Y,1,0.104456,XAP,Approved,-2688,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Country-wide,1222,Consumer electronics,0.0,XNA,Card Street,Cash loans,F,N,Y,0,112500.0,568800.0,24039.0,450000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.007305,-21129,-837,-11588.0,-4618,,1,1,0,1,0,1,Security staff,2.0,3,3,TUESDAY,15,0,0,0,0,0,0,Security,,0.509644,0.513694,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0,4.0,0.0,-284.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
4,1589437,163385,0.0,2.303867,-124.309392,81442.811968,93351.063830,4547.872340,7757.712766,81483.934309,45.510638,0.063830,11135.106383,0.531915,94.0,1,Revolving loans,3375.0,0.0,67500.0,8032.500000,34803.00,TUESDAY,14,Y,1,0.215627,XAP,Approved,-2866,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Country-wide,1006,Consumer electronics,0.0,XNA,Card Street,Cash loans,F,N,Y,0,112500.0,218938.5,16497.0,189000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.024610,-23294,-12111,-15722.0,-4851,,1,1,0,1,0,0,Laborers,2.0,2,2,FRIDAY,11,0,0,0,0,1,1,Postal,,0.720499,0.315472,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0,4.0,0.0,-1505.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9162,1536508,413851,0.0,0.000000,0.000000,99.562500,0.000000,99.562500,14162.062500,0.000000,0.937500,0.000000,28125.000000,0.875000,16.0,1,Revolving loans,0.0,0.0,0.0,0.000000,0.00,MONDAY,20,Y,1,0.000000,XAP,Approved,-525,XNA,XAP,Unaccompanied,New,XNA,Cards,walk-in,Country-wide,36,Connectivity,0.0,XNA,Card Street,Cash loans,F,Y,Y,1,135000.0,601470.0,32760.0,450000.0,Unaccompanied,Commercial associate,Incomplete higher,Civil marriage,House / apartment,0.030755,-10707,-867,-733.0,-796,31.0,1,1,0,1,0,0,Sales staff,3.0,2,2,MONDAY,16,0,0,0,0,0,0,Trade: type 3,,0.385224,0.382502,0.1856,0.1086,0.9955,,,0.16,0.0690,0.6667,,0.0546,,0.1785,,0.0823,0.1891,0.1127,0.9955,,,0.1611,0.0690,0.6667,,0.0558,,0.1860,,0.0871,0.1874,0.1086,0.9955,,,0.16,0.0690,0.6667,,0.0555,,0.1817,,0.0840,,block of flats,0.1583,"Stone, brick",No,0.0,0.0,0.0,0.0,-525.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
9163,2409956,448241,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,14318.181818,0.000000,0.000000,0.000000,28636.363636,0.909091,11.0,1,Revolving loans,0.0,0.0,0.0,1418.100000,0.00,WEDNESDAY,8,Y,1,0.077707,XAP,Approved,-331,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,walk-in,Regional / Local,240,Consumer electronics,0.0,XNA,Card Street,Cash loans,F,N,Y,0,157500.0,218016.0,17352.0,180000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.007020,-22505,-5346,-4257.0,-4257,,1,1,0,1,0,0,,2.0,2,2,FRIDAY,10,0,0,0,0,0,0,School,,0.454729,0.616122,0.0124,,0.9762,,,,,,,,,0.0106,,,0.0126,,0.9762,,,,,,,,,0.0110,,,0.0125,,0.9762,,,,,,,,,0.0108,,,,block of flats,0.0124,,Yes,1.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0
9164,1887311,423133,0.0,0.000000,0.000000,12270.262500,157500.000000,0.000000,12270.262500,12270.262500,0.000000,0.000000,24540.525000,0.500000,12.0,2,Revolving loans,7875.0,157500.0,157500.0,4811.625000,157500.00,SATURDAY,12,Y,1,0.095323,XAP,Approved,-375,XNA,XAP,Unaccompanied,Refreshed,XNA,Cards,x-sell,Credit and cash offices,-1,XNA,0.0,XNA,Card X-Sell,Cash loans,F,Y,Y,0,153000.0,405000.0,32125.5,405000.0,Unaccompanied,Working,Higher education,Separated,House / apartment,0.026392,-11190,-1882,-1085.0,-3885,11.0,1,1,0,1,0,0,Sales staff,1.0,2,2,WEDNESDAY,16,0,0,0,0,0,0,Business Entity Type 3,0.323780,0.692530,0.657784,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-375.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
9165,2396818,445165,0.0,0.000000,0.000000,1165.860000,337500.000000,0.000000,2374.500000,1165.860000,0.000000,0.000000,7080.720000,0.933333,15.0,1,Revolving loans,16875.0,337500.0,337500.0,7200.000000,337500.00,MONDAY,13,Y,1,0.065630,XAP,Approved,-449,XNA,XAP,Family,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1,XNA,0.0,XNA,Card X-Sell,Cash loans,M,N,Y,2,157500.0,285723.0,22239.0,238500.0,Family,Working,Higher education,Married,House / apartment,0.031329,-12467,-5296,-5592.0,-4439,,1,1,0,1,0,0,Laborers,4.0,2,2,TUESDAY,12,0,0,0,0,0,0,Business Entity Type 2,0.217004,0.644071,0.825636,,0.1469,0.9762,,,,0.2759,0.1667,,,,0.1087,,,,0.1525,0.9762,,,,0.2759,0.1667,,,,0.1133,,,,0.1469,0.9762,,,,0.2759,0.1667,,,,0.1107,,,,,0.1029,Panel,No,0.0,0.0,0.0,0.0,-2723.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
train_merge

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,DAYS_DIFFERENCE,AMT_DIFFERENCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_TOTAL_CURRENT,AMT_TOTAL_RECEIVABLE,CNT_INSTALMENT_MATURE_CUM,SK_DPD_DEF,AMT_DRAWINGS_TOTAL,CNT_DRAWINGS_TOTAL,COUNT,NAME_CONTRACT_STATUS_COUNT,NAME_CONTRACT_TYPE_x,AMT_ANNUITY_x,AMT_APPLICATION,AMT_CREDIT_x,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE_x,WEEKDAY_APPR_PROCESS_START_x,HOUR_APPR_PROCESS_START_x,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE_x,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,TARGET,NAME_CONTRACT_TYPE_y,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_y,AMT_ANNUITY_y,AMT_GOODS_PRICE_y,NAME_TYPE_SUITE_y,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START_y,HOUR_APPR_PROCESS_START_y,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,2723183,112102,0.0,5.480000,0.000000,263745.031154,270000.000000,11401.186154,50.015769,263498.126538,5.076923,0.000000,46927.052308,1.230769,13.0,1,Revolving loans,13500.0,270000.0,270000.0,7587.000000,270000.000000,TUESDAY,17,Y,1,0.040841,XAP,Approved,-414,XNA,XAP,Unaccompanied,Refreshed,XNA,Cards,x-sell,Credit and cash offices,-1,XNA,0.0,XNA,Card X-Sell,0,Cash loans,M,Y,N,2,234000.0,1494486.0,41224.5,1305000.0,Unaccompanied,State servant,Secondary / secondary special,Married,Rented apartment,0.030755,-15904,-8011,-677.0,-4256,2.0,1,1,0,1,0,0,Core staff,4.0,2,2,WEDNESDAY,9,0,0,0,1,1,0,Police,,0.767049,0.719491,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-2108.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
1,1570206,147645,0.0,0.257143,-649.559571,421228.420714,450000.000000,20638.197000,21384.986143,419412.789000,17.000000,0.142857,41961.600000,3.142857,35.0,1,Revolving loans,22500.0,0.0,450000.0,0.000000,262588.235294,THURSDAY,14,Y,1,0.000000,XAP,Approved,-1073,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Contact center,-1,XNA,0.0,XNA,Card X-Sell,0,Cash loans,F,N,N,2,108000.0,450000.0,21649.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.030755,-13638,-3342,-5549.0,-1506,,1,1,0,1,0,0,Laborers,4.0,2,2,FRIDAY,18,0,0,0,0,1,1,Industry: type 3,,0.108729,0.562060,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1386.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,2.0
2,1594684,100193,0.0,1.064516,0.000000,1845.862412,11117.647059,207.731118,786.176471,1817.250353,4.823529,0.000000,1408.129412,0.023529,85.0,1,Revolving loans,4410.0,80995.5,63000.0,21150.000000,80995.500000,TUESDAY,13,Y,1,0.273729,XAP,Approved,-2589,XNA,XAP,Unaccompanied,Refreshed,Consumer Electronics,Cards,x-sell,Country-wide,1307,Consumer electronics,0.0,XNA,Card Street,0,Cash loans,F,Y,N,0,225000.0,296280.0,15124.5,225000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Separated,House / apartment,0.020246,-17879,-940,-2294.0,-1441,15.0,1,1,1,1,0,0,,1.0,3,3,SUNDAY,15,0,0,0,0,0,0,Other,0.731105,0.748854,0.508287,0.1546,0.084,0.9776,0.6940,0.0190,0.0,0.1034,0.1667,0.2083,0.0149,0.1261,0.0568,0.0,0.0,0.1576,0.0871,0.9777,0.7060,0.0191,0.0,0.1034,0.1667,0.2083,0.0153,0.1377,0.0591,0.0,0.0,0.1561,0.084,0.9776,0.6981,0.0191,0.0,0.1034,0.1667,0.2083,0.0152,0.1283,0.0578,0.0,0.0,reg oper account,block of flats,0.0550,"Stone, brick",No,1.0,0.0,1.0,0.0,-2553.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,1.0,6.0
3,2391610,183431,0.0,10.653846,0.000000,200233.508684,219078.947368,10406.872303,13774.500000,199434.361382,36.434211,0.013158,19936.421053,1.552632,76.0,2,Revolving loans,7875.0,0.0,157500.0,0.000000,400386.281400,WEDNESDAY,19,Y,1,0.000000,XAP,Approved,-2289,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Contact center,-1,XNA,0.0,XNA,Card X-Sell,0,Cash loans,F,N,Y,0,135000.0,832500.0,24471.0,832500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.028663,-19844,-2440,-11005.0,-3387,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,15,0,0,0,0,0,0,Self-employed,,0.755236,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,3.0,5.0,2.0,-1852.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,2270983,159586,0.0,1.125000,-346.153846,86122.915372,149361.702128,5312.731596,8926.755319,86129.758245,35.425532,0.223404,11738.297872,0.446809,94.0,2,Revolving loans,9000.0,0.0,67500.0,3028.846154,76226.413846,TUESDAY,11,Y,1,0.042404,XAP,Approved,-2853,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Contact center,-1,XNA,0.0,XNA,Card Street,0,Cash loans,F,N,Y,1,121500.0,1005120.0,29520.0,720000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.007305,-17297,-3482,-6802.0,-823,,1,1,0,1,1,0,Laborers,3.0,3,3,SATURDAY,10,0,0,0,0,0,0,Industry: type 9,,0.457257,0.347418,0.0165,0.000,0.9806,0.7348,0.0014,0.0,0.0690,0.0417,0.0417,0.0190,0.0134,0.0145,0.0,0.0,0.0168,0.0000,0.9806,0.7452,0.0014,0.0,0.0690,0.0417,0.0417,0.0194,0.0147,0.0151,0.0,0.0,0.0167,0.000,0.9806,0.7383,0.0014,0.0,0.0690,0.0417,0.0417,0.0193,0.0137,0.0148,0.0,0.0,reg oper account,block of flats,0.0122,"Stone, brick",No,2.0,1.0,2.0,1.0,-1284.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53095,2001970,404395,0.0,0.000000,0.000000,0.000000,270000.000000,0.000000,150.000000,0.000000,0.000000,0.000000,150.000000,0.000000,15.0,1,Revolving loans,33750.0,0.0,675000.0,6511.500000,73152.000000,SATURDAY,14,Y,1,0.108932,XAP,Approved,-466,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Country-wide,1552,Consumer electronics,0.0,XNA,Card X-Sell,0,Cash loans,M,N,N,0,175500.0,679500.0,22585.5,679500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010500,-13777,-2252,-1184.0,-4370,,1,1,1,1,1,0,Laborers,2.0,3,3,SUNDAY,11,0,0,0,0,0,0,Self-employed,0.482768,0.631687,0.706205,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1355.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
53096,1982325,437333,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,32500.000000,0.000000,0.000000,0.000000,65000.000000,0.444444,9.0,2,Revolving loans,0.0,0.0,0.0,0.000000,0.000000,FRIDAY,10,Y,1,0.000000,XAP,Approved,-297,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,walk-in,Country-wide,533,Consumer electronics,0.0,XNA,Card Street,0,Revolving loans,F,N,Y,0,112500.0,292500.0,14625.0,292500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.007114,-17097,-1709,-7065.0,-640,,1,1,0,1,0,0,Sales staff,2.0,2,2,MONDAY,16,0,0,0,0,0,0,Self-employed,,0.643168,0.158555,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.0,0.0,14.0,0.0,-1331.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,9.0
53097,2305961,408738,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,120000.000000,0.000000,0.000000,0.000000,120000.000000,0.000000,3.0,1,Revolving loans,0.0,0.0,0.0,5944.500000,0.000000,THURSDAY,7,Y,1,0.099456,XAP,Approved,-113,XNA,XAP,Unaccompanied,Refreshed,XNA,Cards,walk-in,Stone,270,Consumer electronics,0.0,XNA,Card Street,0,Cash loans,F,Y,Y,1,125910.0,339948.0,23656.5,315000.0,Unaccompanied,Pensioner,Secondary / secondary special,Married,House / apartment,0.018801,-19502,365243,-3039.0,-3041,7.0,1,0,0,1,1,0,,3.0,2,2,FRIDAY,9,0,0,0,0,0,0,XNA,0.413263,0.555777,0.574447,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1325.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
53098,1538868,424735,0.0,0.000000,0.000000,0.000000,675000.000000,0.000000,9.000000,0.000000,0.000000,0.000000,18.000000,1.000000,2.0,1,Revolving loans,33750.0,675000.0,675000.0,0.000000,675000.000000,THURSDAY,13,Y,1,0.000000,XAP,Approved,-96,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1,XNA,0.0,XNA,Card X-Sell,0,Cash loans,F,N,Y,0,121050.0,112500.0,7569.0,112500.0,Unaccompanied,Pensioner,Lower secondary,Married,House / apartment,0.022800,-22503,365243,-831.0,-4589,,1,0,0,1,0,0,,2.0,2,2,TUESDAY,11,0,0,0,0,0,0,XNA,,0.430023,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,1.0,2.0,0.0,-399.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,


# Exploring First Merge

## Compress Columns: Engineering

DAYS_DIFFERENCE
* If entry payment has larger abs, then paid on time
* If negative, how many days late

AMT_DIFFERECE 
* If negative, then they did not fully pay the installment they were supposed to



In [6]:
installments_payments["DAYS_DIFFERENCE"] = installments_payments.DAYS_INSTALMENT - installments_payments.DAYS_ENTRY_PAYMENT
installments_payments["AMT_DIFFERENCE"] = installments_payments.AMT_PAYMENT - installments_payments.AMT_INSTALMENT

In [7]:
installments_payments.drop(columns=["NUM_INSTALMENT_NUMBER", "DAYS_INSTALMENT", "DAYS_ENTRY_PAYMENT", "AMT_INSTALMENT", "AMT_PAYMENT"],inplace= True)
installments_payments.dropna(inplace=True)

In [11]:
installments_payments.NUM_INSTALMENT_VERSION = installments_payments.groupby(["SK_ID_PREV","SK_ID_CURR"]).NUM_INSTALMENT_VERSION.transform(lambda x: x.median())
installments_payments.DAYS_DIFFERENCE = installments_payments.groupby(["SK_ID_PREV","SK_ID_CURR"]).DAYS_DIFFERENCE.transform(lambda x: x.mean())
installments_payments.AMT_DIFFERENCE = installments_payments.groupby(["SK_ID_PREV","SK_ID_CURR"]).AMT_DIFFERENCE.transform(lambda x: x.mean())

In [12]:
installments_payments.drop_duplicates(inplace=True)

## First Merge

In [None]:
def merge_data(df_example):
    first_merge_df = installments_payments.merge(credit_card_balance, on=["SK_ID_PREV", "SK_ID_CURR"])

In [15]:
first_merge_df = installments_payments.merge(credit_card_balance, on=["SK_ID_PREV", "SK_ID_CURR"])
first_merge_df.shape

(3162221, 26)

In [18]:
second_merge_df = first_merge_df.merge(previous_application, on=["SK_ID_PREV", "SK_ID_CURR"])
second_merge_df.shape

(2189224, 61)

In [21]:
third_merge_df = second_merge_df.merge(application_train, on="SK_ID_CURR")
third_merge_df.shape

(1896355, 182)

NOTE: Opportuntiies to also merge POS_CASH_balance and Bureau 

In [33]:
# bureau_df = bureau_balance.merge(bureau, on="SK_ID_BUREAU")
# bureau_df.groupby("SK_ID_BUREAU").STATUS.count().reset_index().sort_values(by="STATUS")
# MERGE BREAKS: third_merge_df.merge(bureau_df, on="SK_ID_CURR")

# Early Model

In [25]:
third_merge_df.select_dtypes(include="object").columns

Index(['NAME_CONTRACT_STATUS_x', 'NAME_CONTRACT_TYPE_x',
       'WEEKDAY_APPR_PROCESS_START_x', 'FLAG_LAST_APPL_PER_CONTRACT',
       'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS_y', 'NAME_PAYMENT_TYPE',
       'CODE_REJECT_REASON', 'NAME_TYPE_SUITE_x', 'NAME_CLIENT_TYPE',
       'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
       'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP',
       'PRODUCT_COMBINATION', 'NAME_CONTRACT_TYPE_y', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE_y',
       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
       'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START_y',
       'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
       'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
      dtype='object')

In [30]:
third_merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1896355 entries, 0 to 1896354
Data columns (total 182 columns):
 #    Column                        Dtype  
---   ------                        -----  
 0    SK_ID_PREV                    int64  
 1    SK_ID_CURR                    int64  
 2    NUM_INSTALMENT_VERSION        float64
 3    DAYS_DIFFERENCE               float64
 4    AMT_DIFFERENCE                float64
 5    MONTHS_BALANCE                int64  
 6    AMT_BALANCE                   float64
 7    AMT_CREDIT_LIMIT_ACTUAL       int64  
 8    AMT_DRAWINGS_ATM_CURRENT      float64
 9    AMT_DRAWINGS_CURRENT          float64
 10   AMT_DRAWINGS_OTHER_CURRENT    float64
 11   AMT_DRAWINGS_POS_CURRENT      float64
 12   AMT_INST_MIN_REGULARITY       float64
 13   AMT_PAYMENT_CURRENT           float64
 14   AMT_PAYMENT_TOTAL_CURRENT     float64
 15   AMT_RECEIVABLE_PRINCIPAL      float64
 16   AMT_RECIVABLE                 float64
 17   AMT_TOTAL_RECEIVABLE          float64
 18   

In [55]:
df_train = pd.get_dummies(third_merge_df).fillna(0)

Dropping below columns for the entry to be on the same dimension as test

In [None]:
X_train.dropcolumns_to_drop_for_test

In [83]:
df_train

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,DAYS_DIFFERENCE,AMT_DIFFERENCE,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF,AMT_ANNUITY_x,AMT_APPLICATION,AMT_CREDIT_x,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE_x,HOUR_APPR_PROCESS_START_x,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,DAYS_DECISION,SELLERPLACE_AREA,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_y,AMT_ANNUITY_y,AMT_GOODS_PRICE_y,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START_y,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_STATUS_x_Active,...,CHANNEL_TYPE_Regional / Local,CHANNEL_TYPE_Stone,NAME_SELLER_INDUSTRY_Auto technology,NAME_SELLER_INDUSTRY_Clothing,NAME_SELLER_INDUSTRY_Connectivity,NAME_SELLER_INDUSTRY_Construction,NAME_SELLER_INDUSTRY_Consumer electronics,NAME_SELLER_INDUSTRY_Furniture,NAME_SELLER_INDUSTRY_Industry,NAME_SELLER_INDUSTRY_Jewelry,NAME_SELLER_INDUSTRY_MLM partners,NAME_SELLER_INDUSTRY_XNA,NAME_YIELD_GROUP_XNA,PRODUCT_COMBINATION_Card Street,PRODUCT_COMBINATION_Card X-Sell,NAME_CONTRACT_TYPE_y_Cash loans,NAME_CONTRACT_TYPE_y_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_y_Children,NAME_TYPE_SUITE_y_Family,NAME_TYPE_SUITE_y_Group of people,NAME_TYPE_SUITE_y_Other_A,NAME_TYPE_SUITE_y_Other_B,"NAME_TYPE_SUITE_y_Spouse, partner",NAME_TYPE_SUITE_y_Unaccompanied,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,WEEKDAY_APPR_PROCESS_START_y_FRIDAY,WEEKDAY_APPR_PROCESS_START_y_MONDAY,WEEKDAY_APPR_PROCESS_START_y_SATURDAY,WEEKDAY_APPR_PROCESS_START_y_SUNDAY,WEEKDAY_APPR_PROCESS_START_y_THURSDAY,WEEKDAY_APPR_PROCESS_START_y_TUESDAY,WEEKDAY_APPR_PROCESS_START_y_WEDNESDAY,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,2723183,112102,0.0,5.48,0.0,-2,253491.345,270000,0.0,0.00,0.0,0.00,14063.265,13950.0,95.625,249187.635,253491.345,253491.345,0.0,0,0.0,0.0,11.0,0,0,13500.0,270000.0,270000.0,0.0,270000.0,17,1,0.0,0.0,0.0,-414,-1,0.0,-413.0,-369.0,365243.0,365243.0,365243.0,0.0,0,2,234000.0,1494486.0,41224.5,1305000.0,0.030755,-15904,-8011,-677.0,-4256,2.0,1,1,0,1,0,0,4.0,2,2,9,0,0,0,1,1,0,0.000000,0.767049,0.719491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2108.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0,True,...,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,True,False,False,True,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2723183,112102,0.0,5.48,0.0,-7,250588.395,270000,0.0,0.00,0.0,0.00,13208.760,13050.0,70.740,246535.110,250588.395,250588.395,0.0,0,0.0,0.0,6.0,0,0,13500.0,270000.0,270000.0,0.0,270000.0,17,1,0.0,0.0,0.0,-414,-1,0.0,-413.0,-369.0,365243.0,365243.0,365243.0,0.0,0,2,234000.0,1494486.0,41224.5,1305000.0,0.030755,-15904,-8011,-677.0,-4256,2.0,1,1,0,1,0,0,4.0,2,2,9,0,0,0,1,1,0,0.000000,0.767049,0.719491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2108.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0,True,...,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,True,False,False,True,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2723183,112102,0.0,5.48,0.0,-14,279234.720,270000,270000.0,270000.00,0.0,0.00,0.000,0.0,0.000,270000.000,271161.720,271161.720,6.0,6,0.0,0.0,0.0,0,0,13500.0,270000.0,270000.0,0.0,270000.0,17,1,0.0,0.0,0.0,-414,-1,0.0,-413.0,-369.0,365243.0,365243.0,365243.0,0.0,0,2,234000.0,1494486.0,41224.5,1305000.0,0.030755,-15904,-8011,-677.0,-4256,2.0,1,1,0,1,0,0,4.0,2,2,9,0,0,0,1,1,0,0.000000,0.767049,0.719491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2108.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0,True,...,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,True,False,False,True,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2723183,112102,0.0,5.48,0.0,-4,274874.310,270000,13500.0,13500.00,0.0,0.00,12591.810,13500.0,5.535,269889.705,273528.810,273528.810,1.0,1,0.0,0.0,9.0,0,0,13500.0,270000.0,270000.0,0.0,270000.0,17,1,0.0,0.0,0.0,-414,-1,0.0,-413.0,-369.0,365243.0,365243.0,365243.0,0.0,0,2,234000.0,1494486.0,41224.5,1305000.0,0.030755,-15904,-8011,-677.0,-4256,2.0,1,1,0,1,0,0,4.0,2,2,9,0,0,0,1,1,0,0.000000,0.767049,0.719491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2108.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0,True,...,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,True,False,False,True,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,2723183,112102,0.0,5.48,0.0,-5,264679.380,270000,0.0,21525.84,0.0,21525.84,12789.360,12600.0,8.190,260762.130,264679.380,264679.380,0.0,1,0.0,1.0,8.0,0,0,13500.0,270000.0,270000.0,0.0,270000.0,17,1,0.0,0.0,0.0,-414,-1,0.0,-413.0,-369.0,365243.0,365243.0,365243.0,0.0,0,2,234000.0,1494486.0,41224.5,1305000.0,0.030755,-15904,-8011,-677.0,-4256,2.0,1,1,0,1,0,0,4.0,2,2,9,0,0,0,1,1,0,0.000000,0.767049,0.719491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2108.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0,True,...,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,True,False,False,True,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1896350,2238539,430291,0.0,0.00,0.0,-3,0.000,135000,0.0,0.00,0.0,0.00,0.000,0.0,0.000,0.000,0.000,0.000,0.0,0,0.0,0.0,0.0,0,0,2250.0,45000.0,45000.0,0.0,45000.0,17,1,0.0,0.0,0.0,-208,27,0.0,-50.0,365243.0,365243.0,365243.0,365243.0,0.0,1,1,157500.0,835380.0,40320.0,675000.0,0.010147,-13760,-255,-1074.0,-4716,0.0,1,1,0,1,0,0,3.0,2,2,17,0,0,0,0,1,1,0.426705,0.529276,0.681706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,-1833.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,True,...,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,True,False,True,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1896351,2238539,430291,0.0,0.00,0.0,-1,0.000,135000,0.0,0.00,0.0,0.00,0.000,0.0,0.000,0.000,0.000,0.000,0.0,0,0.0,0.0,0.0,0,0,2250.0,45000.0,45000.0,0.0,45000.0,17,1,0.0,0.0,0.0,-208,27,0.0,-50.0,365243.0,365243.0,365243.0,365243.0,0.0,1,1,157500.0,835380.0,40320.0,675000.0,0.010147,-13760,-255,-1074.0,-4716,0.0,1,1,0,1,0,0,3.0,2,2,17,0,0,0,0,1,1,0.426705,0.529276,0.681706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,-1833.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,True,...,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,True,False,True,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1896352,2238539,430291,0.0,0.00,0.0,-4,0.000,135000,0.0,0.00,0.0,0.00,0.000,0.0,0.000,0.000,0.000,0.000,0.0,0,0.0,0.0,0.0,0,0,2250.0,45000.0,45000.0,0.0,45000.0,17,1,0.0,0.0,0.0,-208,27,0.0,-50.0,365243.0,365243.0,365243.0,365243.0,0.0,1,1,157500.0,835380.0,40320.0,675000.0,0.010147,-13760,-255,-1074.0,-4716,0.0,1,1,0,1,0,0,3.0,2,2,17,0,0,0,0,1,1,0.426705,0.529276,0.681706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,-1833.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,True,...,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,True,False,True,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1896353,2238539,430291,0.0,0.00,0.0,-6,0.000,135000,0.0,0.00,0.0,0.00,0.000,0.0,0.000,0.000,0.000,0.000,0.0,0,0.0,0.0,0.0,0,0,2250.0,45000.0,45000.0,0.0,45000.0,17,1,0.0,0.0,0.0,-208,27,0.0,-50.0,365243.0,365243.0,365243.0,365243.0,0.0,1,1,157500.0,835380.0,40320.0,675000.0,0.010147,-13760,-255,-1074.0,-4716,0.0,1,1,0,1,0,0,3.0,2,2,17,0,0,0,0,1,1,0.426705,0.529276,0.681706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,-1833.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,True,...,False,False,False,False,True,False,False,False,False,False,False,False,True,True,False,True,False,True,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [56]:
X = df_train.drop(columns="TARGET")

In [57]:
y = df_train.TARGET

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [140]:
X_train.drop(columns=["NAME_CONTRACT_STATUS_x_Refused", "NAME_CONTRACT_STATUS_x_Sent proposal"], inplace=True)

In [138]:
X_train.drop(columns=columns_to_drop_for_test, inplace=True)

KeyError: "['NAME_CONTRACT_STATUS_x_Approved', 'NAME_CONTRACT_STATUS_x_Demand', 'NAME_GOODS_CATEGORY_Other', 'CHANNEL_TYPE_Car dealer', 'NAME_INCOME_TYPE_Unemployed', 'ORGANIZATION_TYPE_Industry: type 8'] not found in axis"

In [99]:
lgr_model = LogisticRegression()

In [141]:
lgr_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Getting Predicted Probabilities

In [137]:
lgr_model.predict_proba(X_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- CHANNEL_TYPE_Car dealer
- NAME_CONTRACT_STATUS_x_Approved
- NAME_CONTRACT_STATUS_x_Demand
- NAME_GOODS_CATEGORY_Other
- NAME_INCOME_TYPE_Unemployed
- ...


In [101]:
X_train.shape

(1517084, 340)

In [102]:
X_test.shape

(379271, 346)

# Early Entry

In [109]:
application_test.SK_ID_CURR.nunique()

48744

In [121]:
application_test.SK_ID_CURR

0        100001
1        100005
2        100013
3        100028
4        100038
          ...  
48739    456221
48740    456222
48741    456223
48742    456224
48743    456250
Name: SK_ID_CURR, Length: 48744, dtype: int64

In [123]:
application_test[application_test.SK_ID_CURR == 456250]

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
48743,456250,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0


In [124]:
test_merge_df[test_merge_df.SK_ID_CURR == 456250]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,DAYS_DIFFERENCE,AMT_DIFFERENCE,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS_x,SK_DPD,SK_DPD_DEF,NAME_CONTRACT_TYPE_x,AMT_ANNUITY_x,AMT_APPLICATION,AMT_CREDIT_x,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE_x,WEEKDAY_APPR_PROCESS_START_x,HOUR_APPR_PROCESS_START_x,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS_y,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE_x,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,NAME_CONTRACT_TYPE_y,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_y,AMT_ANNUITY_y,AMT_GOODS_PRICE_y,NAME_TYPE_SUITE_y,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START_y,HOUR_APPR_PROCESS_START_y,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
332487,1794451.0,456250,0.0,2.826087,-419.738478,-1.0,153832.725,175500.0,0.0,0.0,0.0,0.0,8286.39,9675.0,1600.83,151808.22,153832.725,153832.725,0.0,0.0,0.0,0.0,10.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
332488,1794451.0,456250,0.0,2.826087,-419.738478,-4.0,166188.15,180000.0,0.0,0.0,0.0,0.0,8804.565,9000.0,356.4,163871.775,166188.15,166188.15,0.0,0.0,0.0,0.0,7.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
332489,1794451.0,456250,0.0,2.826087,-419.738478,-6.0,171943.02,180000.0,0.0,0.0,0.0,0.0,9084.375,8932.5,1.08,169696.08,171943.02,171943.02,0.0,0.0,0.0,0.0,5.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
332490,1794451.0,456250,0.0,2.826087,-419.738478,-2.0,158266.935,175500.0,0.0,0.0,0.0,0.0,8477.73,9675.0,1388.61,156053.16,158266.935,158266.935,0.0,0.0,0.0,0.0,9.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
332491,1794451.0,456250,0.0,2.826087,-419.738478,-3.0,162425.565,175500.0,0.0,0.0,0.0,0.0,8643.6,9000.0,522.27,160554.375,162425.565,162425.565,0.0,0.0,0.0,0.0,8.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
332492,1794451.0,456250,0.0,2.826087,-419.738478,-5.0,169443.855,180000.0,0.0,0.0,0.0,0.0,8931.42,9000.0,195.435,167091.345,169443.855,169443.855,0.0,0.0,0.0,0.0,6.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
332493,1794451.0,456250,0.0,2.826087,-419.738478,-10.0,186577.605,180000.0,0.0,0.0,0.0,0.0,9892.485,20250.0,11250.0,178200.0,185907.105,185907.105,0.0,0.0,0.0,0.0,1.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
332494,1794451.0,456250,0.0,2.826087,-419.738478,-7.0,174435.885,180000.0,0.0,0.0,0.0,0.0,9240.705,9675.0,590.625,172012.32,174435.885,174435.885,0.0,0.0,0.0,0.0,4.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
332495,1794451.0,456250,0.0,2.826087,-419.738478,-8.0,177219.0,180000.0,0.0,0.0,0.0,0.0,9465.705,9675.0,434.295,174653.82,176958.9,176958.9,0.0,0.0,0.0,0.0,3.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
332496,1794451.0,456250,0.0,2.826087,-419.738478,-12.0,181993.5,180000.0,171000.0,171000.0,0.0,0.0,0.0,,0.0,171000.0,171000.0,171000.0,7.0,7.0,0.0,0.0,0.0,Active,0.0,0.0,Revolving loans,9000.0,180000.0,180000.0,,180000.0,MONDAY,11.0,Y,1.0,,,,XAP,Approved,-373.0,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1.0,XNA,0.0,XNA,Card X-Sell,-373.0,-327.0,365243.0,365243.0,365243.0,0.0,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.006629,-13962,-633,-1072.0,-4220,22.0,1,1,1,1,0,0,Core staff,2.0,2,2,TUESDAY,14,0,0,0,0,0,0,Government,,0.456541,0.272134,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-327.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0


In [None]:
GOAL: have a single SK_ID_CURR


find the duplicated SK_ID_CURR

then group by SK_ID_CURR

take mode of object features

take median of numeric features

drop


second_merge_df.SK_ID_CURR

In [114]:
test_merge_df.SK_ID_CURR.nunique()

48744

In [122]:
test_merge_df.SK_ID_CURR

0         100001
1         100005
2         100013
3         100028
4         100028
           ...  
332494    456250
332495    456250
332496    456250
332497    456250
332498    456250
Name: SK_ID_CURR, Length: 332499, dtype: int64

In [None]:
second_merge_df.SK_ID_CURR

In [125]:
second_merge_df.SK_ID_CURR

0          112102
1          112102
2          112102
3          112102
4          112102
            ...  
2189219    430291
2189220    430291
2189221    430291
2189222    430291
2189223    430291
Name: SK_ID_CURR, Length: 2189224, dtype: int64

In [116]:
test_entry.SK_ID_CURR.nunique()

48744

In [130]:
test_merge_df = second_merge_df.merge(application_test, on="SK_ID_CURR", how="right")

In [131]:
test_merge_df.SK_ID_CURR

0        100001.0
1        100005.0
2        100013.0
3        100028.0
4        100038.0
           ...   
48739    456221.0
48740    456222.0
48741    456223.0
48742    456224.0
48743    456250.0
Name: SK_ID_CURR, Length: 48744, dtype: float64

In [132]:
test_entry = pd.get_dummies(test_merge_df).fillna(0)

In [142]:
test_entry.drop(columns= ["NAME_INCOME_TYPE_Businessman", "NAME_INCOME_TYPE_Student", "NAME_INCOME_TYPE_Unemployed", "NAME_TYPE_SUITE_x_0"], inplace=True)

In [144]:
test_entry.drop(columns= "ORGANIZATION_TYPE_Industry: type 8", inplace=True)

In [85]:
application_train.shape

(307511, 122)

In [84]:
application_test.shape

(48744, 121)

In [103]:
test_entry.shape

(292869, 340)

In [95]:
columns_to_drop_for_test = df_train.columns[np.where(~df_train.columns.isin(test_entry.columns))]

In [65]:
application_test.fillna(0)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.018850,-19241,-2329,-5170.0,-812,0.0,1,1,0,1,0,1,0,2.0,2,2,TUESDAY,18,0,0,0,0,0,0,Kindergarten,0.752614,0.789654,0.159520,0.0660,0.0590,0.9732,0.0000,0.0000,0.00,0.1379,0.1250,0.0000,0.0000,0.0000,0.0505,0.0000,0.0000,0.0672,0.0612,0.9732,0.0000,0.0000,0.0000,0.1379,0.1250,0.0000,0.0000,0.0000,0.0526,0.0000,0.0000,0.0666,0.0590,0.9732,0.0000,0.0000,0.00,0.1379,0.1250,0.0000,0.0000,0.0000,0.0514,0.0000,0.0000,0,block of flats,0.0392,"Stone, brick",No,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035792,-18064,-4469,-9118.0,-1623,0.0,1,1,0,1,0,0,Low-skill Laborers,2.0,2,2,FRIDAY,9,0,0,0,0,0,0,Self-employed,0.564990,0.291656,0.432962,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0,0,0.0000,0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,0,Working,Higher education,Married,House / apartment,0.019101,-20038,-4458,-2175.0,-3503,5.0,1,1,0,1,0,0,Drivers,2.0,2,2,MONDAY,14,0,0,0,0,0,0,Transport: type 3,0.000000,0.699787,0.610991,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0,0,0.0000,0,0,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.026392,-13976,-1866,-2000.0,-4208,0.0,1,1,0,1,1,0,Sales staff,4.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.525734,0.509677,0.612704,0.3052,0.1974,0.9970,0.9592,0.1165,0.32,0.2759,0.3750,0.0417,0.2042,0.2404,0.3673,0.0386,0.0800,0.3109,0.2049,0.9970,0.9608,0.1176,0.3222,0.2759,0.3750,0.0417,0.2089,0.2626,0.3827,0.0389,0.0847,0.3081,0.1974,0.9970,0.9597,0.1173,0.32,0.2759,0.3750,0.0417,0.2078,0.2446,0.3739,0.0388,0.0817,reg oper account,block of flats,0.3700,Panel,No,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010032,-13040,-2191,-4000.0,-4262,16.0,1,1,1,1,0,0,0,3.0,2,2,FRIDAY,5,0,0,0,0,1,1,Business Entity Type 3,0.202145,0.425687,0.000000,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0,0,0.0000,0,0,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,Unaccompanied,Working,Secondary / secondary special,Widow,House / apartment,0.002042,-19970,-5169,-9094.0,-3399,0.0,1,1,1,1,1,0,0,1.0,3,3,WEDNESDAY,16,0,0,0,0,0,0,Other,0.000000,0.648575,0.643026,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0,0,0.0000,0,0,1.0,0.0,1.0,0.0,-684.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.035792,-11186,-1149,-3015.0,-3003,0.0,1,1,0,1,0,0,Sales staff,4.0,2,2,MONDAY,11,0,0,0,0,1,1,Trade: type 7,0.000000,0.684596,0.000000,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0,0,0.0000,0,0,2.0,0.0,2.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
48741,456223,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.026392,-15922,-3037,-2681.0,-1504,4.0,1,1,0,1,1,0,0,3.0,2,2,WEDNESDAY,12,0,0,0,0,0,0,Business Entity Type 3,0.733503,0.632770,0.283712,0.1113,0.1364,0.9955,0.0000,0.0000,0.16,0.1379,0.3333,0.0000,0.0000,0.0000,0.1383,0.0000,0.0542,0.1134,0.1415,0.9955,0.0000,0.0000,0.1611,0.1379,0.3333,0.0000,0.0000,0.0000,0.1441,0.0000,0.0574,0.1124,0.1364,0.9955,0.0000,0.0000,0.16,0.1379,0.3333,0.0000,0.0000,0.0000,0.1408,0.0000,0.0554,0,block of flats,0.1663,"Stone, brick",No,0.0,0.0,0.0,0.0,-838.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,Family,Commercial associate,Higher education,Married,House / apartment,0.018850,-13968,-2731,-1461.0,-1364,0.0,1,1,1,1,1,0,Managers,2.0,2,2,MONDAY,10,0,1,1,0,1,1,Self-employed,0.373090,0.445701,0.595456,0.1629,0.0723,0.9896,0.0000,0.0000,0.16,0.0690,0.6250,0.0000,0.0000,0.0000,0.1563,0.0000,0.1490,0.1660,0.0750,0.9896,0.0000,0.0000,0.1611,0.0690,0.6250,0.0000,0.0000,0.0000,0.1204,0.0000,0.1577,0.1645,0.0723,0.9896,0.0000,0.0000,0.16,0.0690,0.6250,0.0000,0.0000,0.0000,0.1591,0.0000,0.1521,0,block of flats,0.1974,Panel,No,0.0,0.0,0.0,0.0,-2308.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


In [104]:
lgr_model.predict_proba(X_test.drop(columns = columns_to_drop_for_test))

array([[0.94076465, 0.05923535],
       [0.93810764, 0.06189236],
       [0.94605497, 0.05394503],
       ...,
       [0.94673284, 0.05326716],
       [0.92340355, 0.07659645],
       [0.96305345, 0.03694655]])

In [153]:
len(lgr_model.predict_proba(test_entry))

48744

In [None]:
# DROP FROM X_TRAIN
["NAME_CONTRACT_STATUS_x_Refused", "NAME_CONTRACT_STATUS_x_Sent proposal"]

In [None]:
# DROP FROM test_entry
["NAME_INCOME_TYPE_Businessman", "NAME_INCOME_TYPE_Student", "NAME_INCOME_TYPE_Unemployed", "NAME_TYPE_SUITE_x_0"]

In [None]:
test_entry.drop(columns= ["NAME_INCOME_TYPE_Businessman", "NAME_INCOME_TYPE_Student", "NAME_INCOME_TYPE_Unemployed", "NAME_TYPE_SUITE_x_0"], inplace=True)

In [106]:
df_entry = pd.DataFrame([])

In [134]:
test_entry.SK_ID_CURR.astype("int")

0        100001
1        100005
2        100013
3        100028
4        100038
          ...  
48739    456221
48740    456222
48741    456223
48742    456224
48743    456250
Name: SK_ID_CURR, Length: 48744, dtype: int64

In [120]:
application_test.SK_ID_CURR

0        100001
1        100005
2        100013
3        100028
4        100038
          ...  
48739    456221
48740    456222
48741    456223
48742    456224
48743    456250
Name: SK_ID_CURR, Length: 48744, dtype: int64

In [118]:
test_entry.SK_ID_CURR.nunique()

48744

In [148]:
df_entry = df_entry.dropna()

In [150]:
df_entry.SK_ID_CURR = df_entry.SK_ID_CURR.astype("int")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_entry.SK_ID_CURR = df_entry.SK_ID_CURR.astype("int")


In [151]:
df_entry

Unnamed: 0,SK_ID_CURR
0,100001
1,100005
2,100013
3,100028
4,100038
...,...
48739,456221
48740,456222
48741,456223
48742,456224


In [135]:
df_entry["SK_ID_CURR"] = test_entry.SK_ID_CURR.astype("int")

In [162]:
df_entry["TARGET"] = np.round(lgr_model.predict_proba(test_entry)[:,1],1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_entry["TARGET"] = np.round(lgr_model.predict_proba(test_entry)[:,1],1)


In [163]:
df_entry

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.4
1,100005,0.4
2,100013,0.5
3,100028,0.1
4,100038,0.4
...,...,...
48739,456221,0.4
48740,456222,0.4
48741,456223,0.4
48742,456224,0.4


In [165]:
df_entry

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.4
1,100005,0.4
2,100013,0.5
3,100028,0.1
4,100038,0.4
...,...,...
48739,456221,0.4
48740,456222,0.4
48741,456223,0.4
48742,456224,0.4


In [166]:
df_entry.to_csv("Test_Entry.csv", index =False)

In [155]:
lgr_model.classes_

array([0, 1])

In [161]:
np.round(lgr_model.predict_proba(test_entry)[:,1],1)

array([0.4, 0.4, 0.5, ..., 0.4, 0.4, 0.2])

GOAL: have a single SK_ID_CURR


find the duplicated SK_ID_CURR

then group by SK_ID_CURR

take mode of object features

take median of numeric features

drop


second_merge_df.SK_ID_CURR

In [128]:
second_merge_df = second_merge_df.fillna(0)

In [129]:
for each_num_column in second_merge_df.select_dtypes(exclude="object").columns:
    second_merge_df[each_num_column] = second_merge_df.groupby("SK_ID_CURR")[each_num_column].transform(lambda x: x.median())

for each_obj_column in second_merge_df.select_dtypes(include="object").columns:
    second_merge_df[each_obj_column] = second_merge_df.groupby("SK_ID_CURR")[each_obj_column].transform(lambda x: x.mode()[0])

second_merge_df.drop_duplicates(inplace=True)

second_merge_df.SK_ID_CURR

0          112102.0
13         147645.0
48         100193.0
133        183431.0
209        159586.0
             ...   
2189192    430213.0
2189203    437333.0
2189212    408738.0
2189215    424735.0
2189217    430291.0
Name: SK_ID_CURR, Length: 62076, dtype: float64

# TIP: More important to remove data than impute data

# SCRAP

In [None]:
# NOTE of Potentially Similar Columns

# Previous Application -- AMT_APPLICATION and AMT_CREDIT

# Credit Card Balance -- Several Options

# def application_encoding(df_example):
#     df_example = df_example.copy()
#     for each_column in df_example.select_dtypes(include="object").columns:
#         map_dict = {}
#         categories = list(df_example[each_column].value_counts(ascending=True).index)
#         for each_category in categories:
#             map_dict[each_category] = categories.index(each_category)
#         df_example[each_column] = df_example[each_column].map(map_dict)
        
#     return df_example