In [None]:
%reset -f
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sns
import csv
import re
import gc
import sys, os, random

pd.set_option('display.max_rows',1000)

# Directory that holds application_train.csv / application_test.csv.
# Set the HOME_CREDIT_DATA_DIR environment variable to point somewhere else;
# the fallback is the original author's local checkout.
root = os.environ.get('HOME_CREDIT_DATA_DIR',
                      '/Users/schwalmdaniel/github/kaggle/home-credit-default-risk')

train = pd.read_csv(os.path.join(root, "application_train.csv"))
test = pd.read_csv(os.path.join(root, "application_test.csv"))

# have a look at the ds
train.head()

In [None]:
# Class balance of the label column.
# Author's note on the output: the data is unbalanced, about 8.5% of the rows
# have TARGET == 1, so the accuracy baseline is around 92%.
train.loc[:, 'TARGET'].value_counts()

In [None]:
# NAME_CONTRACT_TYPE has 2 values: 'Cash loans' becomes 0, anything else becomes 1.
for frame in (train, test):
    is_not_cash_loan = frame['NAME_CONTRACT_TYPE'] != 'Cash loans'
    frame['NAME_CONTRACT_TYPE'] = is_not_cash_loan.astype('int64')

In [None]:
# CODE_GENDER: drop the 'XNA' rows from train (only 4 rows per the author),
# then encode 'F' as 0 and every other value as 1.
# .copy() detaches the filtered frame so the column assignment below writes to
# `train` itself instead of to a slice of the old frame (SettingWithCopyWarning).
train = train[train['CODE_GENDER'] != 'XNA'].copy()
train['CODE_GENDER'] = train['CODE_GENDER'].apply(lambda x: 0 if x == 'F' else 1)
# No rows are dropped from test, so an 'XNA' there would be encoded as 1.
test['CODE_GENDER'] = test['CODE_GENDER'].apply(lambda x: 0 if x == 'F' else 1)

In [None]:
# FLAG_OWN_CAR: 'Y' -> 1, anything else -> 0.
# train and test must use the same mapping; the test line previously returned 1
# in both branches, which made the whole test column constant.
train['FLAG_OWN_CAR'] = train['FLAG_OWN_CAR'].apply(lambda x: 1 if x == 'Y' else 0)
test['FLAG_OWN_CAR'] = test['FLAG_OWN_CAR'].apply(lambda x: 1 if x == 'Y' else 0)

In [None]:
# FLAG_OWN_REALTY: 'Y' -> 1, anything else -> 0.
# train and test must use the same mapping; the test line previously returned 1
# in both branches, which made the whole test column constant.
train['FLAG_OWN_REALTY'] = train['FLAG_OWN_REALTY'].apply(lambda x: 1 if x == 'Y' else 0)
test['FLAG_OWN_REALTY'] = test['FLAG_OWN_REALTY'].apply(lambda x: 1 if x == 'Y' else 0)

In [None]:
# AMT_ANNUITY: where null, estimate it as
#   (average annuity / credit ratio) * the row's own AMT_CREDIT.
# The ratio is computed on train only and reused for test.
avgAnnuityRate = (train['AMT_ANNUITY']/train['AMT_CREDIT']).mean()

# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form warns in pandas 2.x and does nothing
# once Copy-on-Write is enabled.
train['AMT_ANNUITY'] = train['AMT_ANNUITY'].fillna(avgAnnuityRate * train['AMT_CREDIT'])
# Each test row is filled from test's own AMT_CREDIT. Using train's column here
# would align by index label and borrow the credit amount of an unrelated row.
test['AMT_ANNUITY'] = test['AMT_ANNUITY'].fillna(avgAnnuityRate * test['AMT_CREDIT'])


In [None]:
# AMT_GOODS_PRICE: where null, use the train mean (the same value is applied to test).
goodsPriceMean = train['AMT_GOODS_PRICE'].mean()
# Assign back rather than fillna(inplace=True) on the column selection; the
# chained in-place form warns in pandas 2.x and is a no-op under Copy-on-Write.
train['AMT_GOODS_PRICE'] = train['AMT_GOODS_PRICE'].fillna(goodsPriceMean)
test['AMT_GOODS_PRICE'] = test['AMT_GOODS_PRICE'].fillna(goodsPriceMean)

In [None]:
# NAME_TYPE_SUITE: categorical (one-hot encoded later in the notebook).
# Nulls are replaced with the category 'Unaccompanied'.
# Assign back rather than fillna(inplace=True) on the column selection; the
# chained in-place form warns in pandas 2.x and is a no-op under Copy-on-Write.
train['NAME_TYPE_SUITE'] = train['NAME_TYPE_SUITE'].fillna('Unaccompanied')
test['NAME_TYPE_SUITE'] = test['NAME_TYPE_SUITE'].fillna('Unaccompanied')

In [None]:
# OWN_CAR_AGE: numeric, the older the worse.
# Nulls get the sentinel age 100, then the sign is flipped so that a larger
# value means a newer car and the null rows end up at -100.
# Assign back rather than fillna(inplace=True) on the column selection; the
# chained in-place form warns in pandas 2.x and is a no-op under Copy-on-Write.
train['OWN_CAR_AGE'] = train['OWN_CAR_AGE'].fillna(100) * -1
test['OWN_CAR_AGE'] = test['OWN_CAR_AGE'].fillna(100) * -1
train['OWN_CAR_AGE'].describe()

In [None]:
# CNT_FAM_MEMBERS: keep only rows with a strictly positive value. A null never
# compares greater than 0, so rows where the column is null are dropped too.
# NOTE(review): rows are removed from test as well - confirm that is intended
# if every test row needs a prediction.
train = train.loc[train['CNT_FAM_MEMBERS'].gt(0)]
test = test.loc[test['CNT_FAM_MEMBERS'].gt(0)]

In [None]:
# EXT_SOURCE_1/2/3: fill nulls with the train mean of the respective column
# (the train means are reused for test).
mean1 = train['EXT_SOURCE_1'].mean()
mean2 = train['EXT_SOURCE_2'].mean()
mean3 = train['EXT_SOURCE_3'].mean()
# Assign back rather than fillna(inplace=True) on the column selection; the
# chained in-place form warns in pandas 2.x and is a no-op under Copy-on-Write.
train['EXT_SOURCE_1'] = train['EXT_SOURCE_1'].fillna(mean1)
train['EXT_SOURCE_2'] = train['EXT_SOURCE_2'].fillna(mean2)
train['EXT_SOURCE_3'] = train['EXT_SOURCE_3'].fillna(mean3)
test['EXT_SOURCE_1'] = test['EXT_SOURCE_1'].fillna(mean1)
test['EXT_SOURCE_2'] = test['EXT_SOURCE_2'].fillna(mean2)
test['EXT_SOURCE_3'] = test['EXT_SOURCE_3'].fillna(mean3)


In [None]:
# Fill every column ending in _AVG, _MEDI or _MODE with its train mean.
# The four _MODE columns listed below are skipped here: three of them are
# one-hot encoded later and EMERGENCYSTATE_MODE gets its own 0/1 encoding.
for col in train.columns.tolist():
    if col.endswith(('_AVG', '_MEDI', '_MODE')) and col not in ['FONDKAPREMONT_MODE','HOUSETYPE_MODE',
                    'WALLSMATERIAL_MODE','EMERGENCYSTATE_MODE']:
        mean = train[col].mean()
        # Assign back rather than fillna(inplace=True) on the column selection;
        # the chained in-place form is a no-op under pandas Copy-on-Write.
        train[col] = train[col].fillna(mean)
        test[col] = test[col].fillna(mean)
        

In [None]:
# EMERGENCYSTATE_MODE: 'Y' -> 1, anything else (including null) -> 0.
# train and test must use the same mapping; the test line previously returned 1
# in both branches, which made the whole test column constant.
# NOTE(review): confirm the positive label in the raw data really is 'Y' and
# not e.g. 'Yes' - otherwise this column is all zeros.
train['EMERGENCYSTATE_MODE'] = train['EMERGENCYSTATE_MODE'].apply(lambda x: 1 if x == 'Y' else 0)
test['EMERGENCYSTATE_MODE'] = test['EMERGENCYSTATE_MODE'].apply(lambda x: 1 if x == 'Y' else 0)


In [None]:
# Fill nulls in every *_CIRCLE column with the train mean of that column
# (the train mean is reused for test).
for col in train.columns.tolist():
    if col.endswith('_CIRCLE'):
        mean = train[col].mean()
        # Assign back rather than fillna(inplace=True) on the column selection;
        # the chained in-place form is a no-op under pandas Copy-on-Write.
        train[col] = train[col].fillna(mean)
        test[col] = test[col].fillna(mean)


In [None]:
# DAYS_LAST_PHONE_CHANGE: negative numeric; nulls are filled with 0 (no rows are dropped).
# Assign back rather than fillna(inplace=True) on the column selection; the
# chained in-place form warns in pandas 2.x and is a no-op under Copy-on-Write.
train['DAYS_LAST_PHONE_CHANGE'] = train['DAYS_LAST_PHONE_CHANGE'].fillna(0)
test['DAYS_LAST_PHONE_CHANGE'] = test['DAYS_LAST_PHONE_CHANGE'].fillna(0)


In [None]:
# All AMT_REQ_CREDIT_BUREAU_* columns: fill nulls with the train mean of the
# column (reused for test). Author's idea for later: maybe bin these instead.
for col in train.columns.tolist():
    if 'AMT_REQ_CREDIT_BUREAU_' in col:
        mean = train[col].mean()
        # Assign back rather than fillna(inplace=True) on the column selection;
        # the chained in-place form is a no-op under pandas Copy-on-Write.
        train[col] = train[col].fillna(mean)
        test[col] = test[col].fillna(mean)


In [None]:
# Run a garbage-collection pass before the combined train+test frame is built in the next cell.
gc.collect()

In [None]:
# One-hot encode the categorical columns on train and test stacked together,
# so both frames end up with the same dummy columns, then split the result
# back by row position (the first `train_objs_num` rows are train).
# Note: test has no TARGET of its own, so after the concat its TARGET is NaN.
categorical_cols = [
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE',
]
train_objs_num = train.shape[0]
dataset = pd.get_dummies(
    pd.concat([train, test], axis=0),
    columns=categorical_cols,
    prefix_sep='__',
)
train = dataset.iloc[:train_objs_num]
test = dataset.iloc[train_objs_num:]
train.shape

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the row id and the label.
X = train.drop(['SK_ID_CURR','TARGET'], axis=1)
y = train['TARGET']

# Hold out 20% of the rows. random_state makes the split reproducible across
# kernel restarts; stratify=y keeps the (imbalanced) class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def get_best_model_and_accuracy(model, params, X, y, scoring=None):
    """Grid-search ``params`` for ``model`` on (X, y) and print a short report.

    Parameters
    ----------
    model : estimator handed to GridSearchCV.
    params : parameter grid handed to GridSearchCV.
    X, y : data and labels passed to ``grid.fit``.
    scoring : optional, forwarded to GridSearchCV. The default ``None`` keeps
        the previous behaviour (the estimator's own default scorer, reported
        as "Accuracy"). Pass e.g. ``'roc_auc'`` when accuracy is not
        informative, such as on a heavily imbalanced target.

    Returns
    -------
    The fitted GridSearchCV object, so callers can reach ``best_estimator_``,
    ``best_params_`` and ``cv_results_``. Earlier versions returned None.
    """
    grid = GridSearchCV(model, # the model to grid search
                        params, # the parameter set to try
                        scoring=scoring, # None -> estimator's default scorer
                        error_score=0.) # if a parameter set raises an error, continue and score it as 0
    grid.fit(X, y) # fit the model and parameters

    # label the score with the metric that was actually used
    metric_name = "Accuracy" if scoring is None else str(scoring)
    print ("Best {}: {}".format(metric_name, grid.best_score_))
    # the parameters that produced the best score
    print ("Best Parameters: {}".format(grid.best_params_))
    # the average time it took a model to fit to the data (in seconds)
    print ("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # the average time it took a model to score the held-out fold (in seconds)
    print ("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))
    return grid

# Parameter grids, one per candidate model.

# Logistic Regression
lr_params = {'C':[1e-1, 1e0, 1e1, 1e2], 'penalty':['l1', 'l2']}

# KNN
knn_params = {'n_neighbors': [1, 3, 5, 7]}

# Decision Tree
tree_params = {'max_depth':[None, 1, 3, 5, 7]}

# Random Forest
forest_params = {'n_estimators': [10, 50, 100], 'max_depth': [None, 1, 3, 5, 7]}

# The grid above tries both 'l1' and 'l2'. The 'lbfgs' solver (the default in
# current scikit-learn) rejects 'l1', and because the grid search is run with
# error_score=0. those candidates would silently be scored 0 instead of being
# evaluated. 'liblinear' supports both penalties.
lr = LogisticRegression(solver='liblinear')
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

# Only logistic regression is searched by default; uncomment to try the others.
get_best_model_and_accuracy(lr, lr_params, X, y)
#get_best_model_and_accuracy(knn, knn_params, X, y)
#get_best_model_and_accuracy(d_tree, tree_params, X, y)
#get_best_model_and_accuracy(forest, forest_params, X, y)