## <a id='toc1_1_'></a>[Import packages](#toc0_)

In [21]:
# import packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.impute import KNNImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import warnings

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
%matplotlib inline

## <a id='toc1_2_'></a>[Import datasets](#toc0_)

In [64]:
# Import the train & test datasets.
# The data directory can be overridden through the CAPSTONE_DATA_DIR environment
# variable; the default is the original local path, so existing setups keep working.
DATA_DIR = os.environ.get(
    'CAPSTONE_DATA_DIR',
    '/Users/imdongchan/Documents/GitHub/MSBA/CapstoneProject/data/raw',
)

# index_col=0 uses the first CSV column as the row index of each frame.
train = pd.read_csv(os.path.join(DATA_DIR, 'application_train.csv'), header=0, sep=',', index_col=0)
test = pd.read_csv(os.path.join(DATA_DIR, 'application_test.csv'), header=0, sep=',', index_col=0)

## <a id='toc1_3_'></a>[Drop columns with more than 50% missing values](#toc0_)

In [65]:
# Percentage of missing values per training column, largest first.
null_counts = train.isnull().sum()
temp = (null_counts / len(train) * 100).sort_values(ascending=False)

# Two-column summary table: column name and its missing percentage.
train_missing_df = pd.DataFrame({'Column': temp.index, 'Prop': temp.values})

# Columns where more than half of the rows are missing.
over_half_missing = train_missing_df['Prop'] > 50
columns_to_drop = train_missing_df.loc[over_half_missing, 'Column'].to_list()
columns_to_drop

['COMMONAREA_AVG',
 'COMMONAREA_MODE',
 'COMMONAREA_MEDI',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAPARTMENTS_MEDI',
 'FONDKAPREMONT_MODE',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAPARTMENTS_MODE',
 'FLOORSMIN_AVG',
 'FLOORSMIN_MODE',
 'FLOORSMIN_MEDI',
 'YEARS_BUILD_AVG',
 'YEARS_BUILD_MEDI',
 'YEARS_BUILD_MODE',
 'OWN_CAR_AGE',
 'LANDAREA_MEDI',
 'LANDAREA_AVG',
 'LANDAREA_MODE',
 'BASEMENTAREA_MEDI',
 'BASEMENTAREA_AVG',
 'BASEMENTAREA_MODE',
 'EXT_SOURCE_1',
 'NONLIVINGAREA_AVG',
 'NONLIVINGAREA_MODE',
 'NONLIVINGAREA_MEDI',
 'ELEVATORS_MODE',
 'ELEVATORS_AVG',
 'ELEVATORS_MEDI',
 'WALLSMATERIAL_MODE',
 'APARTMENTS_AVG',
 'APARTMENTS_MEDI',
 'APARTMENTS_MODE',
 'ENTRANCES_AVG',
 'ENTRANCES_MEDI',
 'ENTRANCES_MODE',
 'LIVINGAREA_MEDI',
 'LIVINGAREA_MODE',
 'LIVINGAREA_AVG',
 'HOUSETYPE_MODE']

In [66]:
# Remove the sparse columns (selected from the training data) from both frames,
# producing new frames and leaving `train` / `test` untouched.
train_drop_na = train.drop(columns_to_drop, axis=1)
test_drop_na = test.drop(columns_to_drop, axis=1)

In [67]:
# Percentage of missing values per remaining training column, largest first.
train_null_counts = train_drop_na.isnull().sum()
temp = (train_null_counts / len(train_drop_na) * 100).sort_values(ascending=False)

# Show the 20 columns with the highest share of missing values.
train_missing_df = pd.DataFrame({'Column': temp.index, 'Prop': temp.values})
train_missing_df.head(20)

Unnamed: 0,Column,Prop
0,FLOORSMAX_MEDI,49.760822
1,FLOORSMAX_MODE,49.760822
2,FLOORSMAX_AVG,49.760822
3,YEARS_BEGINEXPLUATATION_MEDI,48.781019
4,YEARS_BEGINEXPLUATATION_MODE,48.781019
5,YEARS_BEGINEXPLUATATION_AVG,48.781019
6,TOTALAREA_MODE,48.268517
7,EMERGENCYSTATE_MODE,47.398304
8,OCCUPATION_TYPE,31.345545
9,EXT_SOURCE_3,19.825307


In [68]:
# Percentage of missing values per remaining test column, largest first.
test_null_counts = test_drop_na.isnull().sum()
temp = (test_null_counts / len(test_drop_na) * 100).sort_values(ascending=False)

# Show the 20 columns with the highest share of missing values.
test_missing_df = pd.DataFrame({'Column': temp.index, 'Prop': temp.values})
test_missing_df.head(20)

Unnamed: 0,Column,Prop
0,FLOORSMAX_MEDI,47.843837
1,FLOORSMAX_MODE,47.843837
2,FLOORSMAX_AVG,47.843837
3,YEARS_BEGINEXPLUATATION_MEDI,46.889874
4,YEARS_BEGINEXPLUATATION_MODE,46.889874
5,YEARS_BEGINEXPLUATATION_AVG,46.889874
6,TOTALAREA_MODE,46.413918
7,EMERGENCYSTATE_MODE,45.562531
8,OCCUPATION_TYPE,32.014197
9,EXT_SOURCE_3,17.782701


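## <a id='toc1_4_'></a>[Impute Missing values by imputer](#toc0_)

**TODO:** this section is currently empty — no imputation is applied to the missing values before modelling.
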
## <a id='toc1_5_'></a>[Encoding the categorical variables](#toc0_)

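There are two methods for encoding categorical variables as numeric values; both are used below:
- Label Encoding — applied to categorical columns with at most two unique values
- One-hot Encoding — applied to the remaining categorical columns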

#### <a id='toc1_5_1_1_'></a>[Label Encoding](#toc0_)

In [48]:
def label_encoding(train, test):
    """Label-encode, in place, every binary categorical column of both frames.

    Each column of ``train`` whose dtype is ``object`` and which holds at most
    two distinct values is encoded with a ``LabelEncoder`` fitted on the
    training column only; the same fitted encoder then transforms the column
    in ``train`` and in ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame. Used to select and fit the encoders; modified in place.
    test : pandas.DataFrame
        Test frame. Must contain every column that gets encoded; modified in place.

    Returns
    -------
    None
        The number of encoded columns is printed, not returned.
    """
    le_count = 0
    for col in train:
        if train[col].dtype != 'object':
            continue

        # dropna=False counts a missing value as its own category, which is
        # what Series.unique() (used previously) does as well.
        if train[col].nunique(dropna=False) > 2:
            continue

        # LabelEncoder is imported directly at the top of the notebook; the
        # `preprocessing` module itself is never imported, so it must not be
        # referenced as `preprocessing.LabelEncoder`.
        le = LabelEncoder()

        # Fit on the training data only, then apply the same mapping to both frames.
        le.fit(train[col])
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

        # Keep track of how many columns were label encoded.
        le_count += 1

    print('%d columns were label encoded.' % le_count)

# Label-encode the binary categorical columns of both frames in place.
# NOTE(review): this is called on the raw `train` / `test` frames, not on the
# `train_drop_na` / `test_drop_na` frames created earlier — confirm that is intended.
label_encoding(train, test)

3 columns were label encoded.


#### <a id='toc1_5_1_2_'></a>[One-Hot Encoding](#toc0_)

In [49]:
# One-hot encode the categorical columns of each frame.
# NOTE(review): the frames are encoded independently, so their dummy columns can
# differ (the recorded output shows 242 vs 238 columns) — they would need to be
# aligned before a model trained on `train` can score `test`.
train, test = pd.get_dummies(train), pd.get_dummies(test)

for label, frame in (('Training Features shape: ', train), ('Testing Features shape: ', test)):
    print(label, frame.shape)

Training Features shape:  (307511, 242)
Testing Features shape:  (48744, 238)


## <a id='toc1_6_'></a>[Model predictor & target assignment](#toc0_)

In [50]:
# Target vector: the TARGET column; predictors: every other column.
y = train['TARGET']
X = train.drop('TARGET', axis=1)

In [51]:
# Hold out 30% of the rows for validation; the seed makes the split repeatable.
# NOTE(review): consider stratify=y if TARGET turns out to be imbalanced — confirm.
x_train, x_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

**Table of contents**<a id='toc0_'></a>    
- [Import packages](#toc1_1_)    
  - [Import datasets](#toc1_2_)    
  - [Drop columns with more than 50% missing values](#toc1_3_)    
  - [Impute Missing values by imputer](#toc1_4_)    
  - [Encoding the categorical variables](#toc1_5_)    
      - [Label Encoding](#toc1_5_1_1_)    
      - [One-Hot Encoding](#toc1_5_1_2_)    
  - [Model predictor & target assignment](#toc1_6_)    
  - [Model selection & Comparison](#toc1_7_)    
    - [Decision Tree](#toc1_7_1_)    
    - [Logistic Regression](#toc1_7_2_)    
    - [Random Forest](#toc1_7_3_)    
  - [Support Vector Machine (SVM)](#toc1_8_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [52]:
# Sanity-check the split: predictors and target must have matching row counts.
assert x_train.shape[0] == y_train.shape[0]
assert x_valid.shape[0] == y_valid.shape[0]

# Label every shape so the output can be read without looking at the code.
print("X train's shape: ", x_train.shape)
print("y train's shape: ", y_train.shape)
print("X valid's shape: ", x_valid.shape)
print("y valid's shape: ", y_valid.shape)

(215257, 241)
(215257,)
(92254, 241)
(92254,)


## <a id='toc1_7_'></a>[Model selection & Comparison](#toc0_)

### <a id='toc1_7_1_'></a>[Decision Tree](#toc0_)

In [None]:
# Shallow decision tree (depth capped at 5); seeded like the other estimators
# in this notebook so repeated runs build the same tree.
dcn_tree = DecisionTreeClassifier(max_depth=5, random_state=42)

# Train on the training split. The split cell names these `x_train` / `y_train`;
# `train_x` / `train_y` are never defined and raise NameError.
dcn_tree.fit(x_train, y_train)

### <a id='toc1_7_2_'></a>[Logistic Regression](#toc0_)

In [None]:
# Logistic regression with a small C (inverse regularization strength),
# i.e. strong regularization.
log_reg = LogisticRegression(C=0.0001)

# Train on the training split. The split cell names these `x_train` / `y_train`;
# `train_x` / `train_y` are never defined and raise NameError.
# NOTE(review): no imputation step is visible in this notebook and the missing-value
# summaries show NaNs remain; LogisticRegression does not accept NaN inputs, so this
# fit presumably needs imputed (and ideally scaled) features first — confirm.
log_reg.fit(x_train, y_train)

### <a id='toc1_7_3_'></a>[Random Forest](#toc0_)

In [None]:
# Random forest with default hyper-parameters and a fixed seed.
rforest = RandomForestClassifier(random_state=42)

# Train on the training split. The split cell names these `x_train` / `y_train`;
# `train_x` / `train_y` are never defined and raise NameError.
rforest.fit(x_train, y_train)

## <a id='toc1_8_'></a>[Support Vector Machine (SVM)](#toc0_)

In [None]:
# Support vector classifier with an RBF kernel and random_state fixed at 42.
# The estimator is only constructed here; it is not fitted in this cell.
svc = SVC(kernel='rbf', random_state=42)