## Import packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from category_encoders import OrdinalEncoder, OneHotEncoder

In [19]:
from sklearn.model_selection import cross_val_score

## Read in the data

In [6]:
# Absolute path
#train = pd.read_csv('/Users/sisichen/Desktop/60mmchallenge_04_16/Data/train_ctrUa4K.csv')

In [14]:
# Relative path 
train = pd.read_csv('../Data/train_ctrUa4K.csv', index_col='Loan_ID')

In [15]:
train.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [17]:
cat_cols = train.select_dtypes('object').columns
for col in cat_cols:
    print(col)
    print(train[col].value_counts())
    print('/')

Gender
Male      489
Female    112
Name: Gender, dtype: int64
/
Married
Yes    398
No     213
Name: Married, dtype: int64
/
Dependents
0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64
/
Education
Graduate        480
Not Graduate    134
Name: Education, dtype: int64
/
Self_Employed
No     500
Yes     82
Name: Self_Employed, dtype: int64
/
Property_Area
Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64
/
Loan_Status
Y    422
N    192
Name: Loan_Status, dtype: int64
/


## Split Data

In [None]:
target= 'Loan_Status'
X = train.drop(columns=target)
y = train[target]

## Set the baseline

In [30]:
y.value_counts(normalize=True).max()

0.6872964169381107

## Model selection
1. Logistic Regression

In [22]:
model_lr = make_pipeline(OneHotEncoder(),
                         SimpleImputer(),
                         StandardScaler(),
                         LogisticRegression(max_iter=1000))

In [23]:
cv_lr = cross_val_score(model_lr, X, y, cv=5, n_jobs=-1)
print(cv_lr)

[0.80487805 0.7804878  0.7804878  0.85365854 0.81147541]


In [31]:
print(cv_lr.mean())
print(cv_lr.std())

0.8061975209916034
0.026848181033511012


In [27]:
model_rf = make_pipeline(OrdinalEncoder(),
                         SimpleImputer(),
                         RandomForestClassifier(random_state=42))

In [28]:
cv_rf = cross_val_score(model_rf, X, y, cv=5, n_jobs=-1)
print(cv_rf)
print(cv_rf.mean())
print(cv_rf.std())

[0.80487805 0.76422764 0.77235772 0.83739837 0.81147541]
0.7980674396907904
0.026755193303845057


In [32]:
model_rf

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=None, drop_invariant=False,
                                handle_missing='value', handle_unknown='value',
                                mapping=None, return_df=True, verbose=0)),
                ('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('randomforestclassifier',
                 RandomForestClassifier...=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                         

## Hyperparameter tuning

In [34]:
para_grid ={
           'randomforestclassifier__n_estimators': range(50, 250, 5),
           'randomforestclassifier__max_depth': range(5, 10, 2)}

model_rs = RandomizedSearchCV(model_rf, 
                              param_distributions=para_grid,
                              n_iter=3,
                              cv=3,
                              verbose=2)
                
model_rs.fit(X,y)                            

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] randomforestclassifier__n_estimators=65, randomforestclassifier__max_depth=5 
[CV]  randomforestclassifier__n_estimators=65, randomforestclassifier__max_depth=5, total=   0.1s
[CV] randomforestclassifier__n_estimators=65, randomforestclassifier__max_depth=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  randomforestclassifier__n_estimators=65, randomforestclassifier__max_depth=5, total=   0.1s
[CV] randomforestclassifier__n_estimators=65, randomforestclassifier__max_depth=5 
[CV]  randomforestclassifier__n_estimators=65, randomforestclassifier__max_depth=5, total=   0.1s
[CV] randomforestclassifier__n_estimators=230, randomforestclassifier__max_depth=7 
[CV]  randomforestclassifier__n_estimators=230, randomforestclassifier__max_depth=7, total=   0.4s
[CV] randomforestclassifier__n_estimators=230, randomforestclassifier__max_depth=7 
[CV]  randomforestclassifier__n_estimators=230, randomforestclassifier__max_depth=7, total=   0.4s
[CV] randomforestclassifier__n_estimators=230, randomforestclassifier__max_depth=7 
[CV]  randomforestclassifier__n_estimators=230, randomforestclassifier__max_depth=7, total=   0.3s
[CV] randomforestclassifier__n_estimators=160, randomforestclassifier__max_depth=7 
[CV]  randomforestclassifier__n_estimators=160, randomforestclassifier__max_depth=7, tot

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.3s finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=None,
                                                             drop_invariant=False,
                                                             handle_missing='value',
                                                             handle_unknown='value',
                                                             mapping=None,
                                                             return_df=True,
                                                             verbose=0)),
                                             ('simpleimputer',
                                              SimpleImputer(add_indicator=False,
                                                            copy=True,
                                                            fill_value=None,


In [35]:
model_rs.best_score_

0.8045990754025188

## make submission.csv

In [36]:
X_test = pd.read_csv('../Data/test_lAUu6dG.csv', index_col='Loan_ID')

In [38]:
submission = pd.read_csv('../Data/sample_submission_49d68Cx.csv', index_col='Loan_ID')

In [39]:
submission

Unnamed: 0_level_0,Loan_Status
Loan_ID,Unnamed: 1_level_1
LP001015,N
LP001022,N
LP001031,N
LP001035,N
LP001051,N
...,...
LP002971,N
LP002975,N
LP002980,N
LP002986,N


In [40]:
submission['Loan_Status'] = model_rs.predict(X_test)

In [42]:
submission['Loan_Status'].value_counts()

Y    307
N     60
Name: Loan_Status, dtype: int64

In [43]:
submission.to_csv('submission.csv', index=False)