In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [17]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [5]:
num_train = train.copy()
num_test = test.copy()

In [6]:
def fillna(df1, df2, columns, values):
    for i, col in enumerate(columns):
        df1[col].fillna(values[i], inplace=True)
        df2[col].fillna(values[i], inplace=True)
    return df1, df2

num_train, num_test = fillna(num_train, 
                             num_test, 
                             ['Gender', 'Married', 'Education',
                              'Self_Employed', 'Property_Area',
                              'Dependents', 'Credit_History', 
                              'LoanAmount', 'Loan_Amount_Term'],
                             ['None', 'None', 'None', 'None', 'None', 
                              -1, -1, -1, -1])

In [7]:
def label_encoder(df1, df2, columns):
    for col in columns:
        cur = LabelEncoder()
        df1[col] = cur.fit_transform(df1[col])
        df2[col] = cur.transform(df2[col])
    return df1, df2

num_train, num_test = label_encoder(num_train, num_test, ['Gender', 'Married', 'Education', 
                                                          'Self_Employed', 'Property_Area'])

In [12]:
def change_dep(df1, df2):
    df1['Dependents'] = df1['Dependents'].apply(lambda x: 3 if str(x) == '3+' else int(x))
    df2['Dependents'] = df2['Dependents'].apply(lambda x: 3 if str(x) == '3+' else int(x))
    return df1, df2

num_train, num_test = change_dep(num_train, num_test)

In [8]:
ids = list(num_train['Loan_ID'].values)
ids.extend(num_test['Loan_ID'].values)
ID = LabelEncoder()
ID.fit_transform(ids)
num_train['Loan_ID'] = ID.transform(num_train['Loan_ID'])
num_test['Loan_ID'] = ID.transform(num_test['Loan_ID'])

In [9]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [13]:
X_train, X_test, y_train, y_test = train_test_split(num_train.drop('Loan_Status', axis=1), num_train['Loan_Status'], random_state=42, test_size=.25)

<b> BASE MODEL

In [14]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
rf.score(X_test, y_test)

0.77272727272727271

In [16]:
X_train.to_csv('X_train.csv')
y_train.to_csv('y_train.csv')
X_test.to_csv('X_test.csv')
y_test.to_csv('y_test.csv')

In [67]:
test = pd.read_csv('test.csv')

In [68]:
num_test = test.copy()

In [69]:
num_test['Gender'].fillna('None', inplace=True)
num_test['Gender'] = gender.transform(num_test['Gender'])

num_test['Married'].fillna('None', inplace=True)
num_test['Married'] = married.transform(num_test['Married'])

num_test['Education'].fillna('None', inplace=True)
num_test['Education'] = education.transform(num_test['Education'])

num_test['Self_Employed'].fillna('None', inplace=True)
num_test['Self_Employed'] = self.transform(num_test['Self_Employed'])

num_test['Property_Area'].fillna('None', inplace=True)
num_test['Property_Area'] = area.transform(num_test['Property_Area'])

ID = LabelEncoder()
num_test['Loan_ID'] = ID.fit_transform(num_test['Loan_ID'])

num_test['Dependents'].fillna(-1, inplace=True)
num_test['Dependents'] = num_test['Dependents'].apply(lambda x: 3 if str(x) == '3+' else int(x))

num_test['Loan_ID'] = num_test['Loan_ID'].astype('category')

num_test['Credit_History'].fillna(-1, inplace=True)
num_test['LoanAmount'].fillna(-1, inplace=True)
num_test['Loan_Amount_Term'].fillna(-1, inplace=True)

In [70]:
rf.predict(num_test)

array(['Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'N',
       'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y

In [71]:
from sklearn.metrics import roc_curve, auc

In [None]:
roc_curve()

In [72]:
num_test

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0,1,2,0,0,0,5720,0,110.0,360.0,1.0,2
1,1,1,2,1,0,0,3076,1500,126.0,360.0,1.0,2
2,2,1,2,2,0,0,5000,1800,208.0,360.0,1.0,2
3,3,1,2,2,0,0,2340,2546,100.0,360.0,-1.0,2
4,4,1,0,0,1,0,3276,0,78.0,360.0,1.0,2
5,5,1,2,0,1,2,2165,3422,152.0,360.0,1.0,2
6,6,0,0,1,1,0,2226,0,59.0,360.0,1.0,1
7,7,1,2,2,1,0,3881,0,147.0,360.0,0.0,0
8,8,1,2,2,0,1,13633,0,280.0,240.0,1.0,2
9,9,1,0,0,1,0,2400,2400,123.0,360.0,1.0,1


In [76]:
dp = rf.decision_path(num_train.drop('Loan_Status', axis=1))