In [652]:
#Reading data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df_train = pd.read_csv("train_loan_prediction.csv")
df_test = pd.read_csv("test_loan_prediction.csv")
df_train.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [653]:
df_train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [654]:
df_train.apply(lambda x: sum(x.isnull()),axis=0)
df_test.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [655]:
#filling missing values for gender
df_train['Gender'].value_counts()
df_test['Gender'].value_counts()
df_train['Gender'].fillna(df_train['Gender'].mode()[0], inplace=True)
df_test['Gender'].fillna(df_test['Gender'].mode()[0], inplace=True)
#df_train['Gender'].value_counts()

In [656]:
#filling missing values for Married column
df_train['Married'].fillna(df_train['Married'].mode()[0], inplace=True)
df_train['Married'].value_counts()

Yes    401
No     213
Name: Married, dtype: int64

In [657]:
#filling missing values for Dependents column
df_train['Dependents'].fillna(df_train['Dependents'].mode()[0], inplace=True)
df_test['Dependents'].fillna(df_test['Dependents'].mode()[0], inplace=True)
df_train['Dependents'].value_counts()
df_test['Dependents'].value_counts()

0     210
2      59
1      58
3+     40
Name: Dependents, dtype: int64

In [658]:
#filling missing values for Self_Employed column
df_train['Self_Employed'].value_counts()
df_train['Self_Employed'].fillna(df_train['Self_Employed'].mode()[0], inplace=True)
df_test['Self_Employed'].fillna(df_test['Self_Employed'].mode()[0], inplace=True)
df_test['Self_Employed'].value_counts()

No     330
Yes     37
Name: Self_Employed, dtype: int64

In [659]:
#filling missing values for Loan_Amount_Term
df_train['Loan_Amount_Term'].value_counts()
df_train['Loan_Amount_Term'].fillna(df_train['Loan_Amount_Term'].mean(), inplace=True)
df_test['Loan_Amount_Term'].value_counts()
df_test['Loan_Amount_Term'].fillna(df_test['Loan_Amount_Term'].mean(), inplace=True)

In [660]:
#filling missing values for Credit_History
conv_map = {'Y':1,'N':0}
df_train['Loan_Status'] = df_train['Loan_Status'].map(conv_map)
df_train['Credit_History'].fillna(df_train['Loan_Status'], inplace=True)
df_test['Credit_History'].value_counts()
df_test['Credit_History'].fillna(df_test['Credit_History'].mode()[0], inplace=True)
df_train['Credit_History'].value_counts()
#print df_train['Loan_Status']

1.0    512
0.0    102
Name: Credit_History, dtype: int64

In [661]:
#Coming to ApplicantIncome. One intuition can be that some applicants have lower income but strong support Co-applicants. 
#So it might be a good idea to combine both incomes as total income and take a log transformation of the same
df_train['TotalIncome'] = df_train['ApplicantIncome'] + df_train['CoapplicantIncome']
df_train['TotalIncome_log'] = np.log(df_train['TotalIncome'])
df_test['TotalIncome'] = df_test['ApplicantIncome'] + df_test['CoapplicantIncome']
df_test['TotalIncome_log'] = np.log(df_test['TotalIncome'])

In [662]:
#Including one more variable 
def categ(x):
    if x['TotalIncome'] >= 7000.00:
        return "High"
    elif (x['TotalIncome'] >= 4500.00) and (x['TotalIncome']) < 7000.00:
        return "Mid"
    else:
        return "Low"
df_train['TotalIncomeCat'] = df_train.apply(lambda x : categ(x),axis=1)
df_test['TotalIncomeCat'] = df_test.apply(lambda x : categ(x),axis=1)

In [663]:
tab_train = df_train.pivot_table(values=["LoanAmount"], index=["TotalIncomeCat"], aggfunc=np.mean)
print tab_train

                LoanAmount
TotalIncomeCat            
High            223.052941
Low              95.005435
Mid             131.411765


In [664]:
#filling missing values for LoanAmount
tab_train = df_train.pivot_table(values=["LoanAmount"], index=["Gender","Married","Self_Employed"], aggfunc=np.mean)
tab_test = df_test.pivot_table(values=["LoanAmount"], index=["Gender","Married","Self_Employed"], aggfunc=np.mean)

def fill(x,df):
    if pd.isnull(x['LoanAmount']):
        return df.loc[x['Gender'],x['Married'],x['Self_Employed']][0]
    else:
        return x['LoanAmount']

df_train['LoanAmount'] = df_train.apply(lambda x : format(fill(x,tab_train),'.3f'),axis=1)
df_train['LoanAmount'] = df_train['LoanAmount'].astype('float64') 
df_test['LoanAmount'] = df_test.apply(lambda x : format(fill(x,tab_test),'.3f'),axis=1)
df_test['LoanAmount'] = df_test['LoanAmount'].astype('float64') 
#df_test['LoanAmount'].value_counts()
#df_train['LoanAmount'].value_counts()
#df_train['LoanAmount'].fillna((df_train[np.abs(df_train.LoanAmount-df_train.LoanAmount.mean())<=(3*df_train.LoanAmount.std())])['LoanAmount'].mean(), inplace=True)

#df_test['LoanAmount'].fillna((df_test[np.abs(df_test.LoanAmount-df_test.LoanAmount.mean())<=(3*df_test.LoanAmount.std())])['LoanAmount'].mean(), inplace=True)

In [665]:
#Since the extreme values are practically possible, 
#i.e. some people might apply for high value loans due to specific needs. 
#So instead of treating them as outliers, let’s try a log transformation to nullify their effect
df_train['LoanAmount_log'] = np.log(df_train['LoanAmount'])
df_test['LoanAmount_log'] = np.log(df_test['LoanAmount'])
#df_train['LoanAmount_log'].hist(bins=50)
#df_train['LoanAmount'].hist(bins=50)
#plt.show()

In [666]:
df_train['EMI'] = (df_train['LoanAmount']/df_train['Loan_Amount_Term']) * 1000
df_train['PropofIncToLoan'] = (df_train['EMI']/df_train['TotalIncome']) * 100
df_test['EMI'] = (df_test['LoanAmount']/df_test['Loan_Amount_Term']) * 1000
df_test['PropofIncToLoan'] = (df_test['EMI']/df_test['TotalIncome']) * 100
df_train['EMI_log'] = np.log(df_train['EMI'])
df_train['PropofIncToLoan_log'] = np.log(df_train['PropofIncToLoan'])
df_test['EMI_log'] = np.log(df_test['EMI'])
df_test['PropofIncToLoan_log'] = np.log(df_test['PropofIncToLoan'])

In [708]:
#Converting all our categorical variables into numeric by encoding the categories,
#Since, sklearn requires all inputs to be numeric
df_train.dtypes
df_test.dtypes
from sklearn.preprocessing import LabelEncoder
var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','TotalIncomeCat']
le = LabelEncoder()
df_train['Loan_Status'] = le.fit_transform(df_train['Loan_Status'])
for i in var_mod:
    df_train[i] = le.fit_transform(df_train[i])
    df_test[i] = le.fit_transform(df_test[i])
df_train.dtypes 
#df_test.dtypes

Loan_ID                 object
Gender                   int64
Married                  int64
Dependents               int64
Education                int64
Self_Employed            int64
ApplicantIncome          int64
CoapplicantIncome      float64
LoanAmount             float64
Loan_Amount_Term       float64
Credit_History         float64
Property_Area            int64
Loan_Status              int64
TotalIncome            float64
TotalIncome_log        float64
TotalIncomeCat           int64
LoanAmount_log         float64
EMI                    float64
PropofIncToLoan        float64
EMI_log                float64
PropofIncToLoan_log    float64
dtype: object

In [709]:
df_train.corr()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome,TotalIncome_log,TotalIncomeCat,LoanAmount_log,EMI,PropofIncToLoan,EMI_log,PropofIncToLoan_log
Gender,1.0,0.364569,0.172914,0.045364,-0.000525,0.058809,0.082912,0.111007,-0.073567,-0.006865,-0.025752,0.017987,0.093191,0.167285,0.038461,0.148095,0.061793,-0.040506,0.160195,-0.001574
Married,0.364569,1.0,0.334216,0.012304,0.004489,0.051708,0.075948,0.152389,-0.100863,0.014853,0.004257,0.091478,0.083319,0.151527,0.044408,0.186453,0.096872,-0.001156,0.202034,0.062972
Dependents,0.172914,0.334216,1.0,0.055752,0.056798,0.118202,0.03043,0.163571,-0.101054,-0.048068,-0.000244,0.010118,0.12559,0.112812,-0.080659,0.15676,0.103422,0.037936,0.180562,0.080986
Education,0.045364,0.012304,0.055752,1.0,-0.010383,-0.14076,-0.06229,-0.167302,-0.077242,-0.071397,-0.065243,-0.085884,-0.161362,-0.204861,0.176719,-0.143859,-0.07529,0.048206,-0.090782,0.12115
Self_Employed,-0.000525,0.004489,0.056798,-0.010383,1.0,0.12718,-0.0161,0.116729,-0.033943,0.008003,-0.03086,-0.0037,0.113,0.176709,-0.144895,0.111157,0.052237,-0.024884,0.1122,-0.066162
ApplicantIncome,0.058809,0.051708,0.118202,-0.14076,0.12718,1.0,-0.116605,0.565481,-0.045242,-0.010051,-0.0095,-0.00471,0.893037,0.717829,-0.386129,0.434827,0.320332,-0.115156,0.396195,-0.336167
CoapplicantIncome,0.082912,0.075948,0.03043,-0.06229,-0.0161,-0.116605,1.0,0.188396,-0.059675,-0.054737,0.010522,-0.059187,0.342781,0.383827,-0.152256,0.204978,0.136072,-0.065612,0.204257,-0.188336
LoanAmount,0.111007,0.152389,0.163571,-0.167302,0.116729,0.565481,0.188396,1.0,0.03835,-0.023158,-0.046349,-0.035986,0.620228,0.688103,-0.39104,0.895522,0.491451,0.019546,0.763861,0.112212
Loan_Amount_Term,-0.073567,-0.100863,-0.101054,-0.077242,-0.033943,-0.045242,-0.059675,0.03835,1.0,-0.003266,-0.07762,-0.020974,-0.06983,-0.05532,0.107911,0.085921,-0.500308,-0.516211,-0.444226,-0.442057
Credit_History,-0.006865,0.014853,-0.048068,-0.071397,0.008003,-0.010051,-0.054737,-0.023158,-0.003266,1.0,0.004565,0.595642,-0.034306,-0.004183,-0.062305,-0.043298,0.003127,0.0222,-0.028474,-0.027639


In [668]:
# creating generic function for any model 
#Import models from scikit learn module:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

#Generic function for making a classification model and accessing performance:
def classification_model(model, data, predictors, outcome):
  #Fit the model:
  model.fit(data[predictors],data[outcome])
  
  #Make predictions on training set:
  predictions = model.predict(data[predictors])
  
  #Print accuracy
  accuracy = metrics.accuracy_score(predictions,data[outcome])
  print "Accuracy : %s" % "{0:.3%}".format(accuracy)

  #Perform k-fold cross-validation with 5 folds
  kf = KFold(data.shape[0], n_folds=5)
  error = []
  for train, test in kf:
    # Filter training data
    train_predictors = (data[predictors].iloc[train,:])
    
    # The target we're using to train the algorithm.
    train_target = data[outcome].iloc[train]
    
    # Training the algorithm using the predictors and target.
    model.fit(train_predictors, train_target)
    
    #Record error from each cross-validation run
    error.append(model.score(data[predictors].iloc[test,:], data[outcome].iloc[test]))
 
  print "Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error))

  #Fit the model again so that it can be refered outside the function:
  model.fit(data[predictors],data[outcome]) 

In [338]:
# LogisticRegression with one variable
'''outcome_var = 'Loan_Status'
model = LogisticRegression()
predictor_var = ['Credit_History','PropofIncToLoan']
classification_model(model, df_train,predictor_var,outcome_var)'''

Accuracy : 80.945%
Cross-Validation Score : 81.109%


In [381]:
# LogisticRegression with multiple variable
#predictor_var = ['Credit_History','Education','Married','Self_Employed','Property_Area','PropofIncToLoan_log','EMI_log']
'''predictor_var = ['TotalIncome_log','LoanAmount_log','Credit_History','Dependents','Property_Area','PropofIncToLoan_log','EMI_log']
classification_model(model, df_train,predictor_var,outcome_var)'''

Accuracy : 82.899%
Cross-Validation Score : 80.949%


In [669]:
# RandomForestClassifier with multiple variable
model = RandomForestClassifier(n_estimators=100)
predictor_var = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
        'LoanAmount_log','TotalIncome_log','PropofIncToLoan_log','EMI_log','TotalIncomeCat']
classification_model(model, df_train,predictor_var,outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 80.136%


In [670]:
# get to know feature score for different variable to use it accordingly
featimp = pd.Series(model.feature_importances_, index=predictor_var).sort_values(ascending=False)
print featimp

Credit_History         0.308262
PropofIncToLoan_log    0.142139
TotalIncome_log        0.141942
EMI_log                0.109077
LoanAmount_log         0.109012
Property_Area          0.036042
Dependents             0.034108
Loan_Amount_Term       0.029286
TotalIncomeCat         0.020617
Married                0.019898
Education              0.018662
Self_Employed          0.015822
Gender                 0.015132
dtype: float64


In [703]:
# RandomForestClassifier with top 5 feature variable
model = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7, max_features=1)
#model = RandomForestClassifier(n_estimators=100)
predictor_var = ['TotalIncome_log','LoanAmount_log','Credit_History','PropofIncToLoan_log','EMI_log','Property_Area']
classification_model(model, df_train,predictor_var,outcome_var)

Accuracy : 84.691%
Cross-Validation Score : 82.739%


In [371]:
#from sklearn import svm
#model = svm.SVC()
#predictor_var = ['TotalIncome_log','LoanAmount_log','Credit_History','Dependents','Property_Area','PropofIncToLoan','EMI']
#classification_model(model, df_train,predictor_var,outcome_var)

Accuracy : 88.762%
Cross-Validation Score : 65.798%


In [577]:
#from sklearn import tree
#model = tree.DecisionTreeClassifier(criterion='gini',max_depth=5)
#predictor_var = ['TotalIncome_log','LoanAmount_log','Credit_History','PropofIncToLoan_log','EMI_log']
#predictor_var = ['Gender', 'Married', 'Dependents', 'Education',
#       'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
#        'LoanAmount_log','TotalIncome_log','PropofIncToLoan_log','EMI_log']
#classification_model(model, df_train,predictor_var,outcome_var)

Accuracy : 84.202%
Cross-Validation Score : 78.661%


In [469]:
'''from sklearn import tree
for depth in range(1,10):
  model = tree.DecisionTreeClassifier(max_depth=depth, random_state=0,min_samples_split=30, min_samples_leaf=10)
  print depth
  classification_model(model, df_train,predictor_var,outcome_var)
  #print 'Depth: %i Accuracy: %.3f' % (depth,Accuracy)'''

1
Accuracy : 80.945%
Cross-Validation Score : 80.946%
2
Accuracy : 81.596%
Cross-Validation Score : 79.645%
3
Accuracy : 81.596%
Cross-Validation Score : 78.181%
4
Accuracy : 81.922%
Cross-Validation Score : 78.988%
5
Accuracy : 83.388%
Cross-Validation Score : 78.988%
6
Accuracy : 83.550%
Cross-Validation Score : 76.383%
7
Accuracy : 84.039%
Cross-Validation Score : 75.245%
8
Accuracy : 84.039%
Cross-Validation Score : 75.407%
9
Accuracy : 84.039%
Cross-Validation Score : 75.407%


In [704]:
from sklearn.metrics import confusion_matrix
predictions = model.predict(df_train[predictor_var])
predictions_map = {1:'Y',0:'N'}
con_test = pd.DataFrame({
        'Loan_ID':df_train['Loan_ID'],
        'Loan_Status':df_train['Loan_Status'],
        'Loan_Status_pred':predictions
    })
con_test['Loan_Status'] = con_test['Loan_Status'].map(predictions_map)
con_test['Loan_Status_pred'] = con_test['Loan_Status_pred'].map(predictions_map)
con_test.to_csv('con_test.csv',index=False)
#y_true = [2, 0, 2, 2, 0, 1]
#y_pred = [0, 0, 2, 2, 0, 2]
arr = confusion_matrix(con_test['Loan_Status'], con_test['Loan_Status_pred'])
print arr
sens = arr[0][0]/float(arr[0][0] + arr[0][1])
print "Sensitivity : %s" % "{0:.3%}".format(sens)
spec = arr[1][1]/float(arr[1][0] + arr[1][1])
print "Specificity : %s" % "{0:.3%}".format(spec)

[[110  82]
 [ 10 412]]
Sensitivity : 57.292%
Specificity : 97.630%


In [705]:
print model
predictions = model.predict(df_test[predictor_var])
predictions_map = {1:'Y',0:'N'}

result = pd.DataFrame({
        'Loan_ID':df_test['Loan_ID'],
        'Loan_Status':predictions
    })
result['Loan_Status'] = result['Loan_Status'].map(predictions_map)
result.to_csv('submission.csv',index=False)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features=1, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=25, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
