In [4]:
#Reading data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df_train = pd.read_csv("train_loan_prediction.csv")
df_test = pd.read_csv("test_loan_prediction.csv")
df_train.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
df_train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [59]:
df_train.apply(lambda x: sum(x.isnull()),axis=0)
df_test.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [21]:
#filling missing values for gender
df_train['Gender'].value_counts()
df_test['Gender'].value_counts()
df_train['Gender'].fillna('Male', inplace=True)
df_test['Gender'].fillna('Male', inplace=True)

In [27]:
#filling missing values for Married column
df_train['Married'].value_counts()
df_train['Married'].fillna('Yes', inplace=True)

In [36]:
#filling missing values for Dependents column
df_train['Dependents'].value_counts()
df_test['Dependents'].value_counts()
df_train['Dependents'].fillna('0', inplace=True)
df_test['Dependents'].fillna('0', inplace=True)

In [46]:
#filling missing values for Self_Employed column
df_train['Self_Employed'].value_counts()
df_train['Self_Employed'].fillna('No', inplace=True)
df_test['Self_Employed'].value_counts()
df_test['Self_Employed'].fillna('No', inplace=True)

In [51]:
#filling missing values for LoanAmount
df_train['LoanAmount'].value_counts()
df_train['LoanAmount'].fillna(df_train['LoanAmount'].mean(), inplace=True)
df_test['LoanAmount'].value_counts()
df_test['LoanAmount'].fillna(df_train['LoanAmount'].mean(), inplace=True)

In [55]:
#filling missing values for Loan_Amount_Term
df_train['Loan_Amount_Term'].value_counts()
df_train['Loan_Amount_Term'].fillna(df_train['Loan_Amount_Term'].mean(), inplace=True)
df_test['Loan_Amount_Term'].value_counts()
df_test['Loan_Amount_Term'].fillna(df_train['Loan_Amount_Term'].mean(), inplace=True)

In [61]:
#filling missing values for Credit_History
df_train['Credit_History'].value_counts()
df_train['Credit_History'].fillna(1.0, inplace=True)
df_test['Credit_History'].value_counts()
df_test['Credit_History'].fillna(1.0, inplace=True)

In [67]:
#Since the extreme values are practically possible, 
#i.e. some people might apply for high value loans due to specific needs. 
#So instead of treating them as outliers, let’s try a log transformation to nullify their effect
df_train['LoanAmount_log'] = np.log(df_train['LoanAmount'])
df_test['LoanAmount_log'] = np.log(df_test['LoanAmount'])
#df_train['LoanAmount_log'].hist(bins=50)
#plt.show()

In [69]:
#Coming to ApplicantIncome. One intuition can be that some applicants have lower income but strong support Co-applicants. 
#So it might be a good idea to combine both incomes as total income and take a log transformation of the same
df_train['TotalIncome'] = df_train['ApplicantIncome'] + df_train['CoapplicantIncome']
df_train['TotalIncome_log'] = np.log(df_train['TotalIncome'])
df_test['TotalIncome'] = df_test['ApplicantIncome'] + df_test['CoapplicantIncome']
df_test['TotalIncome_log'] = np.log(df_test['TotalIncome'])

In [77]:
#Converting all our categorical variables into numeric by encoding the categories,
#Since, sklearn requires all inputs to be numeric
df_train.dtypes
df_test.dtypes
from sklearn.preprocessing import LabelEncoder
var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area']
le = LabelEncoder()
df_train['Loan_Status'] = le.fit_transform(df_train['Loan_Status'])
for i in var_mod:
    df_train[i] = le.fit_transform(df_train[i])
    df_test[i] = le.fit_transform(df_test[i])
df_train.dtypes 
df_test.dtypes

Loan_ID               object
Gender                 int64
Married                int64
Dependents             int64
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
LoanAmount_log       float64
TotalIncome            int64
TotalIncome_log      float64
dtype: object

In [78]:
# creating generic function for any model 
#Import models from scikit learn module:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

#Generic function for making a classification model and accessing performance:
def classification_model(model, data, predictors, outcome):
  #Fit the model:
  model.fit(data[predictors],data[outcome])
  
  #Make predictions on training set:
  predictions = model.predict(data[predictors])
  
  #Print accuracy
  accuracy = metrics.accuracy_score(predictions,data[outcome])
  print "Accuracy : %s" % "{0:.3%}".format(accuracy)

  #Perform k-fold cross-validation with 5 folds
  kf = KFold(data.shape[0], n_folds=5)
  error = []
  for train, test in kf:
    # Filter training data
    train_predictors = (data[predictors].iloc[train,:])
    
    # The target we're using to train the algorithm.
    train_target = data[outcome].iloc[train]
    
    # Training the algorithm using the predictors and target.
    model.fit(train_predictors, train_target)
    
    #Record error from each cross-validation run
    error.append(model.score(data[predictors].iloc[test,:], data[outcome].iloc[test]))
 
  print "Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error))

  #Fit the model again so that it can be refered outside the function:
  model.fit(data[predictors],data[outcome]) 



In [81]:
# LogisticRegression with one variable
outcome_var = 'Loan_Status'
model = LogisticRegression()
predictor_var = ['Credit_History']
classification_model(model, df_train,predictor_var,outcome_var)

Accuracy : 80.945%
Cross-Validation Score : 80.946%


In [83]:
# LogisticRegression with multiple variable
predictor_var = ['Credit_History','Education','Married','Self_Employed','Property_Area']
classification_model(model, df_train,predictor_var,outcome_var)

Accuracy : 80.945%
Cross-Validation Score : 80.946%


In [86]:
# RandomForestClassifier with multiple variable
model = RandomForestClassifier(n_estimators=100)
predictor_var = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
        'LoanAmount_log','TotalIncome_log']
classification_model(model, df_train,predictor_var,outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 78.178%


In [88]:
# get to know feature score for different variable to use it accordingly
featimp = pd.Series(model.feature_importances_, index=predictor_var).sort_values(ascending=False)
print featimp

Credit_History      0.267018
TotalIncome_log     0.256420
LoanAmount_log      0.232446
Dependents          0.051033
Property_Area       0.050984
Loan_Amount_Term    0.044912
Education           0.026821
Married             0.026534
Gender              0.022273
Self_Employed       0.021559
dtype: float64


In [109]:
# RandomForestClassifier with top 5 feature variable
#model = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7, max_features=1)
model = RandomForestClassifier(n_estimators=25)
predictor_var = ['TotalIncome_log','LoanAmount_log','Credit_History','Dependents','Property_Area']
classification_model(model, df_train,predictor_var,outcome_var)

Accuracy : 99.511%
Cross-Validation Score : 78.824%


In [102]:
#from sklearn import svm
#model = svm.SVC()
#predictor_var = ['TotalIncome_log','LoanAmount_log','Credit_History','Dependents','Property_Area']
#classification_model(model, df_train,predictor_var,outcome_var)

Accuracy : 81.107%
Cross-Validation Score : 80.784%


In [106]:
print model
predictions = model.predict(df_test[predictor_var])
predictions_map = {1:'Y',0:'N'}

result = pd.DataFrame({
        'Loan_ID':df_test['Loan_ID'],
        'Loan_Status':predictions
    })
result['Loan_Status'] = result['Loan_Status'].map(predictions_map)
result.to_csv('submission.csv',index=False)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
