<a href="https://colab.research.google.com/github/udfds/DataLab/blob/master/competitions/analyticsvidhya/loan_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Practice Problem: Loan Prediction III


Problem:

The company wants to automate the loan eligibility process (in real time) based on the customer details provided when filling out the online application form... To automate this process, they have posed the problem of identifying the customer segments.

Reference:
 - https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/

## Install

In [71]:
!pip install pandas-profiling



## Imports

In [0]:
import pandas as pd
import numpy as np
import matplotlib as plt
import pandas_profiling

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

% matplotlib inline

## DataFrame

In [73]:
# Load the training and test CSV files from the working directory.
df_loan, df_loan_test = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset_loan_train.csv', 'dataset_loan_test.csv')
)

# Preview the first rows of the training frame.
df_loan.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Exploring data

In [74]:
# Show the dtype pandas inferred for every column of the training frame.
df_loan.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [75]:
# Print the frequency of each category for the categorical columns and the
# target, with a blank line after every table.
for column in ['Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']:
  print(df_loan[column].value_counts(), '\n')

Yes    398
No     213
Name: Married, dtype: int64 

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64 

Graduate        480
Not Graduate    134
Name: Education, dtype: int64 

No     500
Yes     82
Name: Self_Employed, dtype: int64 

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64 

Y    422
N    192
Name: Loan_Status, dtype: int64 



In [76]:
# Count the missing values in each column of the training frame.
df_loan.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [0]:
def clean_df_loan(df_loan):
  """Fill missing values of a loan DataFrame in place.

  Imputation rules:
    - ``LoanAmount``: mean of the column.
    - ``Self_Employed``: the constant ``'No'``.
    - ``Gender``, ``Married``, ``Dependents``, ``Loan_Amount_Term`` and
      ``Credit_History``: the most frequent value (mode) of the column.

  The statistics are computed from the frame that is passed in, so calling
  this on two frames imputes each one with its own mean/modes.

  Args:
    df_loan: DataFrame containing the columns listed above. It is modified
      in place.

  Returns:
    None.
  """
  # Assign the filled Series back to the frame instead of calling
  # ``df[col].fillna(..., inplace=True)``: the chained in-place form operates
  # on an intermediate Series and does not update the frame when pandas
  # copy-on-write is active.
  df_loan['LoanAmount'] = df_loan['LoanAmount'].fillna(df_loan['LoanAmount'].mean())
  df_loan['Self_Employed'] = df_loan['Self_Employed'].fillna('No')

  for column in ['Gender', 'Married', 'Dependents', 'Loan_Amount_Term', 'Credit_History']:
    modes = df_loan[column].mode()
    # mode() is empty when the column has no non-missing value; there is
    # nothing to impute with in that case, so leave the column untouched.
    if not modes.empty:
      df_loan[column] = df_loan[column].fillna(modes.iloc[0])

## Encoder

In [78]:
# Impute missing values in both frames before encoding.
clean_df_loan(df_loan)
clean_df_loan(df_loan_test)

# Categorical columns to convert to integer codes (the last one is the target).
features = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']


for feature in features:
  encoder = LabelEncoder()
  df_loan[feature] = encoder.fit_transform(df_loan[feature])
  # Compare strings by value; `is not` tests object identity and only works
  # by accident of string interning.
  if feature != 'Loan_Status':
    # Reuse the mapping learned on the training data so that a category gets
    # the same integer code in both frames. transform() raises ValueError if
    # the test frame contains a category that was never seen in training.
    df_loan_test[feature] = encoder.transform(df_loan_test[feature])

df_loan.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


## Splitting

In [0]:
# Single definition of the model inputs so the training and test selections
# cannot drift apart.
feature_columns = ['Credit_History','Education','Married','Self_Employed','Property_Area']

X = df_loan[feature_columns]
y = df_loan.Loan_Status

# Take an explicit copy: columns are assigned to this frame later, and
# assigning to a slice of another frame triggers SettingWithCopyWarning.
df_loan_test = df_loan_test[feature_columns].copy()

## Build Model - KFold

In [80]:
# Repeated 2-fold cross-validation (10 repeats, fixed seed) of a logistic
# regression; the model from the best-scoring fold is kept in `best_model`.
rkf = RepeatedKFold(n_splits=2, n_repeats=10, random_state=9)

results = []
best_accuracy = 0
best_model = None

for train_idx, valid_idx in rkf.split(X):
    # Slice features and target for this fold
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Share of validation rows predicted correctly
    y_predict = model.predict(X_valid)
    accuracy = np.mean(y_valid == y_predict)
    results.append(accuracy)

    # Strict comparison: on a tie the earlier model is kept
    if accuracy > best_accuracy:
        best_accuracy, best_model = accuracy, model

print("Mean accuracy:", np.mean(results))
print("Best accuracy:", best_accuracy)

Mean accuracy: 0.8094462540716613
Best accuracy: 0.8534201954397395


## Build Model - RandomForestClassifier

In [81]:
# Random forest with 100 trees, depth capped at 6 and a fixed seed,
# fitted on the full training set.
model_rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    n_jobs=1,
    random_state=0,
)
model_rfc.fit(X, y)

# NOTE(review): scored on the same rows the model was fitted on, so this is
# training accuracy (in percent), not a held-out estimate.
accuracy = round(100 * model_rfc.score(X, y), 2)
print('Accuracy:', accuracy)

Accuracy: 80.94


## Build Model - LogisticRegression

In [82]:
# Logistic regression with default settings, fitted on the full training set.
model_lg = LogisticRegression()
model_lg.fit(X, y)

# NOTE(review): scored on the same rows the model was fitted on, so this is
# training accuracy (in percent), not a held-out estimate.
accuracy = round(100 * model_lg.score(X, y), 2)
print('Accuracy:', accuracy)

Accuracy: 80.94


## Predict: Loan_Status

In [0]:
# Select exactly the columns the models were trained on. Predicting on the
# whole frame only works on the first run: once the prediction columns below
# exist, the frame no longer matches the training features and a re-run fails.
X_test = df_loan_test[list(X.columns)]

predicts_lg = model_lg.predict(X_test)
predicts_kfold = best_model.predict(X_test)
predicts_rfc = model_rfc.predict(X_test)

# assign() returns a new frame, which keeps this cell re-runnable and avoids
# writing into a slice of another frame. Values are the encoded labels.
df_loan_test = df_loan_test.assign(**{
    'Loan_Status by KFOLD': predicts_kfold,
    'Loan_Status by LG': predicts_lg,
    'Loan_Status by RFC': predicts_rfc,
})

In [84]:
# Display the test features next to the predictions of the three models.
df_loan_test

Unnamed: 0,Credit_History,Education,Married,Self_Employed,Property_Area,Loan_Status by KFOLD,Loan_Status by LG,Loan_Status by RFC
0,1.0,0,1,0,2,1,1,1
1,1.0,0,1,0,2,1,1,1
2,1.0,0,1,0,2,1,1,1
3,1.0,0,1,0,2,1,1,1
4,1.0,1,0,0,2,1,1,1
...,...,...,...,...,...,...,...,...
362,1.0,1,1,1,2,1,1,1
363,1.0,0,1,0,2,1,1,1
364,1.0,0,0,0,1,1,1,1
365,1.0,0,1,0,0,1,1,1
