### Automatization of loan eligibility process based on information: Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others.

In [1]:
import pandas as pd

In [2]:
# read the datafile

df=pd.read_csv('train.csv')
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001195,Male,Yes,0,Graduate,No,2132,1591.0,96.0,360.0,1.0,Semiurban,Y
1,LP001870,Female,No,1,Graduate,No,3481,0.0,155.0,36.0,1.0,Semiurban,N
2,LP001947,Male,Yes,0,Graduate,No,2383,3334.0,172.0,360.0,1.0,Semiurban,Y
3,LP001250,Male,Yes,3+,Not Graduate,No,4755,0.0,95.0,,0.0,Semiurban,N
4,LP001267,Female,Yes,2,Graduate,No,1378,1881.0,167.0,360.0,1.0,Urban,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,LP002051,Male,Yes,0,Graduate,No,2400,2167.0,115.0,360.0,1.0,Semiurban,Y
548,LP002297,Male,No,0,Graduate,No,2500,20000.0,103.0,360.0,1.0,Semiurban,Y
549,LP001792,Male,Yes,1,Graduate,No,3315,0.0,96.0,360.0,1.0,Semiurban,Y
550,LP001673,Male,No,0,Graduate,Yes,11000,0.0,83.0,360.0,1.0,Urban,N


## 1. Data Preprocessing

### Drop columns

`Loan_ID`

In [3]:
df.drop(columns= 'Loan_ID', inplace=True)


In [4]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,0,Graduate,No,2132,1591.0,96.0,360.0,1.0,Semiurban,Y
1,Female,No,1,Graduate,No,3481,0.0,155.0,36.0,1.0,Semiurban,N
2,Male,Yes,0,Graduate,No,2383,3334.0,172.0,360.0,1.0,Semiurban,Y
3,Male,Yes,3+,Not Graduate,No,4755,0.0,95.0,,0.0,Semiurban,N
4,Female,Yes,2,Graduate,No,1378,1881.0,167.0,360.0,1.0,Urban,N
...,...,...,...,...,...,...,...,...,...,...,...,...
547,Male,Yes,0,Graduate,No,2400,2167.0,115.0,360.0,1.0,Semiurban,Y
548,Male,No,0,Graduate,No,2500,20000.0,103.0,360.0,1.0,Semiurban,Y
549,Male,Yes,1,Graduate,No,3315,0.0,96.0,360.0,1.0,Semiurban,Y
550,Male,No,0,Graduate,Yes,11000,0.0,83.0,360.0,1.0,Urban,N


### Check missing values:

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552 entries, 0 to 551
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             539 non-null    object 
 1   Married            549 non-null    object 
 2   Dependents         537 non-null    object 
 3   Education          552 non-null    object 
 4   Self_Employed      525 non-null    object 
 5   ApplicantIncome    552 non-null    int64  
 6   CoapplicantIncome  552 non-null    float64
 7   LoanAmount         531 non-null    float64
 8   Loan_Amount_Term   540 non-null    float64
 9   Credit_History     502 non-null    float64
 10  Property_Area      552 non-null    object 
 11  Loan_Status        552 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 51.9+ KB


In [6]:
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        27
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     12
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Fill the missing values

In [7]:
print(df['Gender'].value_counts())
print(df['Married'].value_counts())
print(df['Dependents'].value_counts())
print(df['Self_Employed'].value_counts())
print(df['LoanAmount'].mean())
print(df['Loan_Amount_Term'].mean())
print(df['Credit_History'].value_counts())
print(df['Education'].value_counts())
print(df['Property_Area'].value_counts())
print(df['Loan_Status'].value_counts())

Male      441
Female     98
Name: Gender, dtype: int64
Yes    359
No     190
Name: Married, dtype: int64
0     312
1      92
2      88
3+     45
Name: Dependents, dtype: int64
No     451
Yes     74
Name: Self_Employed, dtype: int64
148.09981167608285
340.55555555555554
1.0    423
0.0     79
Name: Credit_History, dtype: int64
Graduate        434
Not Graduate    118
Name: Education, dtype: int64
Semiurban    211
Urban        185
Rural        156
Name: Property_Area, dtype: int64
Y    382
N    170
Name: Loan_Status, dtype: int64


In [8]:
df['Gender'].fillna(value='Male', inplace=True)


In [9]:
df['Married'].fillna(value='Yes', inplace=True)
df['Dependents'].fillna(value='0', inplace=True)
df['Self_Employed'].fillna(value='No', inplace=True)
df['Credit_History'].fillna(value=1.0, inplace=True)

In [10]:
df['LoanAmount'].fillna(value=148.09981167608285, inplace=True)
df['Loan_Amount_Term'].fillna(value=340.55555555555554, inplace=True)

### Handle non-numeric columns

In [11]:
df['Gender'].replace({"Male":1, "Female":0}, inplace=True)
df['Married'].replace({"Yes":1, "No":0}, inplace=True)
df['Self_Employed'].replace({"Yes":1, "No":0}, inplace=True)
df['Education'].replace({"Graduate":1, "Not Graduate":0}, inplace=True)
df['Loan_Status'].replace({"Y":1, "N":0}, inplace=True)

In [12]:
df=pd.get_dummies(df, columns=['Dependents', 'Property_Area'])

In [24]:
X=df.copy()
y=X.pop('Loan_Status')
y

0      1
1      0
2      1
3      0
4      0
      ..
547    1
548    1
549    1
550    0
551    1
Name: Loan_Status, Length: 552, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split as tts 
X_train, X_test, y_train, y_test=tts(X, y, test_size=0.2)

In [28]:
from sklearn.tree import DecisionTreeClassifier
tree=DecisionTreeClassifier()


In [29]:
# fit the model
tree.fit(X_train, y_train)

DecisionTreeClassifier()

In [30]:
# show score on train data
tree.score(X_train, y_train)

1.0

In [31]:
# show score on test data
tree.score(X_test, y_test)

0.7567567567567568

In [32]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()

In [33]:
# fit the model
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [35]:
# show score on train data
rfc.score(X_train, y_train)

1.0

In [36]:
# show score on test data

rfc.score(X_test, y_test)

0.8108108108108109

In [38]:
from sklearn.model_selection import GridSearchCV
model=RandomForestClassifier()

In [51]:
my_param_grid = {
    "n_estimators":[25, 50, 75, 100, 150, 200, 300],
    "max_depth":[1,2,3,5,7,10,15, None]
}

In [53]:
grid_search = GridSearchCV(estimator=model, param_grid=my_param_grid, cv=5)

In [54]:
%%time
grid_search.fit(X_train, y_train)

CPU times: total: 1min 20s
Wall time: 1min 20s


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [1, 2, 3, 5, 7, 10, 15, None],
                         'n_estimators': [25, 50, 75, 100, 150, 200, 300]})

In [56]:
grid_search.best_params_

{'max_depth': 3, 'n_estimators': 75}

In [57]:
best_model=RandomForestClassifier(max_depth=3, n_estimators=75)

In [58]:
best_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, n_estimators=75)

In [59]:
best_model.score(X_train, y_train)

0.8140589569160998

In [60]:
best_model.score(X_test, y_test)

0.8378378378378378

In [61]:
best_model.fit(X, y)


RandomForestClassifier(max_depth=3, n_estimators=75)

In [68]:
df_test=pd.read_csv('real_test.csv')
df_test

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001482,Male,Yes,0,Graduate,Yes,3459,0.0,25.0,120.0,1.0,Semiurban
1,LP001949,Male,Yes,3+,Graduate,,4416,1250.0,110.0,360.0,1.0,Urban
2,LP002940,Male,No,0,Not Graduate,No,3833,0.0,110.0,360.0,1.0,Rural
3,LP002911,Male,Yes,1,Graduate,No,2787,1917.0,146.0,360.0,0.0,Rural
4,LP001279,Male,No,0,Graduate,No,2366,2531.0,136.0,360.0,1.0,Semiurban
...,...,...,...,...,...,...,...,...,...,...,...,...
57,LP001207,Male,Yes,0,Not Graduate,Yes,2609,3449.0,165.0,180.0,0.0,Rural
58,LP002190,Male,Yes,1,Graduate,No,6325,0.0,175.0,360.0,1.0,Semiurban
59,LP001370,Male,No,0,Not Graduate,,7333,0.0,120.0,360.0,1.0,Rural
60,LP002187,Male,No,0,Graduate,No,2500,0.0,96.0,480.0,1.0,Semiurban


In [69]:
df_test.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        5
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           1
Loan_Amount_Term     2
Credit_History       0
Property_Area        0
dtype: int64

In [70]:
df_test['Self_Employed'].fillna(value='No', inplace=True)
df_test['LoanAmount'].fillna(value=148.09981167608285, inplace=True)
df_test['Loan_Amount_Term'].fillna(value=340.55555555555554, inplace=True)

In [71]:
df_test['Gender'].replace({"Male":1, "Female":0}, inplace=True)
df_test['Married'].replace({"Yes":1, "No":0}, inplace=True)
df_test['Self_Employed'].replace({"Yes":1, "No":0}, inplace=True)
df_test['Education'].replace({"Graduate":1, "Not Graduate":0}, inplace=True)
df_test=pd.get_dummies(df_test, columns=['Dependents', 'Property_Area'])

In [74]:
df_new=df_test[['Loan_ID']]
df_test.drop(columns='Loan_ID', inplace=True)

### Make a prediction using your best model:

In [75]:
best_model.predict(df_test)

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1], dtype=int64)

In [76]:
df_new['Prediction']=best_model.predict(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Prediction']=best_model.predict(df_test)
