In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [8]:
train_input = pd.read_csv('risk_train_data.csv')
test_input = pd.read_csv('risk_test_data.csv')

In [9]:
train_input.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [10]:
test_input.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [11]:
all = pd.concat([train_input, test_input], axis=0)

In [12]:
all.shape

(981, 13)

In [14]:
all.reset_index(inplace=True, drop=True)

In [15]:
all

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed
0,5849,0.0,1.0,0,Graduate,Male,,360.0,LP001002,Y,No,Urban,No
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,LP001003,N,Yes,Rural,No
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,LP001005,Y,Yes,Urban,Yes
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,LP001006,Y,Yes,Urban,No
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,LP001008,Y,No,Urban,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,4009,1777.0,1.0,3+,Not Graduate,Male,113.0,360.0,LP002971,,Yes,Urban,Yes
977,4158,709.0,1.0,0,Graduate,Male,115.0,360.0,LP002975,,Yes,Urban,No
978,3250,1993.0,,0,Graduate,Male,126.0,360.0,LP002980,,No,Semiurban,No
979,5000,2393.0,1.0,0,Graduate,Male,158.0,360.0,LP002986,,Yes,Rural,No


In [16]:
all.isnull().sum()

ApplicantIncome        0
CoapplicantIncome      0
Credit_History        79
Dependents            25
Education              0
Gender                24
LoanAmount            27
Loan_Amount_Term      20
Loan_ID                0
Loan_Status          367
Married                3
Property_Area          0
Self_Employed         55
dtype: int64

In [17]:
Counter(all['Gender'])

Counter({'Male': 775, 'Female': 182, nan: 24})

In [20]:
gender_null = all[all['Gender'].isnull()].index.tolist()

In [21]:
all['Gender'].iloc[gender_null] = 'Male'

In [22]:
Counter(all['Gender'])

Counter({'Male': 799, 'Female': 182})

In [23]:
Counter(all['Married'])

Counter({'No': 347, 'Yes': 631, nan: 3})

In [25]:
pd.crosstab(all['Married'].isnull(), all['Dependents'].isnull())

Dependents,False,True
Married,Unnamed: 1_level_1,Unnamed: 2_level_1
False,956,22
True,0,3


In [26]:
married_null = all[all['Married'].isnull()].index.tolist()
all['Married'].iloc[gender_null] = 'Yes'

In [27]:
all.isnull().sum()

ApplicantIncome        0
CoapplicantIncome      0
Credit_History        79
Dependents            25
Education              0
Gender                 0
LoanAmount            27
Loan_Amount_Term      20
Loan_ID                0
Loan_Status          367
Married                3
Property_Area          0
Self_Employed         55
dtype: int64

In [28]:
Counter(all['Dependents'])

Counter({'0': 545, '1': 160, '2': 160, '3+': 91, nan: 25})

In [29]:
pd.crosstab(all['Married'], all['Dependents'].isnull())

Dependents,False,True
Married,Unnamed: 1_level_1,Unnamed: 2_level_1
No,328,8
Yes,628,14


In [30]:
pd.crosstab(all['Married'], all['Dependents'])

Dependents,0,1,2,3+
Married,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,268,35,14,11
Yes,277,125,146,80


In [36]:
bachelor_nulldependent = all[(all['Married'] == 'No') & all['Dependents'].isnull()].index.tolist()

In [37]:
bachelor_nulldependent

[293, 332, 355, 597, 684, 879, 916, 926]

In [38]:
all['Dependents'].iloc[bachelor_nulldependent] = '0'

In [40]:
all['Dependents'].iloc[all[all['Dependents'].isnull()].index.tolist()] = '0'

In [41]:
self_emp_null = all[all['Self_Employed'].isnull()].index.tolist()
all['Self_Employed'].iloc[self_emp_null] = 'No'

In [42]:
pd.crosstab(all['LoanAmount'].isnull(), all['Loan_Amount_Term'])

Loan_Amount_Term,6.0,12.0,36.0,60.0,84.0,120.0,180.0,240.0,300.0,350.0,360.0,480.0
LoanAmount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
False,1,2,3,3,7,4,64,7,20,1,800,22
True,0,0,0,0,0,0,2,1,0,0,23,1


In [43]:
all.groupby('Loan_Amount_Term')['LoanAmount'].mean()

Loan_Amount_Term
6.0       95.000000
12.0     185.500000
36.0     117.666667
60.0     139.666667
84.0     121.142857
120.0     36.750000
180.0    131.125000
240.0    128.857143
300.0    166.250000
350.0    133.000000
360.0    144.420000
480.0    137.181818
Name: LoanAmount, dtype: float64

In [44]:
all['LoanAmount'][all['LoanAmount'].isnull() & all['Loan_Amount_Term'] == 360] = 144
all['LoanAmount'][all['LoanAmount'].isnull()] = 130

In [47]:
all['Loan_Amount_Term'].value_counts()

360.0    823
180.0     66
480.0     23
300.0     20
240.0      8
84.0       7
120.0      4
36.0       3
60.0       3
12.0       2
350.0      1
6.0        1
Name: Loan_Amount_Term, dtype: int64

In [48]:
all['Loan_Amount_Term'][all['Loan_Amount_Term'].isnull()] = 360

In [49]:
all.isnull().sum()

ApplicantIncome        0
CoapplicantIncome      0
Credit_History        79
Dependents             0
Education              0
Gender                 0
LoanAmount             0
Loan_Amount_Term       0
Loan_ID                0
Loan_Status          367
Married                3
Property_Area          0
Self_Employed          0
dtype: int64

In [50]:
all['Credit_History'].value_counts()

1.0    754
0.0    148
Name: Credit_History, dtype: int64

In [51]:
pd.crosstab(all['Gender'], all['Credit_History'])

Credit_History,0.0,1.0
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,30,135
Male,118,619


In [52]:
pd.crosstab(all['Self_Employed'], all['Credit_History'])

Credit_History,0.0,1.0
Self_Employed,Unnamed: 1_level_1,Unnamed: 2_level_1
No,134,658
Yes,14,96


In [53]:
all.columns

Index(['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents',
       'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Loan_ID',
       'Loan_Status', 'Married', 'Property_Area', 'Self_Employed'],
      dtype='object')

In [54]:
all_new = pd.get_dummies(all.drop(['Loan_ID'], axis=1), drop_first=True)
all_new

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,LoanAmount,Loan_Amount_Term,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Gender_Male,Loan_Status_Y,Married_Yes,Property_Area_Semiurban,Property_Area_Urban,Self_Employed_Yes
0,5849,0.0,1.0,130.0,360.0,0,0,0,0,1,1,0,0,1,0
1,4583,1508.0,1.0,128.0,360.0,1,0,0,0,1,0,1,0,0,0
2,3000,0.0,1.0,66.0,360.0,0,0,0,0,1,1,1,0,1,1
3,2583,2358.0,1.0,120.0,360.0,0,0,0,1,1,1,1,0,1,0
4,6000,0.0,1.0,141.0,360.0,0,0,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,4009,1777.0,1.0,113.0,360.0,0,0,1,1,1,0,1,0,1,1
977,4158,709.0,1.0,115.0,360.0,0,0,0,0,1,0,1,0,1,0
978,3250,1993.0,,126.0,360.0,0,0,0,0,1,0,0,1,0,0
979,5000,2393.0,1.0,158.0,360.0,0,0,0,0,1,0,1,0,0,0


In [55]:
test = all_new[all['Credit_History'].isnull()]

In [56]:
all_in_test = test.index.tolist()

In [57]:
all_in_train = [ x for x in all.index.tolist() if x not in all_in_test]
train = all_new.iloc[all_in_train]
train.shape

(902, 15)

In [58]:
X_train = train.drop(['Loan_Status_Y', 'Credit_History'], axis=1)
Y_train = train['Credit_History']
X_test = test.drop(['Loan_Status_Y', 'Credit_History'], axis=1)
Y_test = test['Credit_History']

In [59]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
pred = model.predict(X_test)

In [61]:
test['Credit_History'] = pred

In [62]:
pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [63]:
pred.shape

(79,)

In [64]:
df_all = pd.concat([train, test], axis=0)

In [65]:
df_all.shape

(981, 15)

In [66]:
len(train_input)

614

In [67]:
train2 = df_all.head(len(train_input))
test2 = df_all.tail(len(test_input))

In [68]:
X_train = train2.drop(['Loan_Status_Y'], axis=1)
Y_train = train2['Loan_Status_Y']
X_test = test2.drop(['Loan_Status_Y'], axis=1)


In [69]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [71]:
pred = model.predict(X_test)

In [72]:
pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,