## 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

## 2. Import dataset

In [2]:
# Read the training set, then discard the two 'Unnamed' columns and the two
# date columns before previewing the first rows.
train = pd.read_csv('loan_train.csv').drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1', 'effective_date', 'due_date']
)
train.head()

Unnamed: 0,loan_status,Principal,terms,age,education,Gender
0,PAIDOFF,1000,30,45,High School or Below,male
1,PAIDOFF,1000,30,33,Bechalor,female
2,PAIDOFF,1000,15,27,college,male
3,PAIDOFF,1000,30,28,college,female
4,PAIDOFF,1000,30,29,college,male


In [3]:
# Read the test set and discard the same four columns that are removed from train.
test = pd.read_csv('loan_test.csv').drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1', 'effective_date', 'due_date']
)

## 3. loan_status - 0 / 1

In [4]:
def Y_01 (x):
    """Binary-encode a loan status: 1 when it is exactly 'PAIDOFF', otherwise 0."""
    return 1 if x == 'PAIDOFF' else 0

In [5]:
# Add the 0/1 target 'Y' (1 = 'PAIDOFF', per Y_01), then retire the raw status text.
train['Y'] = train['loan_status'].apply(Y_01)
train = train.drop(columns=['loan_status'])
train.head()

Unnamed: 0,Principal,terms,age,education,Gender,Y
0,1000,30,45,High School or Below,male,1
1,1000,30,33,Bechalor,female,1
2,1000,15,27,college,male,1
3,1000,30,28,college,female,1
4,1000,30,29,college,male,1


In [6]:
# Same target encoding for the test set: 'Y' from Y_01, raw status column removed.
test['Y'] = test['loan_status'].apply(Y_01)
test = test.drop(columns=['loan_status'])

## 4. gender

In [7]:
# Lookup table for Gender: x_gender = log(sum of Y / number of rows) per gender,
# i.e. the log of the share of rows with Y == 1 in that group (Y is 0/1).
df = train.groupby('Gender')['Y'].agg(['sum','count'])
df['x_gender'] = np.log(df['sum']/df['count'])
# reset_index() turns the group key into an ordinary 'Gender' column and leaves a
# plain integer index. Copying the index into a column instead would keep
# 'Gender' as the index name too, and current pandas refuses merge(on='Gender')
# on such a frame because the key is ambiguous.
df = df.drop(columns=['sum','count']).reset_index()

In [8]:
# Look up x_gender for every training row by its Gender value; once the encoded
# column is attached the raw Gender column is no longer needed.
train = train.merge(right=df, on='Gender', how='left')
train = train.drop(columns=['Gender'])
train.head()

Unnamed: 0,Principal,terms,age,education,Y,x_gender
0,1000,30,45,High School or Below,1,-0.312942
1,1000,30,33,Bechalor,1,-0.144581
2,1000,15,27,college,1,-0.312942
3,1000,30,28,college,1,-0.144581
4,1000,30,29,college,1,-0.312942


In [9]:
# Apply the train-derived x_gender lookup to the test rows, then drop Gender.
# NOTE(review): with how='left', a Gender value absent from the lookup gets NaN.
test = test.merge(right=df, on='Gender', how='left')
test = test.drop(columns=['Gender'])

## 5. education

In [10]:
def X_01 (x):
    """Binary-encode an education level: 1 when it is exactly 'Master or Above', otherwise 0."""
    return 1 if x == 'Master or Above' else 0

In [11]:
# Reduce education to a 0/1 flag (1 = 'Master or Above', per X_01) and drop the text column.
train['education_bin'] = train['education'].apply(X_01)
train = train.drop(columns=['education'])
train.head()

Unnamed: 0,Principal,terms,age,Y,x_gender,education_bin
0,1000,30,45,1,-0.312942,0
1,1000,30,33,1,-0.144581,0
2,1000,15,27,1,-0.312942,0
3,1000,30,28,1,-0.144581,0
4,1000,30,29,1,-0.312942,0


In [12]:
# Flag 'Master or Above' with X_01, the same encoder the training set uses.
# (Y_01 compares against 'PAIDOFF', which is never an education level, so using
# it here would set education_bin to 0 for every test row.)
test['education_bin'] = test['education'].apply(X_01)
test.drop(['education'], axis=1, inplace=True)

In [13]:
# Lookup table for education_bin: x_education = log(sum of Y / number of rows)
# per flag value, i.e. the log of the share of rows with Y == 1 in that group.
df = train.groupby('education_bin')['Y'].agg(['sum','count'])
df['x_education'] = np.log(df['sum']/df['count'])
# reset_index() exposes the group key as a plain 'education_bin' column. Copying
# the index into a column would leave the same name on the index as well, which
# current pandas rejects as ambiguous in merge(on='education_bin').
df = df.drop(columns=['sum','count']).reset_index()

In [14]:
# Attach x_education to each training row via its education_bin flag, then drop the flag.
train = train.merge(right=df, on='education_bin', how='left')
train = train.drop(columns=['education_bin'])
train.head()

Unnamed: 0,Principal,terms,age,Y,x_gender,x_education
0,1000,30,45,1,-0.312942,-0.283814
1,1000,30,33,1,-0.144581,-0.283814
2,1000,15,27,1,-0.312942,-0.283814
3,1000,30,28,1,-0.144581,-0.283814
4,1000,30,29,1,-0.312942,-0.283814


In [15]:
# Apply the train-derived x_education lookup to the test rows, then drop the flag.
test = test.merge(right=df, on='education_bin', how='left')
test = test.drop(columns=['education_bin'])

## 6. terms

In [16]:
# Lookup table for terms: x_terms = log(sum of Y / number of rows) per distinct
# terms value, i.e. the log of the share of rows with Y == 1 in that group.
df = train.groupby('terms')['Y'].agg(['sum','count'])
df['x_terms'] = np.log(df['sum']/df['count'])
# reset_index() exposes the group key as a plain 'terms' column. Copying the
# index into a column would leave 'terms' as the index name too, which current
# pandas rejects as ambiguous in merge(on='terms').
df = df.drop(columns=['sum','count']).reset_index()

In [17]:
# Attach x_terms to each training row via its terms value, then drop raw terms.
train = train.merge(right=df, on='terms', how='left')
train = train.drop(columns=['terms'])
train.head()

Unnamed: 0,Principal,age,Y,x_gender,x_education,x_terms
0,1000,45,1,-0.312942,-0.283814,-0.332134
1,1000,33,1,-0.144581,-0.283814,-0.332134
2,1000,27,1,-0.312942,-0.283814,-0.261014
3,1000,28,1,-0.144581,-0.283814,-0.332134
4,1000,29,1,-0.312942,-0.283814,-0.332134


In [18]:
# Apply the train-derived x_terms lookup to the test rows, then drop raw terms.
# NOTE(review): with how='left', a terms value absent from the lookup gets NaN.
test = test.merge(right=df, on='terms', how='left')
test = test.drop(columns=['terms'])

## 7. Principal

In [19]:
# Split Principal into 7 equal-width intervals and list the intervals that occur in train.
df = pd.cut(train['Principal'], bins=7)
df.unique()

[(900.0, 1000.0], (700.0, 800.0], (299.3, 400.0], (800.0, 900.0], (400.0, 500.0]]
Categories (5, interval[float64]): [(299.3, 400.0] < (400.0, 500.0] < (700.0, 800.0] < (800.0, 900.0] < (900.0, 1000.0]]

In [20]:
def X_5 (x):
    """Collapse a principal amount into one of three bucket labels: 800, 900 or 1000.

    Amounts up to 500 and amounts in (800, 900] both map to 900; amounts in
    (500, 800] map to 800; everything else maps to 1000.
    NOTE(review): the low amounts (<= 500) sharing label 900 looks like a
    deliberate merge of sparse bins -- confirm it is intended.
    """
    if x <= 500 or 800 < x <= 900:
        return 900
    if x <= 800:
        return 800
    return 1000

In [21]:
# Replace Principal with its bucket label from X_5.
train['Principal_bin'] = train['Principal'].apply(X_5)
train = train.drop(columns=['Principal'])
train.head()

Unnamed: 0,age,Y,x_gender,x_education,x_terms,Principal_bin
0,45,1,-0.312942,-0.283814,-0.332134,1000
1,33,1,-0.144581,-0.283814,-0.332134,1000
2,27,1,-0.312942,-0.283814,-0.261014,1000
3,28,1,-0.144581,-0.283814,-0.332134,1000
4,29,1,-0.312942,-0.283814,-0.332134,1000


In [22]:
# Bucket the test set's Principal with the same X_5 mapping, then drop the raw column.
test['Principal_bin'] = test['Principal'].apply(X_5)
test = test.drop(columns=['Principal'])

In [23]:
# Lookup table for Principal_bin: x_Principal = log(sum of Y / number of rows)
# per bucket, i.e. the log of the share of rows with Y == 1 in that bucket.
df = train.groupby('Principal_bin')['Y'].agg(['sum','count'])
df['x_Principal'] = np.log(df['sum']/df['count'])
# reset_index() exposes the group key as a plain 'Principal_bin' column. Copying
# the index into a column would leave the same name on the index as well, which
# current pandas rejects as ambiguous in merge(on='Principal_bin').
df = df.drop(columns=['sum','count']).reset_index()

In [24]:
# Attach x_Principal to each training row via its bucket label, then drop the label.
train = train.merge(right=df, on='Principal_bin', how='left')
train = train.drop(columns=['Principal_bin'])
train.head()

Unnamed: 0,age,Y,x_gender,x_education,x_terms,x_Principal
0,45,1,-0.312942,-0.283814,-0.332134,-0.305936
1,33,1,-0.144581,-0.283814,-0.332134,-0.305936
2,27,1,-0.312942,-0.283814,-0.261014,-0.305936
3,28,1,-0.144581,-0.283814,-0.332134,-0.305936
4,29,1,-0.312942,-0.283814,-0.332134,-0.305936


In [25]:
# Apply the train-derived x_Principal lookup to the test rows, then drop the label.
test = test.merge(right=df, on='Principal_bin', how='left')
test = test.drop(columns=['Principal_bin'])

## 8. age

In [26]:
# Split age into 7 equal-width intervals and list the intervals that occur in train.
df = pd.cut(train['age'], bins=7)
df.unique()

[(41.571, 46.286], (32.143, 36.857], (22.714, 27.429], (27.429, 32.143], (36.857, 41.571], (17.967, 22.714], (46.286, 51.0]]
Categories (7, interval[float64]): [(17.967, 22.714] < (22.714, 27.429] < (27.429, 32.143] < (32.143, 36.857] < (36.857, 41.571] < (41.571, 46.286] < (46.286, 51.0]]

In [27]:
def X_7 (x):
    """Return the 1-based age bin (1..7) that x falls into.

    Bin k covers values up to and including the k-th upper edge below; anything
    above the last edge lands in bin 7.
    """
    upper_edges = (22.714, 27.429, 32.143, 36.857, 41.571, 46.286)
    for label, edge in enumerate(upper_edges, start=1):
        if x <= edge:
            return label
    return len(upper_edges) + 1

In [28]:
# Replace age with its bin number (1..7) from X_7.
train['age_bin'] = train['age'].apply(X_7)
train = train.drop(columns=['age'])
train.head()

Unnamed: 0,Y,x_gender,x_education,x_terms,x_Principal,age_bin
0,1,-0.312942,-0.283814,-0.332134,-0.305936,6
1,1,-0.144581,-0.283814,-0.332134,-0.305936,4
2,1,-0.312942,-0.283814,-0.261014,-0.305936,2
3,1,-0.144581,-0.283814,-0.332134,-0.305936,3
4,1,-0.312942,-0.283814,-0.332134,-0.305936,3


In [29]:
# Bin the test set's age with the same X_7 edges, then drop the raw column.
test['age_bin'] = test['age'].apply(X_7)
test = test.drop(columns=['age'])

In [30]:
# Lookup table for age_bin: x_age = log(sum of Y / number of rows) per bin,
# i.e. the log of the share of rows with Y == 1 in that bin.
df = train.groupby('age_bin')['Y'].agg(['sum','count'])
df['x_age'] = np.log(df['sum']/df['count'])
# reset_index() exposes the group key as a plain 'age_bin' column. Copying the
# index into a column would leave 'age_bin' as the index name too, which current
# pandas rejects as ambiguous in merge(on='age_bin').
df = df.drop(columns=['sum','count']).reset_index()

In [31]:
# Attach x_age to each training row via its age bin, then drop the bin number.
train = train.merge(right=df, on='age_bin', how='left')
train = train.drop(columns=['age_bin'])
train.head()

Unnamed: 0,Y,x_gender,x_education,x_terms,x_Principal,x_age
0,1,-0.312942,-0.283814,-0.332134,-0.305936,-0.207639
1,1,-0.144581,-0.283814,-0.332134,-0.305936,-0.200671
2,1,-0.312942,-0.283814,-0.261014,-0.305936,-0.347401
3,1,-0.144581,-0.283814,-0.332134,-0.305936,-0.278713
4,1,-0.312942,-0.283814,-0.332134,-0.305936,-0.278713


In [32]:
# Apply the train-derived x_age lookup to the test rows, then drop the bin number.
# NOTE(review): with how='left', an age bin absent from the lookup gets NaN.
test = test.merge(right=df, on='age_bin', how='left')
test = test.drop(columns=['age_bin'])

## 9. Correlation matrix

In [33]:
# Pairwise correlations between the encoded predictors (target 'Y' excluded).
train.drop(columns=['Y']).corr()

Unnamed: 0,x_gender,x_education,x_terms,x_Principal,x_age
x_gender,1.0,0.032067,0.041444,0.048028,0.031637
x_education,0.032067,1.0,-0.030308,-0.013451,0.016982
x_terms,0.041444,-0.030308,1.0,0.511211,-0.021617
x_Principal,0.048028,-0.013451,0.511211,1.0,-0.00847
x_age,0.031637,0.016982,-0.021617,-0.00847,1.0


## 10. Raw logistic regression

In [34]:
# Design matrix: every encoded predictor; response: the 0/1 target.
x = train.drop(columns=['Y'])
y = train['Y']

In [35]:
# Logit has no implicit intercept, so a constant column is added to x first.
x = sm.add_constant(x)
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.536218
         Iterations 7


0,1,2,3
Dep. Variable:,Y,No. Observations:,346.0
Model:,Logit,Df Residuals:,340.0
Method:,MLE,Df Model:,5.0
Date:,"Sat, 21 Mar 2020",Pseudo R-squ.:,0.04374
Time:,16:11:57,Log-Likelihood:,-185.53
converged:,True,LL-Null:,-194.02
Covariance Type:,nonrobust,LLR p-value:,0.004556

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,7.0204,1.924,3.649,0.000,3.250,10.791
x_gender,4.7831,2.569,1.862,0.063,-0.251,9.818
x_education,2.9508,3.509,0.841,0.400,-3.927,9.828
x_terms,5.8171,3.134,1.856,0.063,-0.326,11.960
x_Principal,2.5800,4.758,0.542,0.588,-6.745,11.905
x_age,4.1186,2.095,1.966,0.049,0.013,8.224


## 11. Final logistic regression

In [36]:
# Refit without x_Principal and x_education, the two predictors with the
# largest p-values in the raw fit's summary.
x = train.drop(columns=['Y', 'x_Principal', 'x_education'])
y = train['Y']

In [37]:
# Same fitting steps as the raw model, now on the reduced predictor set;
# results_log is what the test-data section predicts with.
x = sm.add_constant(x)
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.537634
         Iterations 6


0,1,2,3
Dep. Variable:,Y,No. Observations:,346.0
Model:,Logit,Df Residuals:,342.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 21 Mar 2020",Pseudo R-squ.:,0.04121
Time:,16:11:58,Log-Likelihood:,-186.02
converged:,True,LL-Null:,-194.02
Covariance Type:,nonrobust,LLR p-value:,0.001139

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,5.6781,1.305,4.351,0.000,3.120,8.236
x_gender,4.8989,2.565,1.910,0.056,-0.128,9.926
x_terms,6.5266,2.777,2.350,0.019,1.083,11.970
x_age,4.1548,2.092,1.986,0.047,0.055,8.254


## 12. Test data

In [38]:
# Keep exactly the predictors the final model was fitted on.
x_test = test.drop(columns=['Y', 'x_Principal', 'x_education'])
y_test = test['Y']

In [39]:
# Score the test rows with the fitted model.
x_test = sm.add_constant(x_test)
pred_values = results_log.predict(x_test)
# 2x2 count table from a 2-D histogram over the edges [0, 0.75, 1]:
# rows split the actual Y (0 vs 1), columns split the predicted probability
# (below 0.75 vs 0.75 and above).
bins = np.array([0, 0.75, 1])
cm = np.histogram2d(x=y_test, y=pred_values, bins=bins)[0]
cm

array([[ 7.,  7.],
       [18., 22.]])

In [40]:
# Accuracy = diagonal of the 2x2 table (cm[0,0] + cm[1,1]) over the total count.
accuracy = np.trace(cm) / cm.sum()
accuracy

0.5370370370370371