In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
# Path to the loan dataset: <current working directory>/data/loan_problem_3.csv
filr = os.path.join(
    os.getcwd(),
    'data',
    'loan_problem_3.csv',
)

# Load the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filr)
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [19]:
# Per-column dtype and non-null counts; the frame has 614 rows and 13 columns.
df.info()

# Columns fed to the categorical branch of the preprocessing pipeline.
cat_features = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Dependents',
]

# Columns fed to the numeric branch of the preprocessing pipeline.
num_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [18]:
# Keep only the rows that have no missing value in any column. The later
# cells (shape check, preprocessing pipeline, label encoding) all read this
# frame as `df1`, so it must be defined for the notebook to run top to bottom.
df1 = df.dropna()

# Frequency of each 'Dependents' category in the raw frame (the column holds
# strings such as '3+', so it is treated as categorical).
df['Dependents'].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [5]:
# (rows, columns) of df1, the frame the modelling cells below operate on.
df1.shape

(480, 13)

In [10]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from categorical_encoder import CategoricalEncoder,DataFrameSelector

In [11]:
from sklearn.pipeline import FeatureUnion

In [20]:
# Numeric branch: DataFrameSelector is given the numeric column list
# (presumably it extracts those columns -- project helper, confirm), and the
# result is standardised with StandardScaler.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_features)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: same selector helper with the categorical column list,
# followed by the project's CategoricalEncoder in "onehot-dense" mode.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_features)),
    ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
])

In [21]:
# Run both branches on the same input and join their outputs side by side:
# numeric features first, then the encoded categorical features.
full_pipeline = FeatureUnion(
    transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ],
)

In [22]:
# Fit the preprocessing transformers on df1 and build the model matrix
# (numeric block first, then the categorical block, per the FeatureUnion order).
# NOTE(review): the transformers are fit on every row of df1 before the
# train/test split that happens later, so held-out rows influence the
# scaling/encoding -- consider fitting on the training rows only.
X = full_pipeline.fit_transform(df1)

In [25]:
# Encode the target column as integer codes; y_cats[code] maps a code back to
# its label. sort=True assigns codes in sorted label order instead of
# order of first appearance, so the encoding no longer depends on which label
# happens to be in the first row of df1. With the 'N'/'Y' labels seen in the
# data preview this gives N -> 0 and Y -> 1; code 1 is the class that the
# precision/recall/F1/ROC AUC computations further down treat as positive.
y, y_cats = df1['Loan_Status'].factorize(sort=True)

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [29]:
# Logistic regression classifier created with the library's default settings.
logreg = LogisticRegression()

In [30]:
# Train on the training split; the cell output is the fitted estimator's repr.
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
# Predicted class codes for the held-out rows (used for the accuracy score).
preds = logreg.predict(X_test)

In [35]:
# Predicted class probabilities for the held-out rows; the second column
# (index 1) is what the ROC AUC computation consumes.
proba  = logreg.predict_proba(X_test)

In [37]:
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score
from sklearn.model_selection import cross_val_score

In [47]:
# Hold-out metrics for the fitted model, each rounded to two decimals.
# Accuracy uses the hard predictions; ROC AUC uses the class-1 probabilities.
logreg_accuracy = round(accuracy_score(y_test, preds), 2)
logreg_roc_auc = round(roc_auc_score(y_test, proba[:, 1]), 2)

print("Accuracy: " + str(logreg_accuracy))
print("ROC_AUC: " + str(logreg_roc_auc))

Accuracy: 0.81
ROC_AUC: 0.78


In [46]:
# Cross-validated accuracy on the training split with 5 and with 10 folds.
# A fresh default LogisticRegression is passed to each run.
cv_5 = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=5, scoring='accuracy',
)
cv_10 = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=10, scoring='accuracy',
)

# Report the mean fold score, rounded to two decimals.
print("CV 5 Accuracy: " + str(round(cv_5.mean(), 2)))
print("CV 10 Accuracy: " + str(round(cv_10.mean(), 2)))

CV 5 Accuracy: 0.81
CV 10 Accuracy: 0.81


In [48]:
# Cross-validated ROC AUC on the training split with 5 and with 10 folds.
# A fresh default LogisticRegression is passed to each run.
cv_5_roc_auc = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=5, scoring='roc_auc',
)
cv_10_roc_auc = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=10, scoring='roc_auc',
)

# Report the mean fold score, rounded to two decimals.
print("CV 5 ROC_AUC: " + str(round(cv_5_roc_auc.mean(), 2)))
print("CV 10 ROC_AUC: " + str(round(cv_10_roc_auc.mean(), 2)))

CV 5 ROC_AUC: 0.73
CV 10 ROC_AUC: 0.74


In [49]:
# Cross-validated F1 score on the training split with 5 and with 10 folds.
# A fresh default LogisticRegression is passed to each run.
cv_5_f1 = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=5, scoring='f1',
)
cv_10_f1 = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=10, scoring='f1',
)

# Report the mean fold score, rounded to two decimals.
print("CV 5 F1: " + str(round(cv_5_f1.mean(), 2)))
print("CV 10 F1: " + str(round(cv_10_f1.mean(), 2)))

CV 5 F1: 0.87
CV 10 F1: 0.88


In [50]:
# Cross-validated precision and recall on the training split, once with
# 5 folds and once with 10. A fresh default LogisticRegression is passed to
# each run.
cv_5_precision = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=5, scoring='precision',
)
cv_5_recall = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=5, scoring='recall',
)
cv_10_precision = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=10, scoring='precision',
)
cv_10_recall = cross_val_score(
    LogisticRegression(), X_train, y_train, cv=10, scoring='recall',
)

# Report the mean fold score for each metric, rounded to two decimals.
print("CV 5 Precision: " + str(round(cv_5_precision.mean(), 2)))
print("CV 5 Recall: " + str(round(cv_5_recall.mean(), 2)))
print("CV 10 Precision: " + str(round(cv_10_precision.mean(), 2)))
print("CV 10 Recall: " + str(round(cv_10_recall.mean(), 2)))

CV 5 Precision: 0.8
CV 5 Recall: 0.96
CV 10 Precision: 0.8
CV 10 Recall: 0.96
