In [17]:

import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier , VotingClassifier

In [18]:
loan=pd.read_csv("F:\\Python work\\titanic\\loan_application (1).csv")
loan

Unnamed: 0,Application_ID,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
0,LP001002,Male,No,0,Graduate,No,1,Urban,medium,Y
1,LP001003,Male,Yes,1,Graduate,No,1,Rural,medium,N
2,LP001005,Male,Yes,0,Graduate,Yes,1,Urban,low,Y
3,LP001006,Male,Yes,0,Not Graduate,No,1,Urban,low,Y
4,LP001008,Male,No,0,Graduate,No,1,Urban,medium,Y
...,...,...,...,...,...,...,...,...,...,...
506,LP002978,Female,No,0,Graduate,No,1,Rural,low,Y
507,LP002979,Male,Yes,3+,Graduate,No,1,Rural,medium,Y
508,LP002983,Male,Yes,1,Graduate,No,1,Urban,medium,Y
509,LP002984,Male,Yes,2,Graduate,No,1,Urban,medium,Y


In [19]:
def relevant_data(data_df):
    """
    Returns only the required columns from the data set
    :param data_df: raw pandas data frame
    :return: pandas data frame with relevant columns
    """
    data_df = data_df.drop('Application_ID', axis=1)
    return data_df

In [20]:
def cat2int(data_df):
    """
    Converts categorical values in to discret numeric values
    :param data_df: raw data frame
    :return: data frame with categorical converted to numerics
    """

    data_df['Dependents'] = data_df['Dependents'].map(
        lambda x: 4 if x == '3+' else int(x))

    data_df['Gender'] = data_df['Gender'].map(lambda x: 0 if x == 'No' else 1)

    data_df['Education'] = data_df['Education'].map(
        lambda x: 0 if x == 'Not Graduate' else 1)

    data_df['Married'] = data_df['Married'].map(
        lambda x: 0 if x == 'No' else 1)

    data_df['Property_Area'] = data_df['Property_Area'].map(
        lambda x: 0 if x == 'Urban' else 1 if x == 'Semiurban' else 2)

    data_df['Income'] = data_df['Income'].map(
        lambda x: 0 if x == 'low' else 1 if x == 'medium' else 2)

    data_df['Self_Employed'] = data_df['Self_Employed'].map(
        lambda x: 0 if x == 'No' else 1)

    return data_df

In [21]:
def get_x_y(data_df):
    """
    Returns X and y i.e. predictors and target variale from data set
    :param data_df: raw data frame
    :return: 2 pandas data frames
    """

    X = data_df.drop('Application_Status', axis=1)
    y = data_df.loc[:, 'Application_Status']

    return X, y

In [22]:
loan= relevant_data(loan)

In [23]:
loan= cat2int(loan)

In [24]:
loan.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
0,1,0,0,1,0,1,0,1,Y
1,1,1,1,1,0,1,2,1,N
2,1,1,0,1,1,1,0,0,Y
3,1,1,0,0,0,1,0,0,Y
4,1,0,0,1,0,1,0,1,Y


In [25]:
X,y = get_x_y(loan)

In [28]:
X_train, X_test, y_train, y_test= train_test_split(X,y, train_size=0.7, random_state=1)

In [29]:
rf= RandomForestClassifier()
lr= LogisticRegression()

In [30]:
vc_hard= VotingClassifier(estimators=[('rf', rf), ('lr', lr)])
vc_soft= VotingClassifier(estimators=[('rf', rf), ('lr', lr)], voting='soft')

In [31]:
print('Random Forest Results')
print('---------------------')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
print('')
print('Logistic Regression Results')
print('---------------------')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))
print('')
print('Voting Classifier Hard Results')
print('---------------------')
vc_hard.fit(X_train, y_train)
y_pred = vc_hard.predict(X_test)
print(classification_report(y_test, y_pred))
print('')
print('Voting Classifier Soft Results')
print('---------------------')
vc_soft.fit(X_train, y_train)
y_pred = vc_soft.predict(X_test)
print(classification_report(y_test, y_pred))
print('')

Random Forest Results
---------------------
              precision    recall  f1-score   support

           N       0.82      0.51      0.63        55
           Y       0.78      0.94      0.85        99

    accuracy                           0.79       154
   macro avg       0.80      0.72      0.74       154
weighted avg       0.79      0.79      0.77       154


Logistic Regression Results
---------------------
              precision    recall  f1-score   support

           N       0.96      0.47      0.63        55
           Y       0.77      0.99      0.87        99

    accuracy                           0.81       154
   macro avg       0.87      0.73      0.75       154
weighted avg       0.84      0.81      0.78       154


Voting Classifier Hard Results
---------------------
              precision    recall  f1-score   support

           N       0.80      0.51      0.62        55
           Y       0.77      0.93      0.84        99

    accuracy                     