In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split, RandomizedSearchCV

In [2]:
train = pd.read_csv(r"D:\Data Science\Projects\Analytics vidya\loan Eligibility Prediction\train.csv")
test = pd.read_csv(r"D:\Data Science\Projects\Analytics vidya\loan Eligibility Prediction\test.csv")
sample_sub = pd.read_csv(r"D:\Data Science\Projects\Analytics vidya\loan Eligibility Prediction\sample_submission_49d68Cx.csv")

In [3]:
train.shape, test.shape

((614, 13), (367, 12))

In [4]:
train.isna().sum().sum(), test.isna().sum().sum()

(149, 84)

### Label Encoding Target Variable

In [5]:
train['Loan_Status'].replace({'Y' : 1, 'N' : 0}, inplace = True)

---
## Merging Data

In [6]:
train['is_train'] = 1
test['is_train'] = 0
test['Loan_Status'] = None


data = pd.concat((train,test))

data.set_index('Loan_ID',inplace = True)

In [7]:
data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,is_train
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,1,1
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0,1
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1,1
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1,1
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1,1


In [8]:
# Imputing Categorical Features by Mode
# Imputing Continuous Features by Median
data['Gender'].fillna('Male',inplace = True )
data['Married'].fillna('Yes', inplace = True)
data['Dependents'].fillna('0', inplace = True)
data['Self_Employed'].fillna('No', inplace = True)
data['LoanAmount'].fillna(126.0, inplace=True)
data['Loan_Amount_Term'].fillna(360, inplace=True)
data['Credit_History'].fillna(1, inplace=True)

---
# Encoding

### Label Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

col_ls = ['Gender', 'Married', 'Dependents', 'Education','Self_Employed','Loan_Amount_Term', 'Credit_History', 'Property_Area']

data[col_ls] = data[col_ls].apply(le.fit_transform)

---
# Min Max Scaling

In [10]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [11]:
col_ls = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

In [12]:
data[col_ls] = scaler.fit_transform(data[col_ls])

---
## Validation Split

In [13]:
train = data[data['is_train'] == 1].copy()
test = data[data['is_train'] == 0].copy()

In [14]:
test.drop(['Loan_Status'], axis = 1, inplace=True)
train.drop(['is_train'], axis=1, inplace=True)
test.drop(['is_train'], axis=1, inplace=True)

train['Loan_Status'] = train['Loan_Status'].astype(int)
train = train[ [col for col in train if col!='Loan_Status'] + ['Loan_Status']]

In [15]:
train['Loan_Status'] = train['Loan_Status'].astype(int)

In [16]:
train.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [17]:
X_train, X_test, y_train, y_test = train_test_split(train.iloc[:,:-1], train['Loan_Status'], test_size = 0.30,random_state = 1999)

---
# Model Validation

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,roc_auc_score, f1_score

In [19]:
model_dict = {}

model_dict['LogisticRegression'] = LogisticRegression(max_iter = 200, solver='saga')
model_dict['DecisionTreeClassifier'] = DecisionTreeClassifier()
model_dict['RandomForestClassifier'] = RandomForestClassifier()
model_dict['AdaBoostClassifier'] = AdaBoostClassifier()
model_dict['BaggingClassifier'] = BaggingClassifier()
model_dict['XGBoost'] = XGBClassifier()
model_dict['LGBM'] = LGBMClassifier()
model_dict['Cat'] = CatBoostClassifier(verbose=False)

In [20]:
def model_test(X_train, X_test, y_train, y_test,model,model_name):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print('======================================{}======================================='.format(model_name))
    print('Accuracy is : {}'.format(accuracy))
    print('F1 Score is {}'.format(f1))
    print('Predicted Class unique Values : {}'.format(np.unique(y_pred)))
    print()

In [21]:
for model_name,model in model_dict.items():
    model_test(X_train, X_test, y_train, y_test, model, model_name)

Accuracy is : 0.8216216216216217
F1 Score is 0.8842105263157896
Predicted Class unique Values : [0 1]

Accuracy is : 0.745945945945946
F1 Score is 0.8142292490118578
Predicted Class unique Values : [0 1]

Accuracy is : 0.7945945945945946
F1 Score is 0.8602941176470589
Predicted Class unique Values : [0 1]

Accuracy is : 0.7945945945945946
F1 Score is 0.8602941176470589
Predicted Class unique Values : [0 1]

Accuracy is : 0.7351351351351352
F1 Score is 0.8108108108108109
Predicted Class unique Values : [0 1]

Accuracy is : 0.7621621621621621
F1 Score is 0.8345864661654135
Predicted Class unique Values : [0 1]

Accuracy is : 0.7675675675675676
F1 Score is 0.8389513108614233
Predicted Class unique Values : [0 1]

Accuracy is : 0.8162162162162162
F1 Score is 0.8785714285714287
Predicted Class unique Values : [0 1]



---
# Final Submission

In [22]:
X_train = train.iloc[:,:-1]
y_train = train['Loan_Status']
X_test = test

In [23]:
def final_pred_with_csv(X_train, X_test, y_train,model,model_name,sample_sub):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    submit = sample_sub.copy()
    submit['Loan_Status'] = y_pred
    submit['Loan_Status'].replace({1:'Y', 0:'N'}, inplace = True)
    address = "D:\\Data Science\\Projects\\Analytics vidya\\loan Eligibility Prediction\\Submission\\" + model_name + '.csv'
    submit.to_csv(address, index=False)
    print(model_name, 'Done')

In [24]:
"D:\Data Science\Projects\Analytics vidya\loan Eligibility Prediction\Submission"

'D:\\Data Science\\Projects\\Analytics vidya\\loan Eligibility Prediction\\Submission'

In [25]:
for model_name,model in model_dict.items():
    final_pred_with_csv(X_train, X_test, y_train,model, model_name,sample_sub)

LogisticRegression Done
DecisionTreeClassifier Done
RandomForestClassifier Done
AdaBoostClassifier Done
BaggingClassifier Done
XGBoost Done
LGBM Done
Cat Done


In [60]:
cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']

In [63]:
#params = {'reg_lambda': 1000,
# 'min_child_samples': 78,
# 'max_depth': 4,
# 'learning_rate': 0.1}

model = CatBoostClassifier(verbose=False)
model.fit(X_train, y_train)

preds = model.predict(X_test)

In [64]:
sample_sub['Loan_Status'] = preds
sample_sub['Loan_Status'].replace({1:'Y', 0:'N'}, inplace = True)
sample_sub.to_csv(r"D:\Data Science\Projects\Analytics vidya\loan Eligibility Prediction\Submission\cat_tuned.csv", index=False)

In [65]:
dict(zip(model.feature_names_,model.feature_importances_ ))

{'Gender': 1.6029568456077978,
 'Married': 3.0161873983881247,
 'Dependents': 7.220447688211917,
 'Education': 3.0016137500294384,
 'Self_Employed': 1.5676655036250344,
 'ApplicantIncome': 14.821728474601095,
 'CoapplicantIncome': 12.6601459342618,
 'LoanAmount': 13.421838095768834,
 'Loan_Amount_Term': 4.3751872188775565,
 'Credit_History': 28.283267086928046,
 'Property_Area': 10.02896200370037}

## Insights:

1. Results are better with Label Encoding than One Hot Encoding
2. Most Important feature is 'Credit_History'

In [28]:
hyperparam_combs = {
    'max_depth': [i for i in range(1,15)],
    'learning_rate' : [0.1,0.2,0.3,0.4,0.01, 0.02, 0.03, 0.04,0.05],
    #'num_leaves' : [i for i in range(2,50)],
    'reg_lambda' : [1,2,3,4,5,10,15,20,50,100,200,500,1000,2000],
    'min_child_samples' : [i for i in range(2,100)]
}

clf = RandomizedSearchCV(CatBoostClassifier(verbose=False),
                         hyperparam_combs,
                         scoring='accuracy',
                         random_state=1,
                         n_iter=10,
                        verbose = 100,
                        n_jobs = -2)

search = clf.fit(X_train, y_train)

search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(6, 614), dtype=int32).
Pickling array (shape=(3, 614), dtype=float64).
Pickling array (shape=(2, 614), dtype=int64).
Pickling array (shape=(6,), dtype=object).
Pickling array (shape=(3,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(6,), dtype=int64).
Pickling array (shape=(614,), dtype=int32).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(491,), dtype=int32).
Pickling array (shape=(123,), dtype=int32).
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(6, 614), dtype=int32).
Pickling array (shape=(3, 614), dtype=float64).
Pickling array (shape=(2, 614), dtype=int64).
Pickling array (shape=(6,), dtype=object).
Pickling arra

Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(6, 614), dtype=int32).
Pickling array (shape=(3, 614), dtype=float64).
Pickling array (shape=(2, 614), dtype=int64).
Pickling array (shape=(6,), dtype=object).
Pickling array (shape=(3,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(6,), dtype=int64).
Pickling array (shape=(614,), dtype=int32).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(492,), dtype=int32).
Pickling array (shape=(122,), dtype=int32).
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-2)]: Done   5 tasks      | elapsed:   11.1s
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(6, 614), dtype=int32).
Pickling array (shape=(3, 614), dtype=float64).
Pickling array (shape=(2, 614), dtype=int64).
Pickling array (shape=(6,), dtype=object).
Pickling array (shape=(3,),

[Parallel(n_jobs=-2)]: Done  16 tasks      | elapsed:  1.4min
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(6, 614), dtype=int32).
Pickling array (shape=(3, 614), dtype=float64).
Pickling array (shape=(2, 614), dtype=int64).
Pickling array (shape=(6,), dtype=object).
Pickling array (shape=(3,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(6,), dtype=int64).
Pickling array (shape=(614,), dtype=int32).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(491,), dtype=int32).
Pickling array (shape=(123,), dtype=int32).
[Parallel(n_jobs=-2)]: Done  17 tasks      | elapsed:  1.4min
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(6, 614), dtype=int32).
Pickling array (shape=(3, 614), dtype=float64).
Pickling array (shape=(2, 614), dtype=int64).
Pickling array (shape=(6,), dtype=object).
Pickling array (shape=(3,),

[Parallel(n_jobs=-2)]: Done  29 tasks      | elapsed:  2.0min
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(6, 614), dtype=int32).
Pickling array (shape=(3, 614), dtype=float64).
Pickling array (shape=(2, 614), dtype=int64).
Pickling array (shape=(6,), dtype=object).
Pickling array (shape=(3,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(6,), dtype=int64).
Pickling array (shape=(614,), dtype=int32).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(491,), dtype=int32).
Pickling array (shape=(123,), dtype=int32).
[Parallel(n_jobs=-2)]: Done  30 out of  50 | elapsed:  2.0min remaining:  1.3min
Pickling array (shape=(11,), dtype=object).
Pickling array (shape=(614,), dtype=object).
Pickling array (shape=(6, 614), dtype=int32).
Pickling array (shape=(3, 614), dtype=float64).
Pickling array (shape=(2, 614), dtype=int64).
Pickling array (shape=(6,), dtype=object).
Pickling

{'reg_lambda': 1000,
 'min_child_samples': 78,
 'max_depth': 4,
 'learning_rate': 0.1}