In [55]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import model_selection
import time

In [24]:
df = pd.read_csv("filtered_df.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,8.430327,7.319202,4.859812,5.888878,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,8.006701,0.0,4.204693,5.888878,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,7.857094,7.765993,4.795791,5.888878,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,8.699681,0.0,4.955827,5.888878,1.0,Urban,Y
4,LP001011,Male,Yes,2,Graduate,Yes,8.597482,8.342125,5.590987,5.888878,1.0,Urban,Y


#### Encoding Categorical Variable

In [8]:
#Encoding categorical features or variable and creating dummies
X = df[['Gender','Married','Dependents','Education',
       'Self_Employed','ApplicantIncome','CoapplicantIncome',
       'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']]

Y = df['Loan_Status']

X = pd.get_dummies(data=X, drop_first=True)

In [10]:
X.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,8.430327,7.319202,4.859812,5.888878,1.0,True,True,True,False,False,False,False,False,False
1,8.006701,0.0,4.204693,5.888878,1.0,True,True,False,False,False,False,True,False,True
2,7.857094,7.765993,4.795791,5.888878,1.0,True,True,False,False,False,True,False,False,True
3,8.699681,0.0,4.955827,5.888878,1.0,True,False,False,False,False,False,False,False,True
4,8.597482,8.342125,5.590987,5.888878,1.0,True,True,False,True,False,False,True,False,True


#### Data Splitting

In [13]:
#Create Training and Test Sets and Apply Scaling
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)
X_train.head(3)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
442,8.111928,0.0,4.26268,5.888878,1.0,True,False,False,False,False,False,False,False,True
434,9.224243,0.0,5.56452,5.888878,1.0,True,True,False,False,True,False,True,True,False
397,8.188411,7.160846,4.615121,5.888878,1.0,True,False,False,False,False,True,False,False,False


In [15]:
y_train.head(3)

442    Y
434    Y
397    N
Name: Loan_Status, dtype: object

#### Scaling the variables

In [18]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Building Predictive Model

#### Logistics Regression

In [22]:
log_reg = LogisticRegression().fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(log_reg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(log_reg.score(X_test, y_test)))

Accuracy of Logistic regression classifier on training set: 0.82
Accuracy of Logistic regression classifier on test set: 0.78


In [26]:
y_pred = log_reg.predict(X_test)

In [28]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[16, 26],
       [ 0, 78]], dtype=int64)

In [30]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           N       1.00      0.38      0.55        42
           Y       0.75      1.00      0.86        78

    accuracy                           0.78       120
   macro avg       0.88      0.69      0.70       120
weighted avg       0.84      0.78      0.75       120



#### Using Cross_Validation

The technique of cross validation (CV) is best explained by example using the most common method, K-Fold CV. When we approach a machine learning problem, 

1. we make sure to split our data into a training and a testing set. 
2. In K-Fold CV, we further split our training set into K number of subsets, called folds. 
3. We then iteratively fit the model K times (i.e fit 5 models in 5 Folds CV), each time training the data on K-1 of the folds and evaluating on the Kth fold (called the validation data). 

As an example, consider fitting a model with K = 5. The first iteration we train on the first four folds and evaluate on the fifth. The second time we train on the first, second, third, and fifth fold and evaluate on the fourth. We repeat this procedure 3 more times, each time evaluating on a different fold. 

4. At the very end of training, we average the performance on each of the folds to come up with final validation metrics for the model.

For hyperparameter tuning, we perform many iterations of the entire K-Fold CV process, each time using different model settings. We then compare all of the models, select the best one, train it on the full training set, and then evaluate on the testing set. 

Instead of following this manual process RandomSearchCV and GridSearch automate the process.

In [35]:

#seed = 7
#kfold = model_selection.KFold(n_splits=10, random_state=seed)
kfold = model_selection.KFold(n_splits=10)
model = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
print(("Accuracy: %.3f (%.3f)") % (results.mean(), results.std()))

Accuracy: 0.814 (0.060)


#### Hyperparameter Tuning for Logistic Regression

When creating a ML model it is often difficult to define which model architecture that will optimise the model.  A model is optimised at a point where the model can strike a balance between overfitting and underfitting. Hence, GridSearch and RandomSearch are two method of searching for the optimum point for a model optimization by exploring a range of different possibilities. 

1. The objective of the model parameter is to learn the patterns in the input features for it to be able to predict the output feature. Model parameter are learned by the model during the training process.

2. Hyperparameter's object is to reduce the loss. i.e achieving a point where the model is optimised.

#### 1. Using GridSearch 

GridSearch can be used to find the optimum parameters of the logistic regressor.

In [39]:
dual=[True,False]
max_iter=[2.5,3,3.5,4,4.5]
param_grid=dict(dual=dual,max_iter=max_iter)

In [41]:
log_reg = LogisticRegression(penalty='l2')
grid = GridSearchCV(estimator=log_reg,param_grid=param_grid,cv=10,n_jobs=1)

start_time=time.time()
grid_result = grid.fit(X_train,y_train)

#summary of results
print ("Best: %f using %s" %(grid_result.best_score_,grid_result.best_params_))
print ("Execution time: " + str((time.time()- start_time)) + 'ms')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best: 0.769444 using {'dual': False, 'max_iter': 4}
Execution time: 0.7709364891052246ms


 0.70555556        nan 0.76944444        nan]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
#summary of results
print ("Best: %f using %s" %(grid_result.best_score_,grid_result.best_params_))
print ("Execution time: " + str((time.time()- start_time)) + 'ms')

Best: 0.769444 using {'dual': False, 'max_iter': 4}
Execution time: 30.320932388305664ms


In [45]:
dual=[True,False]
max_iter=[1,2,3,4,5]
C = [1.0,1.5,2.0,2.5]
param_grid=dict(dual=dual,max_iter=max_iter)

In [47]:
log_reg = LogisticRegression(penalty='l2')
grid = GridSearchCV(estimator=log_reg,param_grid=param_grid,cv=10,n_jobs=1)

start_time=time.time()
grid_result = grid.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [49]:
#summary of results
print("Best: %f using %s" %(grid_result.best_score_,grid_result.best_params_))
print ("Execution time: " + str((time.time()- start_time)) + 'ms')

Best: 0.805556 using {'dual': False, 'max_iter': 5}
Execution time: 9.608636617660522ms


#### 2. Random Search

In [57]:
random = RandomizedSearchCV(estimator=log_reg, param_distributions=param_grid, cv=10, n_jobs=-1)
start_time=time.time()
random_result = random.fit(X_train,y_train)



50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\seune\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\seune\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\seune\AppData\Local\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
            

In [59]:
#summary of results
print("Best: %f using %s" %(random_result.best_score_,random_result.best_params_))
print ("Execution time: " + str((time.time()- start_time)) + 'ms')

Best: 0.805556 using {'max_iter': 5, 'dual': False}
Execution time: 24.58357858657837ms
