# Model Selection and Evaluation

---

## Logistic Regression

In [1]:
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [2]:
#load prepared dataset from notebook
loan_prepared = np.loadtxt('loan_prepared.txt')
loan_labels = np.loadtxt('loan_labels.txt')

In [3]:
#load logistic regression model
log_reg = LogisticRegression()

In [4]:
#fit model onto data
log_reg.fit(loan_prepared, loan_labels)

LogisticRegression()

In [5]:
#calculate scores of predictions using cross validation
loan_predictions = log_reg.predict(loan_prepared)
scores = cross_val_score(log_reg, loan_prepared, loan_labels, cv=10, scoring="f1")
scores

array([0.94952681, 0.94285714, 0.94585987, 0.93533123, 0.93512658,
       0.94015748, 0.94285714, 0.94536817, 0.94071146, 0.94043887])

In [6]:
#f1 score mean
scores.mean()

0.9418234770028555

In [7]:
print(metrics.classification_report(loan_labels, loan_predictions))

              precision    recall  f1-score   support

         0.0       0.83      0.65      0.73      1496
         1.0       0.92      0.97      0.94      6166

    accuracy                           0.90      7662
   macro avg       0.87      0.81      0.83      7662
weighted avg       0.90      0.90      0.90      7662



In [8]:
print("f1 score: ", scores.mean())

f1 score:  0.9418234770028555


---

## Random Forest Classifier

In [9]:
#load random forest classifier model
rfc = RandomForestClassifier(n_estimators=100, max_features='sqrt')

In [10]:
#fit model onto data
rfc.fit(loan_prepared, loan_labels)

RandomForestClassifier(max_features='sqrt')

In [11]:
#calculate socres of predcitions using cross validation
loan_predictions = rfc.predict(loan_prepared)
scores = cross_val_score(rfc, loan_prepared, loan_labels, cv=10, scoring="f1")
scores

array([0.98955823, 0.99274778, 0.99595142, 0.99515347, 0.99112187,
       0.99193548, 0.99190939, 0.99675325, 0.99273608, 0.99112187])

In [12]:
#f1 score mean
scores.mean()

0.9928988845499938

In [13]:
print(metrics.classification_report(loan_labels, loan_predictions))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1496
         1.0       1.00      1.00      1.00      6166

    accuracy                           1.00      7662
   macro avg       1.00      1.00      1.00      7662
weighted avg       1.00      1.00      1.00      7662



In [14]:
#tune hyperparameters using grid search
param_grid = {'n_estimators': [50, 100, 150],'max_features': ['auto', 'sqrt', 'log2']}
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 10, scoring="f1")
grid_search.fit(loan_prepared, loan_labels)

GridSearchCV(cv=10, estimator=RandomForestClassifier(max_features='sqrt'),
             param_grid={'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [50, 100, 150]},
             scoring='f1')

In [15]:
#find best hyperparameters
grid_search.best_estimator_

RandomForestClassifier(max_features='log2', n_estimators=150)

In [16]:
#mean f1 score after tuning
print("f1 score: ", grid_search.cv_results_['mean_test_score'].max())

f1 score:  0.9929784195100145
