# Model Selection and Evaluation

---

## Logistic Regression

In [1]:
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
#load prepared dataset from notebook
loan_prepared = np.loadtxt('loan_prepared.txt')
loan_labels = np.loadtxt('loan_labels.txt')

In [3]:
#load logistic regression model
log_reg = LogisticRegression()

In [4]:
#fit model onto data
log_reg.fit(loan_prepared, loan_labels)

LogisticRegression()

In [5]:
#make predictions
loan_predictions = log_reg.predict(loan_prepared)

In [6]:
#create scoring metric for training set
def train_scores(model):
    for scores in ['f1','precision','recall','accuracy']:
        cvs = cross_val_score(model, loan_prepared, loan_labels, scoring=scores, cv=10).mean()
        print(scores + " : "+ str(cvs))

In [7]:
#calculate scores of predictions using cross validation
train_scores(log_reg)

f1 : 0.941672046632743
precision : 0.9184927762967652
recall : 0.9661058957250205
accuracy : 0.9036781601369823


---

## Random Forest Classifier

In [8]:
#load random forest classifier model
rfc = RandomForestClassifier(n_estimators=100, max_features='sqrt')

In [9]:
#fit model onto data
rfc.fit(loan_prepared, loan_labels)

RandomForestClassifier(max_features='sqrt')

In [10]:
#make predictions
loan_predictions = rfc.predict(loan_prepared)

In [11]:
#calculate scores of predictions using cross validation
train_scores(rfc)

f1 : 0.9924942496071786
precision : 0.9890811828489869
recall : 0.996594329495464
accuracy : 0.9874716180840888


In [12]:
#tune hyperparameters using grid search
param_grid = {'n_estimators': [100, 150, 200],'max_features': ['sqrt', 'log2']}
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 10, scoring="f1")
grid_search.fit(loan_prepared, loan_labels)

GridSearchCV(cv=10, estimator=RandomForestClassifier(max_features='sqrt'),
             param_grid={'max_features': ['sqrt', 'log2'],
                         'n_estimators': [100, 150, 200]},
             scoring='f1')

In [13]:
#find best hyperparameters
grid_search.best_params_

{'max_features': 'log2', 'n_estimators': 150}

In [14]:
#mean f1 score after tuning
print("f1 :", grid_search.cv_results_['mean_test_score'].max())

f1 : 0.9928160731114899


---

## Scores from Test Set

In [15]:
#use test set
test_features = np.loadtxt('test_features.txt')
test_labels = np.loadtxt('test_labels.txt')

In [16]:
#random forest classifier with hyperparameter tuning
rfc = RandomForestClassifier(n_estimators = 150, max_features='log2')

In [17]:
#fit model onto data
rfc.fit(test_features, test_labels)

RandomForestClassifier(max_features='log2', n_estimators=150)

In [18]:
#make predictions
test_predictions = rfc.predict(test_features)

In [19]:
#create scoring metric for testing set
def test_scores(model):
    for scores in ['f1','precision','recall','accuracy']:
        cvs = cross_val_score(model, test_features, test_labels, scoring=scores, cv=10).mean()
        print(scores + " : "+ str(cvs))

In [20]:
#calculate scores of predictions using cross validation
test_scores(rfc)

f1 : 0.9903982612184375
precision : 0.9816415679642102
recall : 0.9987054880603268
accuracy : 0.9843559118673648
