# Logistic Regression: Fit and evaluate a model

In [28]:
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix

In [29]:
tr_features=pd.read_csv('train_features.csv')
tr_labels=pd.read_csv('train_labels.csv')

te_features=pd.read_csv('test_features.csv')
te_labels=pd.read_csv('test_labels.csv')

In [30]:
def print_results(results):
    print('Best PARAMS: {}\n'.format(results.best_params_))
    means=results.cv_results_['mean_test_score']
    stds=results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}' .format(round(mean,3), round(std*2,3), params))

In [31]:
lr=LogisticRegression()
parameters={
        'C':[0.01, 0.01, 0.1,1, 10, 100, 1000],
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2']
}

In [32]:
cv=GridSearchCV(lr, parameters, cv=5)

In [33]:
cv.fit(tr_features, tr_labels.values.ravel())

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.01, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [34]:
print_results(cv)

Best PARAMS: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

0.624 (+/-0.007) for {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.736 (+/-0.069) for {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.624 (+/-0.007) for {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.736 (+/-0.069) for {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.796 (+/-0.115) for {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
0.807 (+/-0.126) for {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.809 (+/-0.108) for {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
0.811 (+/-0.109) for {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
0.802 (+/-0.125) for {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
0.802 (+/-0.118) for {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.802 (+/-0.118) for {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.802 (+/-0.118) for {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.802 (+/-0.118) for {'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
0.

In [35]:
print(cv.best_estimator_)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)


In [36]:
scores = cross_val_score(lr, tr_features, tr_labels.values.ravel(), cv=5)
scores

array([0.85981308, 0.82242991, 0.71962617, 0.78504673, 0.86792453])

### Write out pickled model

In [37]:
joblib.dump(cv.best_estimator_, 'LR_model.pkl')

['LR_model.pkl']

In [38]:
prob_te=cv.predict_proba(te_features)
prob_te

array([[0.06985861, 0.93014139],
       [0.64459595, 0.35540405],
       [0.83860885, 0.16139115],
       [0.26662706, 0.73337294],
       [0.62546758, 0.37453242],
       [0.83860885, 0.16139115],
       [0.89094758, 0.10905242],
       [0.83549072, 0.16450928],
       [0.88387185, 0.11612815],
       [0.83853863, 0.16146137],
       [0.87069604, 0.12930396],
       [0.81858802, 0.18141198],
       [0.89095202, 0.10904798],
       [0.84239716, 0.15760284],
       [0.88202621, 0.11797379],
       [0.88187976, 0.11812024],
       [0.08753087, 0.91246913],
       [0.60977607, 0.39022393],
       [0.84666449, 0.15333551],
       [0.84953905, 0.15046095],
       [0.89766733, 0.10233267],
       [0.29379379, 0.70620621],
       [0.89629875, 0.10370125],
       [0.39698417, 0.60301583],
       [0.88746573, 0.11253427],
       [0.87547196, 0.12452804],
       [0.42126075, 0.57873925],
       [0.83861008, 0.16138992],
       [0.83783922, 0.16216078],
       [0.64901119, 0.35098881],
       [0.

In [39]:
pred_te = cv.predict(te_features)
pred_te

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1], dtype=int64)

In [40]:
print(classification_report(te_labels, pred_te))
print(confusion_matrix(te_labels, pred_te))

              precision    recall  f1-score   support

           0       0.80      0.83      0.81       113
           1       0.68      0.63      0.66        65

    accuracy                           0.76       178
   macro avg       0.74      0.73      0.73       178
weighted avg       0.76      0.76      0.76       178

[[94 19]
 [24 41]]
