# Imports

In [1]:
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sys
import os
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt


# Fetch and manipulate data

In [2]:
# Pima Indians Diabetes dataset, read straight from GitHub (needs network access).
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
# Column labels applied to the file; the last one, 'class', is the target used below.
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
# header=None is what pandas already infers when `names` is supplied: the first
# line of the file is parsed as data rather than as a header row.
dataframe = pd.read_csv(url, header=None, names=names)
dataframe.head()


Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
dataframe.shape  # (n_rows, n_columns); the column count should equal len(names) == 9

(768, 9)

In [4]:
# Work on the underlying 2-D NumPy array so columns can be sliced by position.
array = dataframe.to_numpy()
# Features: all rows, every column except the last one (the 8 predictor columns).
X = array[:, :-1]
# Target / labels: all rows, last column only (the 'class' column).
Y = array[:, -1]

# Split our data into Train and Test sets

In [5]:
import joblib  # NOTE(review): not used in any cell visible here
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Hold out 33% of the rows as the test set; a fixed seed makes the split repeatable.
test_size = 0.33
seed = 7  # 42 is the other value that was tried
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [6]:
X_train.shape  # (n_training_rows, n_features)

(514, 8)

In [7]:
X_test.shape  # (n_test_rows, n_features)

(254, 8)

## Fit a logistic regression model to the training set

In [8]:
# Build the classifier. max_iter=400 raises the iteration cap above scikit-learn's
# default of 100; the solver is left at its default ('lbfgs').
logistic_reg = LogisticRegression(max_iter=400)
# Train on the training split only. fit() returns the estimator, which is what
# the cell displays.
logistic_reg.fit(X=X_train, y=Y_train)

LogisticRegression(max_iter=400)

In [9]:
# Predict class labels for the held-out test set with the fitted model.
# This cell only produces and prints the predictions; no score is computed here.

Y_pred= logistic_reg.predict(X_test)
print(Y_pred)

[0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0.
 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0.
 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1.]


In [10]:
# 10-fold cross-validated accuracy, estimated on the training split.
# cross_val_score fits a fresh clone of `logistic_reg` on each fold, so the
# estimator fitted earlier is left untouched.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(logistic_reg, X_train, Y_train, cv=10)
print('Cross-Validation Accuracy Scores', scores)
print()
# Summary statistics over the ten per-fold accuracies.
for label, value in (
    ('Min_accuracy=', scores.min()),
    ('Mean_accuracy=', scores.mean()),
    ('Max_accuracy=', scores.max()),
):
    print(label, value)

Cross-Validation Accuracy Scores [0.80769231 0.76923077 0.82692308 0.76923077 0.60784314 0.80392157
 0.82352941 0.78431373 0.78431373 0.7254902 ]

Min_accuracy= 0.6078431372549019
Mean_accuracy= 0.7702488687782806
Max_accuracy= 0.8269230769230769


In [11]:
# Accuracy on the held-out test set.
#
# NOTE: cross_val_score does NOT score the model fitted above. It clones
# `logistic_reg` and refits the clone on folds of whatever data it is given, so
# the call below trains ten new models on subsets of the test rows. The numbers
# it prints are a second cross-validation on a smaller sample, not the test
# accuracy of the trained model. They are kept for comparison only.
# (cross_val_score is already imported in the preceding cell.)
scores = cross_val_score(logistic_reg, X_test, Y_test, cv=10)
print('Cross-Validation Accuracy Scores', scores)
print()
print('Min_accuracy=',scores.min())
print('Mean_accuracy=',scores.mean())
print('Max_accuracy=',scores.max())

# The actual test-set accuracy: score the model that was fitted on the training
# split against rows it has never seen. For a classifier, score() returns the
# mean accuracy of predict(X_test) against Y_test.
test_accuracy = logistic_reg.score(X_test, Y_test)
print('Test_accuracy=', test_accuracy)

Cross-Validation Accuracy Scores [0.80769231 0.80769231 0.88461538 0.76923077 0.84       0.68
 0.68       0.68       0.68       0.8       ]

Min_accuracy= 0.68
Mean_accuracy= 0.7629230769230768
Max_accuracy= 0.8846153846153846


# Other ways to measure goodness for logistic regression
* ROC curve / AUC
* Brier score

You shouldn't use RMSE as an evaluation metric for a logistic regression problem. Be clear about which evaluation metrics apply to regression problems and which apply to classification problems.

**For Regression:** MAE, MSE, RMSE, RMSLE (root mean squared log error), R-squared (simple & adjusted)

**For Classification:** Accuracy, Precision, Recall, F1 Score, ROC curve / AUC, log loss (for predicted probabilities)

Because logistic regression is a classification method, you should use one of the classification metrics listed above.

# つづく