In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline
sns.set()

In [3]:
pima_df = pd.read_csv('../data/pima.csv', dtype = {'diabetes': 'int8'})
pima_df.head(3)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,1
1,1,85,66,29,0,26.6,0.351,31,1.1426,0
2,8,183,64,0,0,23.3,0.672,32,0.0,1


In [4]:
feature_cols = ['num_preg', 'insulin', 'bmi', 'age']
X = pima_df[feature_cols]
y = pima_df.diabetes

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
y_pred = logreg.predict(X_test)

# Classification Accuracy
> Percentage of Correct Predictions

> **Pros:**  
> Simple to understand

> **Cons:**  
> Doesn't specify **underlying distribution** of response values  
> Doesn't point out **types of errors** the classifier is making

In [7]:
metrics.accuracy_score(y_test, y_pred)

0.69270833333333337

# Null Accuracy
> Accuracy that could be achieved by always predicting the most frequent class  
> Even a dumb model can do that!  

> It's based on underlying distribution of data.  

> Hence The null accuracy forms the base line for a model accuracy

In [8]:
y_test.value_counts()

0    130
1     62
Name: diabetes, dtype: int64

In [9]:
y_test.mean()    # Percentage of Ones

0.32291666666666669

In [10]:
1 - y_test.mean()    # Percentage of Zeroes

0.67708333333333326

In [11]:
max(y_test.mean(), 1 - y_test.mean())    # Null Accuracy

0.67708333333333326

In [12]:
y_test.value_counts().head(1) / len(y_test)    # Null Accuracy

0    0.677083
Name: diabetes, dtype: float64