In [1]:
# for regression: mean absolute error, mean squared error, root mean squared error
# for classification: classification accuracy, confusion_matrix, roc_auc 

In [3]:
# Pima Indians diabetes dataset, read straight from the UCI repository URL
import pandas as pd

url = "http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

# header=None: the first line of the file is treated as data, so the
# column names are supplied explicitly
col_names = [
    'pregnant', 'glucose', 'bp', 'skin', 'insulin',
    'bmi', 'pedigree', 'age', 'label',
]
pima = pd.read_csv(url, names=col_names, header=None)

# preview the first rows
pima.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# predictors: a four-column subset of the frame; response: the 'label' column
feature_cols = ['pregnant', 'insulin', 'bmi', 'age']
x = pima[feature_cols]
y = pima['label']

In [7]:
from sklearn.model_selection import train_test_split

# hold out part of the data for evaluation; the fixed random_state keeps
# the shuffle (and therefore every metric below) reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
)

In [8]:
from sklearn.linear_model import LogisticRegression

# logistic regression with the library's default settings,
# trained on the training split only
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# hard class predictions (0/1) for the held-out test set
y_pred = logreg.predict(x_test)

In [10]:
# the first metric is accuracy, which is the easiest to understand:
# the fraction of test samples whose predicted label equals the true label
from sklearn import metrics

test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

0.6927083333333334


In [12]:
# null accuracy: the accuracy achieved by always predicting the most frequent class.
# start by counting how many test samples fall into each class
y_test.value_counts()

0    130
1     62
Name: label, dtype: int64

In [14]:
# percentage of zeros: y_test.mean() is the share of ones (labels are 0/1),
# so one minus it is the share of zeros (the more frequent class in this test set)
1 - y_test.mean()

0.6770833333333333

In [15]:
# null accuracy in one step (valid for 0/1-coded binary labels only):
# the larger of the share of ones and the share of zeros
positive_share = y_test.mean()
max(positive_share, 1 - positive_share)

0.6770833333333333

In [16]:
# why accuracy alone is not a good measure: a classifier can assign every sample to a single class and still get good accuracy,
# and there might be a pattern such as all 0's being predicted correctly while the 1's are predicted poorly.
# so accuracy does not reveal the underlying distribution of the response values, and it does not tell what type of errors the classifier is making

In [17]:
# the second metric is the confusion matrix:
# arguments are passed as (true labels, predicted labels)
confusion_table = metrics.confusion_matrix(y_test, y_pred)
print(confusion_table)

[[118  12]
 [ 47  15]]


In [18]:
# keep the matrix and pull out its four cells by name.
# row index = actual class, column index = predicted class, so flattening
# row by row yields [0,0], [0,1], [1,0], [1,1] = TN, FP, FN, TP
confusion = metrics.confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = confusion.ravel()

In [19]:
# accuracy: correct predictions (TP + TN) over all predictions,
# computed by hand and then cross-checked against scikit-learn
total_predictions = float(TP + TN + FP + FN)
print((TP + TN) / total_predictions)
print(metrics.accuracy_score(y_test, y_pred))

0.6927083333333334
0.6927083333333334


In [20]:
# classification error (misclassification rate): the complement of accuracy
accuracy_value = metrics.accuracy_score(y_test, y_pred)
print(1 - accuracy_value)

0.30729166666666663


In [22]:
# precision: when a positive value is predicted, how often is the prediction correct?
# by hand: TP over everything predicted positive; then the library value for comparison
predicted_positives = float(TP + FP)
print(TP / predicted_positives)

precision = metrics.precision_score(y_test, y_pred)
print(precision)

0.5555555555555556
0.5555555555555556


In [23]:
# recall (sensitivity): when the actual value is positive, how often is the prediction correct?
# by hand: TP over everything actually positive; then the library value for comparison
actual_positives = float(TP + FN)
print(TP / actual_positives)

recall = metrics.recall_score(y_test, y_pred)
print(recall)

0.24193548387096775
0.24193548387096775


In [24]:
# which evaluation metric to optimize depends on the business case
# eg 1: spam filter, in which a false negative (keeping spam mail in the inbox) is more acceptable than
# a false positive (sending non-spam email to the spam folder); thus we need to minimize false positives, hence precision should be maximized
# eg 2: fraud transaction detector: a false negative (a transaction is fraud but you say it is harmless) is more harmful than
# a false positive (not fraud but you say it is fraud); hence false negatives need to be minimized, hence recall maximized

In [30]:
# AUC (area under the curve) is the percentage of the ROC plot that lies underneath the curve.
# it is computed from probabilities rather than hard labels, so take the
# predict_proba column for class 1: the probability that the member has diabetes (y=1)
positive_class_column = 1
y_pred_prob = logreg.predict_proba(x_test)[:, positive_class_column]

# peek at the first ten probabilities
print(y_pred_prob[:10])

[0.36752429 0.28356344 0.28895886 0.4141062  0.15896027 0.17065156
 0.49889026 0.51341541 0.27678612 0.67189438]


In [28]:
# unlike accuracy, AUC remains useful when there is a class imbalance problem
auc_score = metrics.roc_auc_score(y_test, y_pred_prob)
print(auc_score)

0.7245657568238213


In [32]:
# cross-validated AUC: score the model with ROC AUC on each of 10 folds of the
# full data set, then average the per-fold scores
from sklearn.model_selection import cross_val_score

auc_per_fold = cross_val_score(logreg, x, y, cv=10, scoring='roc_auc')
auc_per_fold.mean()

0.7378233618233618

In [33]:
#confusion_matrix advantage: can calculate variety of metrics. useful for multi-class problems
#roc_auc advantage: does not need classification threshold, useful in class imbalance scenarios