In [1]:
# - 10.1 Algorithm Evaluation Metrics

In [2]:
# http://scikit-learn.org/stable/modules/model_evaluation.html

In [3]:
# 10.1 Algorithm Evaluation Metrics

In [4]:
# - Choice of metrics influences how the performance of machine learning algorithms is measured
# and compared.

# - They influence how you weigh the importance of different characteristics in the results and 
# your ultimate decision of which algorithm to choose.

# - The cross_val_score function (from sklearn.model_selection) reports every score so that it can be
# sorted in ascending order (largest score is best). Some evaluation metrics (like mean squared error)
# are naturally descending scores (the smallest score is best) and as such are reported negative by
# cross_val_score (e.g. 'neg_mean_squared_error'). This is important to note, because some scores will
# be reported negative that by definition can never be negative.

In [5]:
# 10.2 Classification Metrics

In [6]:
# - classification accuracy

# - logarithmic loss

# - area under roc curve

# - confusion matrix

#- classification report

In [7]:
from pandas import read_csv

In [8]:
import numpy

In [9]:
import sys

In [10]:
def print_data(_data, rows=5):
    """Write the leading rows of an array to stdout with three decimal places.

    Parameters
    ----------
    _data : array-like
        One- or two-dimensional numeric data.
    rows : int, optional
        Number of leading rows (or elements, for 1-D input) to print.
        Defaults to 5.

    Returns
    -------
    None
        The formatted text is written directly to ``sys.stdout``.
    """
    # Slicing only the first axis keeps this working for 1-D vectors as well
    # as 2-D tables; savetxt prints a 1-D array one value per line.
    numpy.savetxt(sys.stdout, numpy.asarray(_data)[:rows], '%5.3f')

In [11]:
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'

In [12]:
_col_names = ['preg','plas','pres','skin','test','mass','pedi','age','class']

In [13]:
_dataframe = read_csv(_uri, names=_col_names)

In [14]:
_array = _dataframe.values

In [15]:
print_data(_array)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000 1.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000 0.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000 1.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000 0.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000 1.000


In [16]:
_X = _array[:,0:8]

In [17]:
print_data(_X)

6.000 148.000 72.000 35.000 0.000 33.600 0.627 50.000
1.000 85.000 66.000 29.000 0.000 26.600 0.351 31.000
8.000 183.000 64.000 0.000 0.000 23.300 0.672 32.000
1.000 89.000 66.000 23.000 94.000 28.100 0.167 21.000
0.000 137.000 40.000 35.000 168.000 43.100 2.288 33.000


In [18]:
_Y = _array[:,8:]

In [19]:
print_data(_Y)

1.000
0.000
1.000
0.000
1.000


In [20]:
_Y = numpy.ravel(_Y)

In [21]:
print(_Y[:5])

[ 1.  0.  1.  0.  1.]


In [22]:
from sklearn.model_selection import KFold

In [23]:
from sklearn.model_selection import cross_val_score

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
# random_state is only meaningful together with shuffle=True, and recent scikit-learn releases
# raise a ValueError when it is passed without shuffling; the folds remain consecutive and unshuffled
_kfold = KFold(n_splits=10)

In [26]:
# pin the solver explicitly: newer scikit-learn releases default to 'lbfgs', which may not converge
# within the default iteration budget on unscaled features; 'liblinear' matches the estimator
# settings recorded in this notebook's output
_model = LogisticRegression(solver='liblinear')

In [27]:
# 10.2.1 Classification Accuracy

In [28]:
# - accuracy = correct predictions / all predictions made

# - most common evaluation metric and the most misused

# - It is only suitable when there are an equal number of observations in each class
# (which is rarely the case) and that all predictions and prediction errors are equally
# important, which is often not the case.

In [29]:
_scoring = 'accuracy'

In [30]:
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [31]:
'{:.2%}'.format(_score.mean())

'76.95%'

In [32]:
'{:.2%}'.format(_score.std())

'4.84%'

In [33]:
# 10.2.2 Logarithmic Loss

In [34]:
# - Logarithmic loss (or logloss) evaluates the predictions of probabilities of membership to 
# a given class

# - The scalar probability between 0 and 1 can be seen as a measure of confidence for a prediction
# by an algorithm.

# - Predictions that are correct or incorrect are rewarded or punished proportionally to the confidence
# of the prediction.

In [35]:
_scoring = 'neg_log_loss'

In [36]:
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [37]:
# '.3f' sets the precision to three decimals (a bare '3f' only sets a minimum field width)
'{:.3f}'.format(_score.mean())

'-0.493'

In [38]:
# '.3f' sets the precision to three decimals (a bare '3f' only sets a minimum field width)
'{:.3f}'.format(_score.std())

'0.047'

In [39]:
# - Smaller logloss is better with 0 representing a perfect logloss.

# - As mentioned above, the measure is inverted to be ascending when using cross_val_score()

In [40]:
# 10.2.3 Area Under ROC Curve

In [41]:
# - AUC is used for binary classification to discriminate between positive and negative classes.

# - An area of 1.0 = model made all predictions perfectly

# - An area of 0.5 = model is as good as random

# - A binary classification problem is a trade-off between sensitivity and specificity

# - Sensitivity = Recall = True Positive Rate = proportion of instances from the positive (first)
# class that were actually predicted correctly.

# - Specificity = True Negative Rate = proportion of instances from the negative (second) class
# that were actually predicted correctly.

In [42]:
_scoring = 'roc_auc'

In [43]:
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [44]:
'{:.3f}'.format(_score.mean())

'0.824'

In [45]:
'{:.3f}'.format(_score.std())

'0.041'

In [46]:
# - AUC is close to 1 and greater than 0.5 suggesting some skill in the predictions

In [47]:
# 10.2.4 Confusion Matrix

In [48]:
# - The confusion matrix is a handy presentation of the accuracy of a model with two or more classes.

# - The table presents predictions on the x-axis and actual outcomes on the y-axis.

# - The cells of the table are the number of predictions made by an algorithm.

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
from sklearn.metrics import confusion_matrix

In [51]:
# hold out 33% of the rows as a test set; the fixed seed keeps the split repeatable
_X_train, _X_test, _Y_train, _Y_test = train_test_split(
    _X,
    _Y,
    test_size=0.33,
    random_state=7,
)

In [52]:
_model.fit(_X_train, _Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [53]:
_predicted = _model.predict(_X_test)

In [54]:
_matrix = confusion_matrix(_Y_test, _predicted)

In [55]:
_matrix

array([[141,  21],
       [ 41,  51]])

In [56]:
# - We can see that the majority of the predictions fall on the diagonal line of the matrix,
# which are correct predictions.

# - https://en.wikipedia.org/wiki/Confusion_matrix

In [57]:
# 10.2.5 Classification Report

In [58]:
# - The classification report displays the precision, recall, F1-score and support
# for each class.

# - The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number
# of false positives. 
# The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.

# - The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number 
# of false negatives. 
# The recall is intuitively the ability of the classifier to find all the positive samples.

# - The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta 
# score reaches its best value at 1 and worst score at 0.
# The F-beta score weights recall more than precision by a factor of beta. beta == 1.0 means recall and precision 
# are equally important.

# - The support is the number of occurrences of each class in y_true.

# - http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

# - http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html

In [59]:
from sklearn.metrics import classification_report

In [60]:
_report = classification_report(_Y_test, _predicted)

In [61]:
print(_report)

             precision    recall  f1-score   support

        0.0       0.77      0.87      0.82       162
        1.0       0.71      0.55      0.62        92

avg / total       0.75      0.76      0.75       254



In [62]:
# 10.3 Regression Metrics

In [63]:
# - Mean Absolute Error

# - Mean Squared Error

# - R^2

In [64]:
from sklearn.linear_model import LinearRegression

In [65]:
_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'

In [66]:
_col_names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']

In [67]:
# sep=r'\s+' splits on runs of whitespace; it is the documented equivalent of the deprecated
# delim_whitespace=True flag
_dataframe = read_csv(_uri, sep=r'\s+', names=_col_names)

In [68]:
_dataframe.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [69]:
_array = _dataframe.values

In [70]:
print_data(_array)

0.006 18.000 2.310 0.000 0.538 6.575 65.200 4.090 1.000 296.000 15.300 396.900 4.980 24.000
0.027 0.000 7.070 0.000 0.469 6.421 78.900 4.967 2.000 242.000 17.800 396.900 9.140 21.600
0.027 0.000 7.070 0.000 0.469 7.185 61.100 4.967 2.000 242.000 17.800 392.830 4.030 34.700
0.032 0.000 2.180 0.000 0.458 6.998 45.800 6.062 3.000 222.000 18.700 394.630 2.940 33.400
0.069 0.000 2.180 0.000 0.458 7.147 54.200 6.062 3.000 222.000 18.700 396.900 5.330 36.200


In [71]:
_X = _array[:,0:13]

In [72]:
print_data(_X)

0.006 18.000 2.310 0.000 0.538 6.575 65.200 4.090 1.000 296.000 15.300 396.900 4.980
0.027 0.000 7.070 0.000 0.469 6.421 78.900 4.967 2.000 242.000 17.800 396.900 9.140
0.027 0.000 7.070 0.000 0.469 7.185 61.100 4.967 2.000 242.000 17.800 392.830 4.030
0.032 0.000 2.180 0.000 0.458 6.998 45.800 6.062 3.000 222.000 18.700 394.630 2.940
0.069 0.000 2.180 0.000 0.458 7.147 54.200 6.062 3.000 222.000 18.700 396.900 5.330


In [73]:
_Y = _array[:,13:]

In [74]:
print_data(_Y)

24.000
21.600
34.700
33.400
36.200


In [75]:
_Y = numpy.ravel(_Y)

In [76]:
print(_Y[:5])

[ 24.   21.6  34.7  33.4  36.2]


In [77]:
# random_state is only meaningful together with shuffle=True, and recent scikit-learn releases
# raise a ValueError when it is passed without shuffling; the folds remain consecutive and unshuffled
_kfold = KFold(n_splits=10)

In [78]:
_model = LinearRegression()

In [79]:
# 10.3.1 Mean Absolute Error

In [80]:
# - The Mean Absolute Error (or MAE) is the average of the absolute differences between predictions and
# actual values.

# - It gives an idea of how wrong the predictions were. 

# - The measure gives an idea of the magnitude of the error, but no idea of the direction 
# (e.g. over or under predicting).

In [81]:
_scoring = 'neg_mean_absolute_error'

In [82]:
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [83]:
'{:.3f}'.format(_score.mean())

'-4.005'

In [84]:
'{:.3f}'.format(_score.std())

'2.084'

In [85]:
# - A value of 0 indicates no error or perfect predictions.

# - Like logloss, this metric is inverted by the cross_val_score()

In [86]:
# 10.3.2 Mean Squared Error

In [87]:
# - The Mean Squared Error (or MSE) is much like the mean absolute error in that it provides a gross 
# idea of the magnitude of error. 

# - Taking the square root of the mean squared error converts the units back to the original units 
# of the output variable and can be meaningful for description and presentation. 

# - This is called the Root Mean Squared Error (or RMSE).

In [88]:
_scoring = 'neg_mean_squared_error'

In [89]:
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [90]:
'{:.3f}'.format(_score.mean())

'-34.705'

In [91]:
'{:.3f}'.format(_score.std())

'45.574'

In [92]:
# - This metric too is inverted so that the results are increasing.

In [93]:
import math

In [94]:
'{:.3f}'.format(math.sqrt(abs(_score.mean())))

'5.891'

In [95]:
# - this is the root mean squared error (RMSE)

In [96]:
# 10.3.3 R^2 Metric

In [97]:
# - The R2 (or R Squared) metric provides an indication of the goodness of fit of a set of 
# predictions to the actual values. 

# - In statistical literature this measure is called the coefficient of determination. 

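# - This is typically a value between 0 and 1 for no-fit and perfect fit respectively; it can be
# negative when the model fits worse than always predicting the mean of the target.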

In [98]:
_scoring = 'r2'

In [99]:
_score = cross_val_score(_model, _X, _Y, cv=_kfold, scoring=_scoring)

In [100]:
'{:.3f}'.format(_score.mean())

'0.203'

In [101]:
'{:.3f}'.format(_score.std())

'0.595'

In [102]:
# - The predictions have a poor fit to the actual values with a value closer to zero and less than 0.5