In [1]:
import numpy as np
import pandas as pd

# Scoring and Metrics

In [2]:
# Toy dataset. Pattern to learn: positive x => True, negative x => False.
df = pd.DataFrame({
    "x": [15, -2, 99, -10, 8, 5, -1000, -1],
    "y": [True, False, True, False, True, True, False, False],
})
# No shuffling: the first four rows are used for fitting, the last four for evaluation.
train = df.iloc[:4]
test = df.iloc[4:]

In [3]:
# rows 0-3 of df: the rows the model is fit on
train

Unnamed: 0,x,y
0,15,True
1,-2,False
2,99,True
3,-10,False


In [4]:
# rows 4-7 of df: held out from fitting and used only for scoring
test

Unnamed: 0,x,y
4,8,True
5,5,True
6,-1000,False
7,-1,False


In [5]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training rows, then label the held-out rows.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression().fit(train[["x"]], train["y"])
model.predict(test[["x"]])

array([ True, False, False, False])

In [6]:
# for a classifier, .score(X, y) predicts on X and reports the fraction of labels that match y
model.score(test[["x"]], test["y"])   # by default, this uses accuracy_score

0.75

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix
# accuracy = fraction of positions where the two sequences agree;
# only the first pair ("A" vs "A") matches here, so 1 of 2 => 0.5
accuracy_score(["A", "B"], ["A", "C"])

0.5

In [8]:
# the explicit form of model.score(...): compare the true test labels with the predictions
accuracy_score(test["y"], model.predict(test[["x"]]))

0.75

# Confusion Matrices

In [9]:
# Nine animals, position-aligned: predicted[i] is the guess for actual[i].
# Mistakes: the 4th dog is called a cat, and the 2nd cat is called a dog.
actual = ["dog"] * 4 + ["cat"] * 3 + ["mouse"] * 2
predicted = ["dog"] * 3 + ["cat"] * 2 + ["dog", "cat"] + ["mouse"] * 2

In [10]:
# ROW: an actual category
# COL: a prediction
# with no labels= argument, rows and columns follow the sorted labels: cat, dog, mouse
# e.g. row 0 (actual cat): 2 predicted as cat, 1 predicted as dog
confusion_matrix(actual, predicted)

array([[2, 1, 0],
       [1, 3, 0],
       [0, 0, 2]])

In [11]:
# labels= fixes the row/column order (instead of sorted order) and may name
# categories that never occur: "horse" gets an all-zero row and column
labels = ["dog", "cat", "mouse", "horse"]
cm = confusion_matrix(actual, predicted, labels=labels)
cm

array([[3, 1, 0, 0],
       [1, 2, 0, 0],
       [0, 0, 2, 0],
       [0, 0, 0, 0]])

In [12]:
# wrap the raw array in a DataFrame so rows (actual) and columns (predicted) are named
pd.DataFrame(cm, index=labels, columns=labels)

Unnamed: 0,dog,cat,mouse,horse
dog,3,1,0,0
cat,1,2,0,0
mouse,0,0,2,0
horse,0,0,0,0


# Multi-Class Metrics

In [16]:
# 199 copies of a 10-example pattern (4 dogs, 4 cats, 2 mice) in which every
# second cat is mistaken for a dog, followed by 10 horses of which 9 are
# called "dog" and only the last is called "horse".
actual = (["dog"] * 4 + ["cat"] * 4 + ["mouse"] * 2) * 199 + ["horse"] * 10
predicted = (["dog"] * 4 + ["cat", "dog"] * 2 + ["mouse"] * 2) * 199 + ["dog"] * 9 + ["horse"]

# Labelled confusion matrix in one step: rows are actual classes, columns are
# predicted classes, both in the order given by `labels`.
labels = ["dog", "cat", "mouse", "horse"]
cm = pd.DataFrame(
    confusion_matrix(actual, predicted, labels=labels),
    index=labels,
    columns=labels,
)
cm

Unnamed: 0,dog,cat,mouse,horse
dog,796,0,0,0
cat,398,398,0,0
mouse,0,0,398,0
horse,9,0,0,1


In [17]:
# what is the recall for cat?
# recall = correctly predicted cats / all actual cats (the sum of the "cat" row)
cm.at["cat", "cat"] / cm.loc["cat", :].sum()

0.5

In [18]:
from sklearn.metrics import recall_score, precision_score, balanced_accuracy_score

In [22]:
# average=None returns one recall per class, in the order given by labels
# (labels is printed first so each number can be matched to its class)
print(labels)
recall_score(actual, predicted, average=None, labels=labels)

['dog', 'cat', 'mouse', 'horse']


array([1. , 0.5, 1. , 0.1])

In [23]:
# overall fraction correct; the 10 horse rows (out of 2000) barely affect it
accuracy_score(actual, predicted)

0.7965

In [24]:
# average of recall scores
# will this be bigger or smaller than accuracy?
# (smaller here: horse's 0.1 recall counts as much as any other class,
#  even though horses are only 10 of the 2000 rows)
recall_score(actual, predicted, average=None, labels=labels).mean()

0.65

In [25]:
# average of recall scores is called balanced accuracy score
# (unweighted mean of the per-class recalls, so it equals the .mean() of recall_score(..., average=None))
balanced_accuracy_score(actual, predicted)

0.65

In [28]:
# one precision per class, in labels order
# precision = correct predictions of a class / all predictions of that class
# dog is below 1 because some cats and horses were also predicted as dog
print(labels)
precision_score(actual, predicted, average=None, labels=labels)

['dog', 'cat', 'mouse', 'horse']


array([0.66167914, 1.        , 1.        , 1.        ])

In [29]:
# re-display the matrix: a class's precision is its diagonal entry over its column sum,
# e.g. dog: 796 / (796 + 398 + 9)
cm

Unnamed: 0,dog,cat,mouse,horse
dog,796,0,0,0
cat,398,398,0,0
mouse,0,0,398,0
horse,9,0,0,1


# Binary Classification Metrics

Unless otherwise specified, "precision" and "recall" refer to those metrics for the positive class when we're doing binary classification.

In [30]:
# One character per example: "T" => True, "F" => False.
# The two strings line up position by position (13 examples each).
actual = [flag == "T" for flag in "FTTTTFFTTTTTT"]
predicted = [flag == "T" for flag in "FTTTTTTFFFTTT"]
# rows = actual [False, True], columns = predicted [False, True]
confusion_matrix(actual, predicted)

array([[1, 2],
       [3, 7]])

In [31]:
# per-class recall in [False, True] order:
# 1 of 3 actual Falses and 7 of 10 actual Trues were predicted correctly
recall_score(actual, predicted, average=None, labels=[False, True])

array([0.33333333, 0.7       ])

In [32]:
# for binary classification, we have False recall and True recall
# "recall" is shorthand for "True recall"
# (with the defaults, only the positive class -- True -- is reported)
recall_score(actual, predicted)

0.7

In [34]:
# per-class precision in [False, True] order:
# 1 of 4 False predictions and 7 of 9 True predictions were correct
precision_score(actual, predicted, average=None, labels=[False, True])

array([0.25      , 0.77777778])

In [33]:
# for binary classification, we have False precision and True precision
# "precision" is shorthand for "True precision"
precision_score(actual, predicted)

0.7777777777777778