In [1]:
import numpy as np
from mlcevaluator1 import mlcEvaluator1
from mlcevaluator2 import mlcEvaluator2
from mlctensor import mlcTensor
from sklearn.metrics import multilabel_confusion_matrix


## Synthetic example
Generate true and predicted labels

In [2]:
# Ground-truth (gt) and predicted (pred) label matrices for the toy example.
# Each row is one data instance and each column one label (9 instances, 3 labels);
# an all-zero row means no label is assigned to that instance.
gt = np.array([
    [1, 1, 0],
    [1, 1, 1],
    [0, 0, 0],
    [1, 0, 0],
    [1, 1, 0],
    [0, 0, 0],
    [1, 0, 0],
    [1, 1, 0],
    [1, 1, 0],
])

pred = np.array([
    [1, 1, 0],
    [1, 0, 1],
    [0, 0, 0],
    [1, 1, 1],
    [1, 1, 1],
    [0, 1, 1],
    [0, 1, 1],
    [1, 0, 1],
    [0, 0, 1],
])

The equations for computing the contribution of a single data instance $i$ to the confusion tensor implicitly assume that $\lvert T_i\rvert > 0$ and $\lvert P_i\rvert > 0$, i.e. that both the true labels and the predictions for the data instance $i$ have at least one label assigned. To cope with scenarios where the true labels or the predictions have no labels assigned, an additional class is included when computing the confusion tensor. This label, *unknown*, is added as the last element of each $T_i$ and $P_i$ vector.

## Multi-label Confusion Tensor
Compute raw Multi-Label Confusion Tensor and normalized Recall and Precision Confusion matrices

In [3]:
# Build the confusion-tensor evaluator from the ground-truth and predicted label matrices.
evalT = mlcTensor(gt, pred)
# Raw multi-label confusion tensor; later cells index it as MT[0,:,:] (recall counts)
# and MT[1,:,:] (precision counts).
# NOTE(review): the effect of unique=True is not visible here — confirm against mlcTensor.
MT = evalT.computeConfusionTensor(unique=True)
# Normalized Recall and Precision confusion matrices.
# NOTE(review): presumably derived from the tensor computed above — keep this call order.
RT = evalT.getRecall()
PT = evalT.getPrecision()

In [4]:
# Shape of the normalized Precision matrix; (4, 4) here, i.e. the 3 labels plus the extra *unknown* class.
PT.shape

(4, 4)

## Per-class Precision/Recall
Per-class Recall and Precision are defined as:
$$
R(k) = \frac{TP(k)}{TP(k)+FN(k)},\qquad P(k) = \frac{TP(k)}{TP(k)+FP(k)}
$$
where $k$ is the class index, $TP(k)$ stands for a number of correctly assigned labels, $FN(k)$ represents the number of cases where the relevant label $k$ was not assigned to an instance and $FP(k)$ is a number of instances with incorrectly assigned label $k$.

In [5]:
# Per-class Recall and Precision are the main diagonals of the normalized matrices.
rT = RT.diagonal()
pT = PT.diagonal()

# sep='\n' puts the heading and the rounded values on consecutive lines.
print('Recall values for each class:', rT.round(decimals=2), sep='\n')
print('Precision values for each class:', pT.round(decimals=2), sep='\n')


Recall values for each class:
[0.71 0.4  1.   0.5 ]
Precision values for each class:
[1.   0.4  0.14 1.  ]


## Evaluating Classifier performance over all labels 
Let $\boldsymbol{T}_i$ be the vector representing the set of true labels for data instance $i$ and $\boldsymbol{P}_i$ be the vector of predicted labels for the same instance. Vectors $\boldsymbol{T}_{i1}=\boldsymbol{P}_{i1}=\boldsymbol{T}_i\cap \boldsymbol{P}_i$ represent the correctly predicted labels. $\boldsymbol{T}_{i2}=\boldsymbol{T}_i\backslash \boldsymbol{P}_i$ is the set of true labels not predicted by the classifier, while $\boldsymbol{P}_{i2} = \boldsymbol{P}_i\backslash \boldsymbol{T}_i$ represents the incorrectly predicted labels. It is clear that $\boldsymbol{T}_i = \boldsymbol{T}_{i1} + \boldsymbol{T}_{i2}$ and $\boldsymbol{P}_i = \boldsymbol{P}_{i1}+\boldsymbol{P}_{i2}$.<br>
 - $TP(k)$ (True Positive) stands for a number of instances with correctly assigned label $k$
 - $FP(k)$ (False Positive) stands for a number of instances with incorrectly assigned label $k$
 - $FN(k)$ (False Negative) represents the number of cases where the relevant label $k$ was not assigned to an instance. 

Let $B(TP(k) , FP(k) , TN(k), FN(k))$ be some specific binary classification metric, $k = 1, ...,q$, where $q$ is the number of possible labels.

Label-based classification metrics for a classifier can be obtained using either a Macro-averaging or a Micro-averaging approach.

### Macro-averaging
Macro-averaging averages over all  categories, thus giving each category equal weight<br>
$B_{Macro} = \frac{1}{q}\sum\limits_{k=1}^q B\big[ TP(k), FP(k), TN(k), FN(k)\big]$

Macro-averaged Recall and Precision for a classifier can be computed as:<br>
$R_{Macro} = \frac{1}{q}\sum\limits_{k=1}^q \frac{TP(k)}{TP(k)+FN(k)} = \frac{1}{q}\sum\limits_{k=1}^q R(k)$

$P_{Macro} = \frac{1}{q}\sum\limits_{k=1}^q \frac{TP(k)}{TP(k)+FP(k)} = \frac{1}{q}\sum\limits_{k=1}^q P(k)$,

where $R(k)$ and $P(k)$ are per-class Recall and Precision.

In [6]:
# Macro-average: the unweighted mean of the per-class scores (every class counts equally).
RMacro = rT.mean()
PMacro = pT.mean()

print('Macro-averaged Classifier Recall:', RMacro.round(decimals=2))
print('Macro-averaged Classifier Precision:', PMacro.round(decimals=2))

Macro-averaged Classifier Recall: 0.65
Macro-averaged Classifier Precision: 0.64


### Micro-averaging
Micro-averaging averages over data instances, thus giving each sample equal weight<br>
$B_{Micro} = B\big[\sum\limits_{k=1}^q TP(k), \sum\limits_{k=1}^q FP(k), \sum\limits_{k=1}^q TN(k), \sum\limits_{k=1}^q FN(k)\big]$

Micro-averaged Recall and Precision for a classifier can be computed as:<br>
$R_{Micro} = \frac{\sum\limits_{k=1}^q TP(k)}{\sum\limits_{k=1}^q TP(k)+\sum\limits_{k=1}^q FN(k)}$

$P_{Micro} = \frac{\sum\limits_{k=1}^q TP(k)}{\sum\limits_{k=1}^q TP(k)+ \sum\limits_{k=1}^qFP(k)}$

True positive values for each class $TP(k), k=1, ...,q$ are represented by diagonal elements of raw confusion tensor in both Recall and Precision matrices:



In [7]:
# True positives per class: main diagonal of the first slice of the raw confusion tensor.
TP = MT[0].diagonal()
print(TP)

[5. 2. 1. 1.]


The false negative value $FN(k)$ for label $k$ can be computed as the sum of the corresponding row in the Recall matrix (the first element of the Confusion Tensor) minus the value of the diagonal element, i.e. the number of true positives for the same label:

In [8]:
# False negatives per class: row totals of the first tensor slice with the
# diagonal (true positives) subtracted.
FN = MT[0].sum(axis=1) - MT[0].diagonal()
print(FN)

[2. 3. 0. 1.]


False positive values $FP(k)$ are given by the sum of the corresponding column in the Precision matrix (the second element of the Confusion Tensor) minus the value of the diagonal element, i.e. the number of true positives for the same label:

In [9]:
# False positives per class: column totals of the second tensor slice with the
# diagonal (true positives) subtracted.
FP = MT[1].sum(axis=0) - MT[1].diagonal()
print(FP)

[0. 3. 6. 0.]


Micro-averaged Recall and Precision (averaged over all samples) can be easily computed using the above values:

In [10]:
# Micro-average: pool the counts over all classes, then form a single ratio.
# Recall pairs the pooled TP with the pooled FN, Precision with the pooled FP.
RMicro, PMicro = (TP.sum() / (TP.sum() + errors.sum()) for errors in (FN, FP))

print('Micro-averaged Classifier Recall:', RMicro.round(decimals=2))
print('Micro-averaged Classifier Precision:', PMicro.round(decimals=2))

Micro-averaged Classifier Recall: 0.6
Micro-averaged Classifier Precision: 0.5
