# 误差的计算

In [2]:
# samples

import numpy as np
# Actual 0/1 labels, one entry per sample.
origin = np.asarray([1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0])
# Predicted 0/1 labels for the same samples, in the same order.
assumption = np.asarray([1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1])

In [7]:
# Error rate
def cal_error_rate(org, asp):
    """Return the fraction of samples whose predicted label differs from the actual one.

    Parameters
    ----------
    org : numpy array of actual labels.
    asp : numpy array of predicted labels, with the same length as ``org``.

    Returns
    -------
    Number of positions where ``org`` and ``asp`` differ, divided by the
    number of samples.

    Raises
    ------
    AssertionError
        If the two arrays differ in length along their first axis.
    """
    assert(org.shape[0] == asp.shape[0])
    # Compare the arguments themselves, not the notebook-level sample arrays.
    return np.sum(org != asp)/org.shape[0]

cal_error_rate(origin, assumption)

0.25

- 训练误差
    - training error | empirical error
- 泛化误差
    - generalization error

## 采样

### Cross Validation

> 将样本集 D 划分为 k 个大小相近的互斥子集；每次取其中 k-1 个子集作训练集、余下 1 个子集作测试集，如此重复 k 次，最后对 k 次测试结果取均值

## 均方误差 （MSE）

$ E(f;D) = \frac{1}{m}\sum_{i=1}^{m}(f(x_i)-y_i)^2 $

In [8]:
def cal_MSE(org, asp):
    """Return the mean squared error between actual and predicted values.

    Parameters
    ----------
    org : numpy array of actual values.
    asp : numpy array of predicted values, with the same length as ``org``.

    Returns
    -------
    Sum of the squared element-wise differences ``(org - asp)**2`` divided
    by the number of samples.

    Raises
    ------
    AssertionError
        If the two arrays differ in length along their first axis.
    """
    assert(org.shape[0] == asp.shape[0])
    # Use the arguments themselves, not the notebook-level sample arrays.
    return np.sum((org - asp)**2)/org.shape[0]

# Mean squared error of the sample predictions.
cal_MSE(org=origin, asp=assumption)

0.25

## Precision & Recall

- Precision = $ \frac{TP}{TP+FP} $
- Recall = $ \frac{TP}{TP+FN} $

P-R 图

In [14]:
# TFPN
def cal_TFPN(org, asp):
    """Split sample indices into the four confusion-matrix groups.

    Parameters
    ----------
    org : 1-D array-like of actual labels (1 = positive, 0 = negative).
    asp : 1-D array-like of predicted labels, with the same length as ``org``.

    Returns
    -------
    tuple ``(TP, TN, FP, FN)`` of 1-D integer index arrays, each sorted in
    ascending order:

    - TP: actual 1, predicted 1
    - TN: actual 0, predicted 0
    - FP: actual 0, predicted 1
    - FN: actual 1, predicted 0

    A sample whose actual or predicted label is neither 0 nor 1 is placed
    in none of the groups.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    org = np.asarray(org)
    asp = np.asarray(asp)
    assert(org.shape[0] == asp.shape[0])

    org_pos, org_neg = org == 1, org == 0
    asp_pos, asp_neg = asp == 1, asp == 0

    # flatnonzero returns an integer index array even when a group is empty;
    # building the array from an empty Python list would yield float64,
    # which cannot be used for indexing.
    TP = np.flatnonzero(org_pos & asp_pos)
    TN = np.flatnonzero(org_neg & asp_neg)
    FP = np.flatnonzero(org_neg & asp_pos)
    FN = np.flatnonzero(org_pos & asp_neg)
    return TP,TN,FP,FN
    
# Index groups (TP, TN, FP, FN) for the sample predictions.
cal_TFPN(org=origin, asp=assumption)

(array([ 0,  3,  4,  8, 10]), array([1, 6, 7, 9]), array([ 5, 11]), array([2]))

In [16]:
# P&R
def cal_precision(org, asp):
    """Return precision, TP / (TP + FP), for predictions ``asp`` against ``org``.

    The counts are the sizes of the index groups returned by ``cal_TFPN``.
    Raises ZeroDivisionError when there are no positive predictions
    (TP + FP == 0).
    """
    true_pos, _, false_pos, _ = cal_TFPN(org, asp)
    n_tp = len(true_pos)
    n_fp = len(false_pos)
    return n_tp / (n_tp + n_fp)

def cal_recall(org, asp):
    """Return recall, TP / (TP + FN), for predictions ``asp`` against ``org``.

    The counts are the sizes of the index groups returned by ``cal_TFPN``.
    Raises ZeroDivisionError when there are no actual positives
    (TP + FN == 0).
    """
    true_pos, _, _, false_neg = cal_TFPN(org, asp)
    n_tp = len(true_pos)
    n_fn = len(false_neg)
    return n_tp / (n_tp + n_fn)

# Precision and recall of the sample predictions, displayed as a (precision, recall) pair.
p, r = cal_precision(origin, assumption), cal_recall(origin, assumption)
p, r

(0.7142857142857143, 0.8333333333333334)

## F1 Score

> $ F_1 = \frac{2 \times P \times R}{P + R} = \frac{2 \times TP}{N + TP - TN} $，其中 $N$ 为样例总数（$N = TP + TN + FP + FN$）

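> $ F_\beta = \frac{(1+\beta^2) \times P \times R}{(\beta^2 \times P) + R} $，$\beta > 1$ 时 Recall 影响更大，$\beta < 1$ 时 Precision 影响更大，$\beta = 1$ 时即为 $F_1$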

In [18]:
def f1_score(org, asp):
    """Return the F1 score, 2*P*R / (P + R), of predictions ``asp`` against ``org``.

    P and R are obtained from ``cal_precision`` and ``cal_recall``.
    """
    precision = cal_precision(org, asp)
    recall = cal_recall(org, asp)
    numerator = 2 * precision * recall
    return numerator / (precision + recall)

# F1 score of the sample predictions.
f1_score(org=origin, asp=assumption)

0.7692307692307692

## BEP (Break-Even Point) 平衡点

> precision = recall

## ROC (Receiver Operating Characteristic)

> FPR-TPR

> $ FPR = \frac{FP}{TN+FP} $

> $ TPR = \frac{TP}{TP+FN} $

In [24]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve; the hard 0/1 predictions are passed as the scores.
roc_auc_score(y_true=origin, y_score=assumption)

0.75000000000000022

## 代价敏感错误率

### Cost curve

## 算法比较检验

### Hypothesis test

- 二项检验 Binomial test
- t-检验 t-test
- 交叉验证t检验
- McNemar 检验
- Friedman 检验+Nemenyi 后续检验

### 偏差-方差分解 (bias-variance decomposition)