In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 逻辑回归【只支持二分类】

In [None]:
import os, sys
sys.path.append(os.path.abspath('..'))
from playML.model_selection import train_test_split
from playML.LogisticRegression import LogisticRegression
from sklearn.datasets import load_iris

In [None]:
# 1. Load the iris data set.
iris = load_iris()
X = iris.data
y = iris.target

# 2.1. Preprocessing: build a binary problem by keeping only the samples
#      labelled 0 or 1 and only the first two feature columns.
#      X must be filtered first: the mask is computed from the unfiltered y.
X = X[y < 2, :2]
y = y[y < 2]

# 2.2. Split into training and test sets with the playML helper.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_ratio=0.2, seed=666)

# 3. Fit the playML logistic regression and evaluate it on the test split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_predict = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)
score = log_reg.score(X_test, y_test)

print("y_true:    ", y_test)
print("y_predict: ", y_predict)
print("y_proba:   ", y_proba)
print("准确度：    ", score)

# 4. Plot: training samples are coloured by their true label (default marker);
#    test samples are drawn as stars and coloured by their *predicted* label.
plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='red')
plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='blue')
plt.scatter(X_test[y_predict == 0, 0], X_test[y_predict == 0, 1], color='red', marker='*')
plt.scatter(X_test[y_predict == 1, 0], X_test[y_predict == 1, 1], color='blue', marker='*')
plt.show()

# 决策边界

In [None]:
def plot_decision_boundary(model, axis):
    """Shade a rectangular region by the class ``model`` predicts at each point.

    Args:
        model: fitted classifier whose ``predict`` accepts a two-column array.
        axis: bounds of the region as ``[x_min, x_max, y_min, y_max]``.

    The region is sampled at 100 grid points per axis unit. The filled contour
    is drawn on the current matplotlib figure; nothing is returned and the
    figure is not shown, so callers can overlay scatter plots afterwards.
    """
    from matplotlib.colors import ListedColormap

    x_min, x_max, y_min, y_max = axis[0], axis[1], axis[2], axis[3]
    xs = np.linspace(x_min, x_max, int((x_max - x_min) * 100))
    ys = np.linspace(y_min, y_max, int((y_max - y_min) * 100))
    grid_x, grid_y = np.meshgrid(xs, ys)

    # One row per grid point: (x coordinate, y coordinate).
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    labels = model.predict(grid_points).reshape(grid_x.shape)

    palette = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    plt.contourf(grid_x, grid_y, labels, cmap=palette)

## 逻辑回归的决策边界【直线】

In [None]:
# Shade the regions predicted by the fitted logistic regression, then overlay
# every binary sample (train and test) coloured by its true label.
plot_decision_boundary(log_reg, axis=[4, 7.5, 1.5, 4.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='red')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue')
plt.show()

## knn的决策边界

In [None]:
from sklearn.neighbors import KNeighborsClassifier

### 二分类

In [None]:
# Fit scikit-learn's k-nearest-neighbours classifier with its default settings
# on the training split currently bound to X_train / y_train.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

In [None]:
# Shade the regions predicted by the fitted kNN classifier, then overlay the
# samples in X coloured by their true label.
plot_decision_boundary(knn_clf, axis=[4, 7.5, 1.5, 4.5])
plt.scatter(X[y == 0, 0], X[y == 0, 1], color='red')
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='blue')
plt.show()

### 多分类

In [None]:
# Build the three-class data set: every iris sample, first two feature columns only.
X_all = iris.data[:, :2]
y_all = iris.target

# Draw the kNN decision boundary.
def knn_decision_boundary(k=5):
    """Fit a k-nearest-neighbours classifier on the three-class data and plot it.

    Args:
        k: number of neighbours passed to ``KNeighborsClassifier``.

    Reads the notebook globals ``X_all`` and ``y_all``; shows the figure and
    returns nothing.
    """
    # 1. Train the model.
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_all, y_all)

    # 2. Shade the predicted regions and overlay the samples by true class.
    plot_decision_boundary(clf, axis=[4, 8, 1.5, 4.5])
    for label, colour in ((0, 'red'), (1, 'blue'), (2, 'green')):
        members = X_all[y_all == label]
        plt.scatter(members[:, 0], members[:, 1], color=colour)
    plt.show()

In [None]:
knn_decision_boundary(k=5)    # kNN: the smaller k is, the more complex the model and the more easily it overfits

In [None]:
# Refit and redraw with 20 neighbours, for comparison with the other k values.
knn_decision_boundary(k=20)

In [None]:
# Refit and redraw with 50 neighbours, for comparison with the other k values.
knn_decision_boundary(k=50)

# 逻辑回归中添加多项式特征
## 支持决策边界为曲线

In [None]:
# Reproducible synthetic data: 200 points drawn from a 2-D standard normal.
np.random.seed(666)
X = np.random.normal(0, 1, size=(200, 2))
# Label 1 for points strictly inside the circle x0^2 + x1^2 < 1.5, otherwise 0.
y = np.array(X[:, 0]**2 + X[:, 1]**2 < 1.5, dtype='int')

In [None]:
# Scatter the two classes; each call takes the next colour of matplotlib's default cycle.
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.show()

In [None]:
import os, sys
sys.path.append(os.path.abspath('..'))
from playML.LogisticRegression import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

  
def PolynomialLogisticRegression(degree=2):
    """Build a polynomial logistic-regression pipeline.

    Args:
        degree: polynomial degree handed to ``PolynomialFeatures``.

    Returns:
        An unfitted ``Pipeline`` that expands the features polynomially,
        standardises them, and then applies ``LogisticRegression`` (imported
        from ``playML`` in this cell).
    """
    steps = [
        ("ploy", PolynomialFeatures(degree)),   # 1. polynomial feature expansion
        ("std_scaler", StandardScaler()),       # 2. feature standardisation
        ("log_reg", LogisticRegression()),      # 3. logistic regression on the expanded features
    ]
    return Pipeline(steps)


def PolynomialLogisticRegressionTest(degree=2):
    """Fit the polynomial logistic-regression pipeline and plot its boundary.

    Args:
        degree: polynomial degree forwarded to ``PolynomialLogisticRegression``.

    Reads the notebook globals ``X`` and ``y``; shows the figure and returns
    nothing.
    """
    # 1. Build and train the pipeline.
    pipeline = PolynomialLogisticRegression(degree=degree)
    pipeline.fit(X, y)

    # 2. Shade the predicted regions and overlay both classes.
    plot_decision_boundary(pipeline, axis=[-4, 4, -4, 4])
    for label in (0, 1):
        members = X[y == label]
        plt.scatter(members[:, 0], members[:, 1])
    plt.show()

In [None]:
# degree=1 adds no higher-order terms, so the model stays linear in the two inputs.
PolynomialLogisticRegressionTest(degree=1)

In [None]:
# degree=2 adds squared and cross terms, matching the quadratic rule used to label the data.
PolynomialLogisticRegressionTest(degree=2)

In [None]:
PolynomialLogisticRegressionTest(degree=20)  # polynomial regression: the larger the degree, the more complex the model and the more easily it overfits

## 模型正则化，防止过拟合

In [None]:
# Reproducible synthetic data: 200 points drawn from a 2-D standard normal.
np.random.seed(666)
X = np.random.normal(0, 1, size=(200, 2))
# Label 1 where x0^2 + x1 < 1.5 (x1 is NOT squared here), otherwise 0.
y = np.array(X[:, 0]**2 + X[:, 1] < 1.5, dtype='int')

# Add label noise: force 20 randomly drawn positions to class 1.
# Indices are drawn independently, so the same position may be hit more than once.
for _ in range(20):
    y[np.random.randint(200)] = 1

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def PolynomialLogisticRegression(degree, C=1, penalty='l2'):
    """Build a regularised polynomial logistic-regression pipeline (scikit-learn).

    Args:
        degree: polynomial degree handed to ``PolynomialFeatures``.
        C: inverse regularisation strength of ``LogisticRegression``
            (smaller values regularise more strongly).
        penalty: regularisation type, e.g. ``'l1'`` or ``'l2'``.

    Returns:
        An unfitted ``Pipeline``: polynomial expansion -> standardisation ->
        logistic regression.
    """
    log_reg_kwargs = {"C": C, "penalty": penalty}
    if penalty == 'l1':
        # scikit-learn's default lbfgs solver rejects an L1 penalty, so pick a
        # solver that supports it. Other penalties keep the library default.
        log_reg_kwargs["solver"] = 'liblinear'

    return Pipeline([
        ("ploy", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("log_reg", LogisticRegression(**log_reg_kwargs))
    ])


def PolynomialLogisticRegressionTest(degree, C=1, penalty='l2'):
    """Fit the regularised polynomial pipeline and plot its decision boundary.

    Args:
        degree: polynomial degree forwarded to ``PolynomialLogisticRegression``.
        C: forwarded unchanged to ``PolynomialLogisticRegression``.
        penalty: forwarded unchanged to ``PolynomialLogisticRegression``.

    Reads the notebook globals ``X`` and ``y``; shows the figure and returns
    nothing.
    """
    # 1. Build and train the pipeline.
    pipeline = PolynomialLogisticRegression(degree=degree, C=C, penalty=penalty)
    pipeline.fit(X, y)

    # 2. Shade the predicted regions and overlay both classes.
    plot_decision_boundary(pipeline, axis=[-4, 4, -4, 4])
    for label in (0, 1):
        members = X[y == label]
        plt.scatter(members[:, 0], members[:, 1])
    plt.show()

In [None]:
# Linear features only, default C=1 and L2 penalty.
PolynomialLogisticRegressionTest(degree=1)

In [None]:
# Quadratic features, default C=1 and L2 penalty.
PolynomialLogisticRegressionTest(degree=2)

In [None]:
# Degree-20 features with the default C=1: a deliberately over-flexible model.
PolynomialLogisticRegressionTest(degree=20)

In [None]:
# Same degree-20 model with C=0.1; in scikit-learn a smaller C means stronger regularisation.
PolynomialLogisticRegressionTest(degree=20, C=0.1)

# 逻辑回归解决多分类问题

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 1. Load the iris data set.
iris = load_iris()

# 2.1. Preprocessing: three-class problem on the first two feature columns.
X = iris.data[:, :2]
y = iris.target

# 2.2. Split into training and test sets (scikit-learn's splitter this time).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)


def LogisticRegressionTest(multi_class, solver):
    """Train a multi-class logistic regression, print its accuracy and plot it.

    Args:
        multi_class: forwarded to ``LogisticRegression(multi_class=...)``.
        solver: forwarded to ``LogisticRegression(solver=...)``.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_test``,
    ``y_test``, ``X`` and ``y``; shows the figure and returns nothing.
    """
    # Multi-class logistic regression.
    clf = LogisticRegression(multi_class=multi_class, solver=solver)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    print('准确度:', accuracy)

    # Shade the predicted regions and overlay every sample by true class.
    plot_decision_boundary(clf, axis=[4, 8, 1.5, 4.5])
    for label, colour in ((0, 'red'), (1, 'blue'), (2, 'green')):
        members = X[y == label]
        plt.scatter(members[:, 0], members[:, 1], color=colour)
    plt.show()

## OvR and OvO
- **OvR（One vs Rest）:** 将 n 分类问题拆成 n 个二分类问题：每次取一个类别作为正类，其余类别合并为负类，共训练 n 个分类器，样本划分到预测概率最大的类别中
- **OvO（One vs One）:** 每次从 n 个类别中取出两个类别构成一个二分类问题，共训练 n*(n-1)/2 个分类器，样本划分到得票最多的类别中

In [None]:
# One-vs-rest strategy, fitted with the liblinear solver.
LogisticRegressionTest(multi_class='ovr', solver='liblinear')

In [None]:
# Multinomial (softmax) formulation fitted with newton-cg.
# NOTE(review): this is a single joint model, not a one-vs-one scheme.
LogisticRegressionTest(multi_class='multinomial', solver='newton-cg')

## 扩展所有的二分类器，进行多分类任务

In [None]:
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier


def MultiClassifier(estimator, multi_class='ovr'):
    """Wrap a binary estimator for multi-class use, fit it and report accuracy.

    Args:
        estimator: base estimator handed to the wrapper.
        multi_class: ``'ovr'`` selects ``OneVsRestClassifier``; any other value
            selects ``OneVsOneClassifier``.

    Returns:
        The fitted wrapper. It is trained on the notebook globals
        ``X_train`` / ``y_train`` and its accuracy on ``X_test`` / ``y_test``
        is printed.
    """
    wrapper_cls = OneVsRestClassifier if multi_class == 'ovr' else OneVsOneClassifier
    wrapped = wrapper_cls(estimator)
    wrapped.fit(X_train, y_train)
    print('准确度:', wrapped.score(X_test, y_test))
    return wrapped
        
        
def MultiClassLogisticRegression(multi_class):
    """Run ``MultiClassifier`` with a default scikit-learn logistic regression.

    Args:
        multi_class: strategy name forwarded to ``MultiClassifier``.

    Returns:
        Whatever ``MultiClassifier`` returns (the fitted wrapper).
    """
    # Imported locally so the scikit-learn class is used regardless of what the
    # notebook-level name ``LogisticRegression`` currently refers to.
    from sklearn.linear_model import LogisticRegression

    base_estimator = LogisticRegression()
    return MultiClassifier(base_estimator, multi_class)


# Each call prints the test accuracy and returns the fitted wrapper:
# first one-vs-rest, then (any value other than 'ovr') one-vs-one.
MultiClassLogisticRegression('ovr')
MultiClassLogisticRegression('ovo')

# 分类问题的评价
## 指标
- **精准率** = 某个分类预测正确的总数 / 某个分类预测值的总数
- **召回率** = 某个分类预测正确的总数 / 某个分类真实值的总数
- **F1 分数**（精准率与召回率的调和平均值） = 2 * 精准率 * 召回率 / (精准率 + 召回率)

In [None]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
digits = load_digits()
X = digits.data
# Copy so that relabelling below does not modify the array held by `digits`.
y = digits.target.copy()

# Build a highly skewed binary problem: digit 9 -> 1, every other digit -> 0.
# Both masks come from the untouched digits.target, so the two assignments
# cannot interfere with each other.
y[digits.target == 9] = 1
y[digits.target != 9] = 0

# No test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

log_reg_2 = LogisticRegression()
log_reg_2.fit(X_train, y_train)

In [None]:
def test_threshold_0(log_reg):
    """Print decision scores next to predicted labels for test samples 50-59.

    Args:
        log_reg: fitted classifier exposing ``decision_function`` and
            ``predict``.

    Reads the notebook global ``X_test``. Each printed row is
    ``[decision score, predicted label]``, which makes it visible that the
    predicted label switches where the score crosses 0.
    """
    window = slice(50, 60)
    scores = log_reg.decision_function(X_test)[window]   # logistic-regression scores
    labels = log_reg.predict(X_test)[window]             # labels using 0 as the boundary
    print(np.column_stack((scores, labels)))


def predict(log_reg, X_test, n=0):
    """Binary prediction with an adjustable decision threshold.

    Args:
        log_reg: fitted classifier exposing ``decision_function``.
        X_test: samples to classify.
        n: threshold on the decision score; scores ``>= n`` are labelled 1.

    Returns:
        Integer array of 0/1 labels, one per decision score.
    """
    scores = log_reg.decision_function(X_test)
    return (scores >= n).astype('int')


def evaluate(y_true, y_predict):
    """Print the confusion matrix, precision, recall and F1 of a binary prediction.

    Args:
        y_true: ground-truth labels.
        y_predict: predicted labels, aligned with ``y_true``.

    Every metric is computed from the ``y_true`` argument, so the function
    gives consistent results for any label set, not only the notebook's test
    split. Nothing is returned; the metrics are printed.
    """
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import f1_score

    # Use the argument here, not the notebook-level y_test, so the matrix
    # always describes the same labels as the three scores below.
    cm = confusion_matrix(y_true, y_predict)
    ps = precision_score(y_true, y_predict)
    rs = recall_score(y_true, y_predict)
    fs = f1_score(y_true, y_predict)
    print("混淆矩阵：\n", cm)
    print("精准率：", ps)
    print("召回率：", rs)
    print("调和平均值：", fs)


if __name__ == '__main__':
    # Check which decision threshold the logistic regression uses.
    test_threshold_0(log_reg_2)
    # No element differs between the threshold-0 predictions and the model's
    # own predict() output -> the mismatch count is 0 and the message prints.
    if not np.sum(predict(log_reg_2, X_test) != log_reg_2.predict(X_test)):
        print("--> log_reg.predict默认以0作为分类阈值。")

In [None]:
# Threshold -5: every sample scoring >= -5 is labelled positive, so more samples become positive than at 0.
evaluate(y_test, predict(log_reg_2, X_test, -5))

In [None]:
# Threshold 0: the baseline that matches the model's own predict().
evaluate(y_test, predict(log_reg_2, X_test, 0))

In [None]:
# Threshold 5: only samples scoring >= 5 are labelled positive, so fewer samples become positive than at 0.
evaluate(y_test, predict(log_reg_2, X_test, 5))

## pr曲线

In [None]:
from sklearn.metrics import precision_score, recall_score
import matplotlib.pyplot as plt
import numpy as np


def plot_pr_curve(y_true, decision_scores):
    """Plot precision and recall against the threshold, and the PR curve itself.

    Args:
        y_true: ground-truth binary labels.
        decision_scores: decision-function score of every sample.

    Thresholds run from the smallest score up to (but excluding) the largest
    one in steps of 0.1. The left panel shows both metrics per threshold, the
    right panel shows recall against precision. Shows the figure and returns
    nothing.
    """
    assert len(np.unique(y_true)) <= 2, "只能绘制二分类PR曲线"

    thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)

    precisions, recalls = [], []
    for cutoff in thresholds:
        hard_labels = (decision_scores >= cutoff).astype(int)
        precisions.append(precision_score(y_true, hard_labels))
        recalls.append(recall_score(y_true, hard_labels))

    plt.figure(figsize=(9, 3), dpi=100)

    # Left panel: both metrics as functions of the threshold.
    plt.subplot(1, 2, 1)
    plt.plot(thresholds, precisions, label='精准率')
    plt.plot(thresholds, recalls, label='召回率')
    plt.xlabel("分类阈值")
    plt.ylabel("指标")
    plt.legend()

    # Right panel: the precision-recall curve.
    plt.subplot(1, 2, 2)
    plt.plot(precisions, recalls)
    plt.xlabel("精准率")
    plt.ylabel("召回率")

    plt.show()

In [None]:
# PR curve of the skewed binary classifier, from its test-set decision scores.
plot_pr_curve(y_test, log_reg_2.decision_function(X_test))

## roc曲线

In [None]:
import matplotlib.pyplot as plt
import numpy as np


def plot_roc_curve(y_true, decision_scores):
    """Plot FPR and TPR against the threshold, and the ROC curve itself.

    Args:
        y_true: ground-truth binary labels (0 / 1).
        decision_scores: decision-function score of every sample.

    Thresholds run from the smallest score up to (but excluding) the largest
    one in steps of 0.1. A rate whose denominator is zero (no actual positives
    for TPR, no actual negatives for FPR) is reported as ``0.``. Shows the
    figure and returns nothing.

    Raises:
        AssertionError: if ``y_true`` has more than two distinct values or its
            length differs from ``decision_scores``.
    """
    y_true = np.asarray(y_true)
    decision_scores = np.asarray(decision_scores)
    assert len(np.unique(y_true)) <= 2, "只能绘制二分类ROC曲线"
    assert len(y_true) == len(decision_scores), \
        "the size of y_true must equal to the size of y_predict"

    def _rate(hits, misses):
        """Return hits / (hits + misses), or 0. when the denominator is zero."""
        # numpy integer division by zero yields nan with a warning instead of
        # raising, so the zero case has to be tested explicitly.
        total = hits + misses
        return hits / total if total else 0.

    actual_pos = y_true == 1
    actual_neg = y_true == 0

    fprs = []
    tprs = []
    thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)
    for threshold in thresholds:
        predicted_pos = decision_scores >= threshold
        predicted_neg = ~predicted_pos

        tp = np.sum(actual_pos & predicted_pos)
        fn = np.sum(actual_pos & predicted_neg)
        fp = np.sum(actual_neg & predicted_pos)
        tn = np.sum(actual_neg & predicted_neg)

        fprs.append(_rate(fp, tn))
        tprs.append(_rate(tp, fn))

    plt.figure(figsize=(9, 4), dpi=100)

    # Left panel: both rates as functions of the threshold.
    plt.subplot(1, 2, 1)
    plt.plot(thresholds, fprs, label='fprs')
    plt.plot(thresholds, tprs, label='tprs')
    plt.xlabel("分类阈值")
    plt.ylabel("指标")
    plt.legend()

    # Right panel: the ROC curve.
    plt.subplot(1, 2, 2)
    plt.plot(fprs, tprs)
    plt.xlabel("fprs")
    plt.ylabel("tprs")

    plt.show()

In [None]:
# ROC curve of the skewed binary classifier, from its test-set decision scores.
plot_roc_curve(y_test, log_reg_2.decision_function(X_test))

**计算ROC曲线包围的面积auc: area under curve**

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve, computed from the raw decision scores rather than hard 0/1 predictions.
print(roc_auc_score(y_test, log_reg_2.decision_function(X_test)))

## 绘制混淆矩阵

In [None]:
import matplotlib.pyplot as plt
import numpy as np


def plot_confusion_matrix(cfm):
    """Show a confusion matrix and its normalised error matrix side by side.

    Args:
        cfm: square confusion matrix, e.g. from ``sklearn.metrics.confusion_matrix``.

    Left panel: the raw counts. Right panel: each entry divided by the total
    of its *column* (``axis=0`` sums), with the diagonal zeroed so that only
    the misclassifications remain visible. Shows the figure and returns
    nothing.
    """
    plt.figure(figsize=(9, 4), dpi=100)

    # Raw counts.
    counts_ax = plt.subplot(1, 2, 1)
    counts_ax.matshow(cfm, cmap=plt.cm.gray)
    plt.title("正确预测")

    # Column-normalised errors.
    errors_ax = plt.subplot(1, 2, 2)
    col_totals = np.sum(cfm, axis=0)
    error_share = cfm / col_totals       # share of each column total
    np.fill_diagonal(error_share, 0)     # drop the correct predictions
    errors_ax.matshow(error_share, cmap=plt.cm.gray)
    plt.title("错误预测")

    plt.show()

In [None]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Ten-class problem: the digits data set with its original labels.
digits = load_digits()
X = digits.data
y = digits.target

# NOTE(review): test_size=0.8 keeps only 20% of the samples for training and
# tests on the other 80% - confirm this is intentional and not meant to be 0.2.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=666)

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Confusion matrix of the test-set predictions, printed and then plotted.
cfm = confusion_matrix(y_test, log_reg.predict(X_test))
print(cfm)
plot_confusion_matrix(cfm)