In [None]:
# -*- coding:utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

import knn

# 1.文件读取

In [None]:
df = pd.read_csv('./OnlineNewsPopularity/OnlineNewsPopularity.csv', header=0) # header参数指定标题的行，默认为0.如果没有标题，则使用None

In [None]:
print(df.head())

In [None]:
df.columns

# 2.转化为二分类问题

In [None]:
for i in range(len(df[' shares'])):
    df.loc[i, ' shares'] = 0 if df[' shares'][i] < 1400 else 1

# 3.数据预处理

In [None]:
# 去掉前两列与预测无关的数据
df = df.drop("url", axis=1)
df = df.drop(" timedelta", axis=1)

In [None]:
print(df.head())

# 4.划分训练集与测试集

In [None]:
# 分配训练集和测试集（由于原文件已经按照时间远近进行了排序，所以需要先划分，后打乱）
rate = 0.7 # 7:3
midi = round(len(df) * 0.7)
df1 = df.iloc[:midi, :]
df2 = df.iloc[midi:, :]

# 打乱数据
df1 = df1.sample(len(df1), random_state=0)
df2 = df2.sample(len(df2), random_state=0)

train_X = df1.iloc[:, :-1]
train_y = df1.iloc[:,  -1]
test_X  = df2.iloc[:, :-1]
test_y  = df2.iloc[:,  -1]

# 5. 数据归一化

In [None]:
from sklearn.preprocessing import StandardScaler

### 数据预处理
## 分别对训练集、验证集和测试集进行数据标准化
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X.squeeze())     # 训练集
test_X = scaler.fit_transform(test_X.squeeze())       # 测试集

# 6.训练与测试

In [None]:
# 模型初始化
knn_k = 6
knn_model = knn.knn(knn_k)
# 模型训练
knn_model.fit(train_X, train_y)
# 模型预测
prey = knn_model.predict(test_X)

In [None]:
# 模型评估
TP = 0 # 正例被预测为正例
FN = 0 # 正例被预测为反例
FP = 0 # 反例被预测为正例
TN = 0 # 反例被预测为反例
temp_test_y = test_y.values # 从series转换为 n array
# 统计
for i in range(len(prey)):
    if prey[i] == temp_test_y[i]:
        if prey[i] == 1:
            TP = TP + 1
        else:
            TN = TN + 1
    else:
        if prey[i] == 1:
            FP = FP + 1
        else:
            FN = FN + 1
accuracy = (TP + TN) / (TP + FP + TN + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
F1_score = (2 * precision * recall) / (precision + recall)
AUC = roc_auc_score(temp_test_y, prey)

# 记录并打印
result = [
    ['KNN中k值', knn_k],
    ['准确率accuracy', accuracy],
    ['精确率precision', precision],
    ['召回率recall', recall],
    ['F1-score', F1_score],
    ['AUC', AUC]
]
result_out = pd.DataFrame(result)
print(result_out)

# 7.K值的取得

In [None]:
F1_score_group = []
K_range = range(1, 20, 1)
temp_test_y = test_y.values # 从series转换为 n array

for kk in K_range:
    knn_k = kk
    knn_model = knn.knn(knn_k)
    knn_model.fit(train_X, train_y)
    prey = knn_model.predict(test_X)

    TP = 0
    FN = 0
    FP = 0
    TN = 0

    for i in range(len(prey)):
        if prey[i] == temp_test_y[i]:
            if prey[i] == 1:
                TP = TP + 1
            else:
                TN = TN + 1
        else:
            if prey[i] == 1:
                FP = FP + 1
            else:
                FN = FN + 1
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1_score = (2 * precision * recall) / (precision + recall)

    print("当前进度：%f" %(kk / len(K_range)))
    F1_score_group.append(F1_score)

plt.plot(K_range, F1_score_group)

# 8.K折交叉验证

In [28]:
import numpy as np

cv = 5 # cv表示把数据分为几份
single_length =  round(len(df) / cv)
accuracy_group = []
precision_group = []
recall_group = []
F1_score_group = []
AUC_group = []

knn_k = 6

for j in range(0, cv, 1):
    df1 = pd.concat([df.iloc[0:(j * single_length), :], df.iloc[((j + 1) * single_length):, :]])
    df2 = df.iloc[(j * single_length) : ((j + 1) * single_length), :]

    # 打乱数据
    df1 = df1.sample(len(df1), random_state=0)
    df2 = df2.sample(len(df2), random_state=0)

    train_X = df1.iloc[:, :-1]
    train_y = df1.iloc[:,  -1]
    test_X  = df2.iloc[:, :-1]
    test_y  = df2.iloc[:,  -1]

    # 模型初始化
    knn_model = knn.knn(knn_k)
    # 模型训练
    knn_model.fit(train_X, train_y)
    # 模型预测
    prey = knn_model.predict(test_X)
    temp_test_y = test_y.values # 从series转换为 n array

    # 模型评估
    TP = 0
    FN = 0
    FP = 0
    TN = 0

    # 统计
    for i in range(len(prey)):
        if prey[i] == temp_test_y[i]:
            if prey[i] == 1:
                TP = TP + 1
            else:
                TN = TN + 1
        else:
            if prey[i] == 1:
                FP = FP + 1
            else:
                FN = FN + 1
    accuracy = (TP + TN) / (TP + FP + TN + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1_score = (2 * precision * recall) / (precision + recall)
    AUC = roc_auc_score(temp_test_y, prey)

    accuracy_group.append(accuracy)
    precision_group.append(precision)
    recall_group.append(recall)
    F1_score_group.append(F1_score)
    AUC_group.append(AUC)

# 记录并打印
result = [
    ['KNN中k值', knn_k],
    ['准确率accuracy', np.mean(accuracy_group)],
    ['精确率precision', np.mean(precision_group)],
    ['召回率recall', np.mean(recall_group)],
    ['F1-score', np.mean(F1_score_group)],
    ['AUC', np.mean(AUC_group)]
]
result_out = pd.DataFrame(result)
print(result_out)

              0         1
0        KNN中k值  6.000000
1   准确率accuracy  0.548154
2  精确率precision  0.602660
3     召回率recall  0.453566
4      F1-score  0.516018
5           AUC  0.555437
