In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV #网格搜索，交叉验证
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer #列转换，特征转换
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [2]:
# 1. Load the data
heart_disease_data = pd.read_csv("../data/heart_disease.csv")

# Data cleaning: drop every row that contains at least one missing value.
# Re-assigning the result (instead of inplace=True) avoids in-place mutation
# of the frame and keeps the step easy to chain or re-run.
heart_disease_data = heart_disease_data.dropna()

# 2. Split the dataset
# The label column is separated from the feature columns.
TARGET_COLUMN = "是否患有心脏病"
X = heart_disease_data.drop(columns=TARGET_COLUMN)
y = heart_disease_data[TARGET_COLUMN]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 3. Feature engineering
# Numerical features (standardised below)
numerical_features = ["年龄", "静息血压", "胆固醇", "最大心率", "运动后的ST下降", "主血管数量"]

# Categorical features (one-hot encoded below)
categorical_features = ["胸痛类型", "静息心电图结果", "峰值ST段的斜率", "地中海贫血"]

# Binary features (passed through unchanged)
binary_features = ["性别", "空腹血糖", "运动性心绞痛"]

# Build the column transformer; each entry is (name, transformer, column list).
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        # drop="first" removes the first category of every encoded feature so
        # the resulting dummy columns are not perfectly collinear.
        ("cat", OneHotEncoder(drop="first"), categorical_features),
        ("bin", "passthrough", binary_features),
    ]
)

# fit_transform: learn the transformation parameters AND apply them in one go;
# this is done on the training set only.
# transform: re-use the parameters learned from the training set on the
# validation/test/new data so no information from those sets leaks in.
x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)

# 4. Create and train the model
# knn = KNeighborsClassifier(n_neighbors=3)
# Hyper-parameters explored by the grid search: k from 1 to 10 and both
# neighbour-weighting schemes.
knn_param_grid = {
    "n_neighbors": np.arange(1, 11),
    "weights": ["uniform", "distance"],
}
# cv -- cross validation (10 folds)
# NOTE(review): the transformer above was fit on the whole training split
# before this cross-validation, so every validation fold has already
# influenced the scaling/encoding parameters. Wrapping the transformer and the
# classifier in a single Pipeline passed to GridSearchCV would remove that
# leakage -- consider it if the CV score is used for model selection.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
    cv=10,
)

knn.fit(x_train, y_train)
print(pd.DataFrame(knn.cv_results_))
print(knn.best_estimator_)
print(knn.best_score_)

# 5. Test / model evaluation
#print(knn.score(x_test, y_test))

# 6. Save the model
#joblib.dump(knn, "../data/knn.joblib")

# 7. Load the model
#knn_load = joblib.load("../data/knn.joblib")

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.000350      0.000125         0.004434        0.010519   
1        0.000263      0.000022         0.000908        0.000117   
2        0.000239      0.000012         0.000659        0.000035   
3        0.000271      0.000017         0.000890        0.000040   
4        0.000249      0.000013         0.000777        0.000137   
5        0.000276      0.000010         0.001002        0.000136   
6        0.000271      0.000035         0.000789        0.000081   
7        0.000294      0.000027         0.001158        0.000489   
8        0.000284      0.000054         0.000997        0.000516   
9        0.000292      0.000013         0.001045        0.000110   
10       0.000259      0.000020         0.000898        0.000319   
11       0.000281      0.000012         0.001081        0.000179   
12       0.000259      0.000016         0.000874        0.000314   
13       0.000276      0.000013         0.000998