In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV #网格搜索，交叉验证
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer #列转换，特征转换
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [20]:
# 1. Load the data
# Read the heart-disease CSV and discard every row that contains a missing value.
heart_disease_data = pd.read_csv("../data/heart_disease.csv").dropna()

# 2. Split the dataset
# "是否患有心脏病" (has heart disease or not) is the label column;
# every remaining column is used as a feature.
y = heart_disease_data["是否患有心脏病"]
X = heart_disease_data.drop(columns="是否患有心脏病")

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 3. Feature engineering
# Column groups; each group gets its own preprocessing step in the transformer below.
binary_features = ["性别", "空腹血糖", "运动性心绞痛"]  # binary columns, passed through unchanged
categorical_features = ["胸痛类型", "静息心电图结果", "峰值ST段的斜率", "地中海贫血"]  # categorical columns, one-hot encoded
numerical_features = ["年龄", "静息血压", "胆固醇", "最大心率", "运动后的ST下降", "主血管数量"]  # numeric columns, standardized

# Column transformer: every entry is (name, operation, list of columns).
# The entry order fixes the column order of the transformed output.
transformer = ColumnTransformer(
    [
        ("num", StandardScaler(), numerical_features),
        # drop="first" leaves out the first category of each feature so the
        # one-hot columns are not perfectly collinear.
        ("cat", OneHotEncoder(drop="first"), categorical_features),
        ("bin", "passthrough", binary_features),
    ]
)

# fit_transform learns the preprocessing parameters and applies them in one go;
# it is used on the training split only.
# transform re-uses the parameters already learned from the training split, so
# the test split is converted the same way without leaking information into the fit.
x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)

# 4. Build and train the model
# Grid-search n_neighbors over 1..10 with 10-fold cross-validation
# (a fixed-k alternative would be KNeighborsClassifier(n_neighbors=3)).
# NOTE(review): x_train has already been passed through transformer.fit_transform
# on the whole training split, so each CV validation fold contributed to the
# scaler/encoder statistics. Wrapping transformer + KNN in a Pipeline and
# searching over that would avoid this -- confirm whether it matters here.
knn = GridSearchCV(
    KNeighborsClassifier(),
    {"n_neighbors": np.arange(1, 11)},
    cv=10,
)
knn.fit(x_train, y_train)

# Print the per-candidate CV results table, the best estimator and its CV score,
# one per line.
print(pd.DataFrame(knn.cv_results_), knn.best_estimator_, knn.best_score_, sep="\n")

# 5. Test / model evaluation (currently disabled)
#print(knn.score(x_test, y_test))

# 6. Save the model (currently disabled)
# NOTE(review): this would persist only the search object; the fitted
# `transformer` is not included, so it would have to be saved separately to
# preprocess new data -- verify before enabling.
#joblib.dump(knn, "../data/knn.joblib")

# 7. Load the model (currently disabled)
#knn_load = joblib.load("../data/knn.joblib")

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.000868      0.000051         0.001681        0.000123   
1       0.000909      0.000147         0.001740        0.000208   
2       0.000692      0.000121         0.001508        0.000151   
3       0.000663      0.000068         0.001526        0.000186   
4       0.000696      0.000143         0.002373        0.002535   
5       0.000615      0.000020         0.001276        0.000076   
6       0.000656      0.000123         0.001337        0.000124   
7       0.000665      0.000042         0.001344        0.000130   
8       0.000621      0.000013         0.001295        0.000044   
9       0.000652      0.000072         0.001430        0.000085   

   param_n_neighbors               params  split0_test_score  \
0                  1   {'n_neighbors': 1}           0.986111   
1                  2   {'n_neighbors': 2}           0.944444   
2                  3   {'n_neighbors': 3}           0.930556   
3     