In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer #列转换，特征转换
from sklearn.linear_model import LogisticRegression

In [3]:
# Step 1: load the heart-disease dataset from CSV.
heart_disease_data = pd.read_csv("../data/heart_disease.csv")

# Data cleaning: discard every row that contains a missing value.
heart_disease_data = heart_disease_data.dropna()

# Step 2: split the dataset.
# The label column becomes y; every remaining column is kept as a feature in X.
y = heart_disease_data["是否患有心脏病"]
X = heart_disease_data.drop(columns=["是否患有心脏病"])

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 3: feature engineering.
# The feature columns are grouped by the preprocessing each group receives.

# Numerical columns: standardized with StandardScaler.
numerical_features = [
    "年龄",
    "静息血压",
    "胆固醇",
    "最大心率",
    "运动后的ST下降",
    "主血管数量",
]

# Categorical columns: one-hot encoded.
categorical_features = [
    "胸痛类型",
    "静息心电图结果",
    "峰值ST段的斜率",
    "地中海贫血",
]

# Binary columns: passed through unchanged.
binary_features = [
    "性别",
    "空腹血糖",
    "运动性心绞痛",
]

# Build the column transformer; each entry is (name, transformer, columns).
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        # drop="first" removes one dummy column per categorical feature;
        # its main purpose is to avoid multicollinearity among the dummies.
        ("cat", OneHotEncoder(drop="first"), categorical_features),
        ("bin", "passthrough", binary_features),
    ],
)

# fit_transform learns the preprocessing parameters and applies them in one
# step, so it is used on the training set only.
# transform reuses the parameters already learned from the training set, so
# the test set is processed identically without leaking information from it.
x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)

# Step 4: create the model and train it on the preprocessed training set.
model = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    class_weight="balanced",
)
model.fit(X=x_train, y=y_train)

# Step 5: evaluate the trained model on the held-out test set and print
# the resulting score.
print(model.score(X=x_test, y=y_test))


0.8084415584415584
