In [3]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2

# 加载数据，指定列名
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv('pima_data.csv', names=names)

# 分离输入特征X和目标变量y
X = data.iloc[:, 0:8]
y = data['class']

# 创建特征选择器，选择与目标变量最相关的4个特征
selector = SelectKBest(score_func=chi2, k=4)

# 拟合并转换数据，得到筛选后的特征
X_new = selector.fit_transform(X, y)

# 获取被选中特征的列名
selected_cols = X.columns[selector.get_support()]

print("SelectKBest选中的特征：", list(selected_cols))
print(pd.DataFrame(X_new, columns=selected_cols))


SelectKBest选中的特征： ['plas', 'test', 'mass', 'age']
      plas   test  mass   age
0    148.0    0.0  33.6  50.0
1     85.0    0.0  26.6  31.0
2    183.0    0.0  23.3  32.0
3     89.0   94.0  28.1  21.0
4    137.0  168.0  43.1  33.0
..     ...    ...   ...   ...
763  101.0  180.0  32.9  63.0
764  122.0    0.0  36.8  27.0
765  121.0  112.0  26.2  30.0
766  126.0    0.0  30.1  47.0
767   93.0    0.0  30.4  23.0

[768 rows x 4 columns]


In [4]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# 加载数据
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv('pima_data.csv', names=names)
X = data.iloc[:, 0:8]
y = data['class']

# 创建逻辑回归模型作为基模型
model = LogisticRegression(max_iter=1000)

# 创建RFE选择器，递归消除，最终保留3个特征
rfe = RFE(model, n_features_to_select=3)

# 拟合RFE选择器
fit = rfe.fit(X, y)

# 获取被选中特征的列名
selected_cols = X.columns[fit.support_]

print("RFE选中的特征：", list(selected_cols))
print(pd.DataFrame(fit.transform(X), columns=selected_cols))

RFE选中的特征： ['preg', 'mass', 'pedi']
     preg  mass   pedi
0     6.0  33.6  0.627
1     1.0  26.6  0.351
2     8.0  23.3  0.672
3     1.0  28.1  0.167
4     0.0  43.1  2.288
..    ...   ...    ...
763  10.0  32.9  0.171
764   2.0  36.8  0.340
765   5.0  26.2  0.245
766   1.0  30.1  0.349
767   1.0  30.4  0.315

[768 rows x 3 columns]


In [5]:
from sklearn.decomposition import PCA
import pandas as pd

# 假设数据已加载为DataFrame，X为特征矩阵
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv('pima_data.csv', names=names)
X = data.iloc[:, 0:8]

# 创建PCA对象，保留3个主成分
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

# 输出每个主成分解释的方差比例
print("PCA主成分解释方差比：", pca.explained_variance_ratio_)

# 输出降维后的数据
print(pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3']))

PCA主成分解释方差比： [0.88854663 0.06159078 0.02579012]
           PC1        PC2        PC3
0   -75.714655  35.950783   7.260789
1   -82.358268 -28.908213   5.496671
2   -74.630643  67.906496 -19.461808
3    11.077423 -34.898486   0.053018
4    89.743788   2.746937 -25.212859
..         ...        ...        ...
763  99.237881 -25.080927  19.534825
764 -78.641239   7.688010   4.137227
765  32.113198  -3.376665   1.587864
766 -80.214494  14.186020 -12.351264
767 -81.308150 -21.621496   8.152768

[768 rows x 3 columns]


In [6]:
from sklearn.ensemble import ExtraTreesClassifier

# 加载数据
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv('pima_data.csv', names=names)
X = data.iloc[:, 0:8]
y = data['class']

# 创建极端随机树分类器
model = ExtraTreesClassifier()

# 拟合模型
model.fit(X, y)

# 获取每个特征的重要性分数
importances = model.feature_importances_

# 打印每个特征的重要性
for name, score in zip(X.columns, importances):
    print(f"{name}: {score:.4f}")

preg: 0.1144
plas: 0.2277
pres: 0.0978
skin: 0.0807
test: 0.0755
mass: 0.1418
pedi: 0.1203
age: 0.1417
