In [13]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler,LabelEncoder
import matplotlib.pyplot as plt


In [14]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# 获取数据集
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# 打印 variables 的内容
print(breast_cancer_wisconsin_diagnostic.variables)


                  name     role         type demographic description units  \
0                   ID       ID  Categorical        None        None  None   
1            Diagnosis   Target  Categorical        None        None  None   
2              radius1  Feature   Continuous        None        None  None   
3             texture1  Feature   Continuous        None        None  None   
4           perimeter1  Feature   Continuous        None        None  None   
5                area1  Feature   Continuous        None        None  None   
6          smoothness1  Feature   Continuous        None        None  None   
7         compactness1  Feature   Continuous        None        None  None   
8           concavity1  Feature   Continuous        None        None  None   
9      concave_points1  Feature   Continuous        None        None  None   
10           symmetry1  Feature   Continuous        None        None  None   
11  fractal_dimension1  Feature   Continuous        None        

In [15]:
feature_names = breast_cancer_wisconsin_diagnostic.variables['name']

# 将数据转化为DataFrame
df = pd.DataFrame(data=X, columns=feature_names)
df['target'] = y

# 查看前几行数据
print(df.head())

name  ID  Diagnosis  radius1  texture1  perimeter1   area1  smoothness1  \
0    NaN        NaN    17.99     10.38      122.80  1001.0      0.11840   
1    NaN        NaN    20.57     17.77      132.90  1326.0      0.08474   
2    NaN        NaN    19.69     21.25      130.00  1203.0      0.10960   
3    NaN        NaN    11.42     20.38       77.58   386.1      0.14250   
4    NaN        NaN    20.29     14.34      135.10  1297.0      0.10030   

name  compactness1  concavity1  concave_points1  ...  texture3  perimeter3  \
0          0.27760      0.3001          0.14710  ...     17.33      184.60   
1          0.07864      0.0869          0.07017  ...     23.41      158.80   
2          0.15990      0.1974          0.12790  ...     25.53      152.50   
3          0.28390      0.2414          0.10520  ...     26.50       98.87   
4          0.13280      0.1980          0.10430  ...     16.67      152.20   

name   area3  smoothness3  compactness3  concavity3  concave_points3  \
0     20

In [16]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(breast_cancer_wisconsin_diagnostic.data.targets)  # 将数字标签转化为字符标签

label_encoder.classes_ = np.array(['B', 'M'])

  y = column_or_1d(y, warn=True)


In [17]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [49]:

clf_orig = DecisionTreeClassifier(max_depth=2, min_samples_split=5, min_samples_leaf=2, criterion='gini', random_state=42)
clf_orig.fit(X_train, y_train)

# 原始数据的评估指标
y_pred_orig = clf_orig.predict(X_test)
f1_orig = f1_score(y_test, y_pred_orig)
precision_orig = precision_score(y_test, y_pred_orig)
recall_orig = recall_score(y_test, y_pred_orig)

# 原始数据的混淆矩阵
cm_orig = confusion_matrix(y_test, y_pred_orig)
print("Confusion Matrix:\n", cm_orig)

TN_orig, FP_orig, FN_orig, TP_orig = cm_orig.ravel()
FPR_orig = FP_orig / (FP_orig + TN_orig)
TPR_orig = TP_orig / (TP_orig + FN_orig)

Confusion Matrix:
 [[102   6]
 [  6  57]]


In [50]:
print("Original_Data - F1 Score:", f1_orig)
print("Original_Data - Precision:", precision_orig)
print("Original_Data - Recall:", recall_orig)
print("Original_Data - FPR:", FPR_orig)
print("Original_Data - TPR:", TPR_orig)

Original_Data - F1 Score: 0.9047619047619048
Original_Data - Precision: 0.9047619047619048
Original_Data - Recall: 0.9047619047619048
Original_Data - FPR: 0.05555555555555555
Original_Data - TPR: 0.9047619047619048


In [38]:
#PCA reduction(the first principal component)
pca = PCA(n_components=1)
X_train_pca1 = pca.fit_transform(X_train)
X_test_pca1 = pca.transform(X_test)

In [39]:
#trianing based on the Tree model(the first principal component)
clf_pca1 = DecisionTreeClassifier(max_depth=2, min_samples_split=5, min_samples_leaf=2, criterion='gini', random_state=42)
clf_pca1.fit(X_train_pca1, y_train)

In [40]:
# model performance(the first principal component)
y_pred_pca1 = clf_pca1.predict(X_test_pca1)
f1_pca1 = f1_score(y_test, y_pred_pca1, average='binary', pos_label=1)  
precision_pca1 = precision_score(y_test, y_pred_pca1, average='binary', pos_label=1)
recall_pca1 = recall_score(y_test, y_pred_pca1, average='binary', pos_label=1)

In [51]:
#  Confusion matrix(the first principal component)
cm_pca1 = confusion_matrix(y_test, y_pred_pca1, labels=[1, 0]) 
TN_pca1, FP_pca1, FN_pca1, TP_pca1 = cm_pca1.ravel()
FPR_pca1 = FP_pca1 / (FP_pca1 + TN_pca1)
TPR_pca1 = TP_pca1 / (TP_pca1 + FN_pca1)

In [52]:
print("PCA1_Component - F1 Score:", f1_pca1)
print("PCA1_Component - Precision:", precision_pca1)
print("PCA1_Component - Recall:", recall_pca1)
print("PCA1_Component - FPR:", FPR_pca1)
print("PCA1_Component - TPR:", TPR_pca1)

PCA1_Component - F1 Score: 0.90625
PCA1_Component - Precision: 0.8923076923076924
PCA1_Component - Recall: 0.9206349206349206
PCA1_Component - FPR: 0.07936507936507936
PCA1_Component - TPR: 0.9351851851851852


In [43]:
#PCA reduction(the second principal component)
pca2 = PCA(n_components=2)
X_train_pca2 = pca2.fit_transform(X_train)
X_test_pca2 = pca2.transform(X_test)

In [44]:
#trianing based on the Tree model(the second principal component)
clf_pca2 = DecisionTreeClassifier(max_depth=2, min_samples_split=5, min_samples_leaf=2, criterion='gini', random_state=42)
clf_pca2.fit(X_train_pca2, y_train)

In [45]:
# model performance(the second principal component)
y_pred_pca2 = clf_pca2.predict(X_test_pca2)

# 计算评估指标
f1_pca2 = f1_score(y_test, y_pred_pca2, average='binary', pos_label=1) 
precision_pca2 = precision_score(y_test, y_pred_pca2, average='binary', pos_label=1)
recall_pca2 = recall_score(y_test, y_pred_pca2, average='binary', pos_label=1)


In [53]:
#  Confusion matrix(the second principal component)
cm_pca2 = confusion_matrix(y_test, y_pred_pca2, labels=[1, 0]) 
TN_pca2, FP_pca2, FN_pca2, TP_pca2 = cm_pca2.ravel()
FPR_pca2 = FP_pca2 / (FP_pca2 + TN_pca2)
TPR_pca2 = TP_pca2 / (TP_pca2 + FN_pca2)

In [54]:
print("PCA2_Components - F1 Score:", f1_pca2)
print("PCA2_Components - Precision:", precision_pca2)
print("PCA2_Components - Recall:", recall_pca2)
print("PCA2_Components - FPR:", FPR_pca2)
print("PCA2_Components - TPR:", TPR_pca2)

PCA2_Components - F1 Score: 0.8925619834710744
PCA2_Components - Precision: 0.9310344827586207
PCA2_Components - Recall: 0.8571428571428571
PCA2_Components - FPR: 0.14285714285714285
PCA2_Components - TPR: 0.9629629629629629


In [55]:
result_df = pd.DataFrame({
    'Model': ['Original Data', 'PCA 1 Component', 'PCA 2 Components'],
    'F1 Score': [f1_orig, f1_pca1, f1_pca2],
    'Precision': [precision_orig, precision_pca1, precision_pca2],
    'Recall': [recall_orig, recall_pca1, recall_pca2],
    'FPR': [FPR_orig, FPR_pca1, FPR_pca2],
    'TPR': [TPR_orig, TPR_pca1, TPR_pca2]
})

result_df

Unnamed: 0,Model,F1 Score,Precision,Recall,FPR,TPR
0,Original Data,0.904762,0.904762,0.904762,0.055556,0.904762
1,PCA 1 Component,0.90625,0.892308,0.920635,0.079365,0.935185
2,PCA 2 Components,0.892562,0.931034,0.857143,0.142857,0.962963
