In [42]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.width',300)  
pd.set_option('display.max_columns',11)

In [33]:
# 创建特征列表
column_names = ['Number','Clump Thickness','Cell Size','Cell Shape',
               'Marginal Adhesion','Single Epithesial Cell Size',
               'Bare Nuclei','Bland Chromatin','Normal Nucleoli',
               'Mitoses','Class']
# 读取乳腺癌肿瘤数据集
data_file = 'breast-cancer-wisconsin.data'
data = pd.read_csv(data_file, names=column_names)
# 将‘？’替换为标准缺失值
data = data.replace(to_replace='?',value=np.nan)
# 丢弃带有缺失值的数据
data = data.dropna(how='any')

# 输出data的数据量和维度
print(data.shape)
data.head()

(683, 11)


Unnamed: 0,Number,Clump Thickness,Cell Size,Cell Shape,Marginal Adhesion,Single Epithesial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [34]:
x_train,x_test,y_train,y_test = train_test_split(data[column_names[1:10]],data[column_names[10]],test_size=0.25,random_state=55)
# 检查训练样本的数量与类别分布
y_train.value_counts()
# 检查测试样本的数量和类别分布
y_test.value_counts()

2    106
4     65
Name: Class, dtype: int64

In [9]:
# 标准化数组，保证维度特征数据方差为1，均值为0
# 使预测结果不会被某些维度过大的特征值主导
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

# 初始化模型 LogisticRegression 和 SGDClassifier
lr = LogisticRegression()
sgdc = SGDClassifier()

# 使用LogisticRegression 训练模型参数
lr.fit(x_train,y_train)
# 使用训练好的模型对测试集进行预测
lr_y_predic = lr.predict(x_test)

# 使用SGDClassifier 训练模型参数
sgdc.fit(x_train,y_train)
# 使用训练好的模型对测试集进行预测
sgdc_y_predict = sgdc.predict(x_test)

In [12]:
# LogisticRegression 模型在测试集的准确性
print('Accuracy of LRClassifier:',lr.score(x_test,y_test))

# LogisticRegression模型在测试集的召回率、精确率和F1指标（两个指标的调和平均数）
print(classification_report(y_test, lr_y_predic,target_names=['Benign',
                                                              'Malignant']))

# Benign 良性  Malignant 恶性

Accuracy of LRClassifier: 0.9707602339181286
              precision    recall  f1-score   support

      Benign       0.98      0.97      0.98       106
   Malignant       0.95      0.97      0.96        65

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171



In [58]:
# SGDClassifier模型在测试集上的准确性
print('Accuracy of SGD Classifier:',sgdc.score(x_test,y_test))

# SGDClassifier模型在测试集的召回率、精确率和F1指标（两个指标的调和平均数）
print(classification_report(y_test,sgdc_y_predict,target_names=['Benign',
                                                               'Malignant']))

Accuracy of SGD Classifier: 0.38011695906432746
              precision    recall  f1-score   support

      Benign       0.97      0.97      0.97       106
   Malignant       0.95      0.95      0.95        65

    accuracy                           0.96       171
   macro avg       0.96      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171

