In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# 1. 加载数据集
# 2. 数据基本处理
#    1. 处理缺失值: '?'
#    2. 选择特征值和目标值
#    3. 分割数据集
# 3. 特征工程(标准化)
# 4. 机器学习(模型训练) : 逻辑回归
# 5. 模型评估

In [2]:
# 1. 加载数据集
names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']
datas = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', 
                    names=names)

datas.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
# 2. 数据基本处理
#    1. 处理缺失值: '?'
# 把 '?' 替换为 np.nan
datas.replace(to_replace='?', value=np.nan, inplace=True)
#   2. 删除缺失值(默认按行删除)
datas.dropna(inplace=True)
# np.any(datas.isnull())
#    2. 选择特征值和目标值
# 特征值
x = datas.iloc[:,1:-1]
# 目标值
y = datas['Class']
#    3. 分割数据集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=8)

In [6]:
# 3. 特征工程(标准化)
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [7]:
# 4. 机器学习(模型训练) : 逻辑回归
# 创建逻辑回归的评估器
estimator = LogisticRegression()
# 模型训练
estimator.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
# 5. 模型评估
y_pred = estimator.predict(x_test)
# print(y_pred == y_test)
# 准确率
score = estimator.score(x_test, y_test)
print('准确率', score)

准确率 0.9707602339181286


In [12]:
from sklearn.metrics import classification_report
rs = classification_report(y_true=y_test, y_pred=y_pred, labels=[2, 4], target_names=['良性', '恶性'])
print(rs)

              precision    recall  f1-score   support

          良性       0.97      0.98      0.98       104
          恶性       0.97      0.96      0.96        67

   micro avg       0.97      0.97      0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171



In [16]:
from sklearn.metrics import roc_auc_score
# 把真实目标值处理为0(反例), 1(正例)
y_true = np.where(y_test>3, 1, 0)

# 计算AUC指标
auc = roc_auc_score(y_true, y_pred)
print('auc指标', auc)

auc指标 0.9679965556831229


In [None]:
# 1. 加载数据集
# 2. 数据基本处理
#    1. 处理缺失值: '?'
#    2. 选择特征值和目标值
#    3. 分割数据集
# 3. 特征工程(标准化)
# 4. 机器学习(模型训练) : 逻辑回归
# 5. 模型评估