In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
data = load_breast_cancer()['data']
mm = MinMaxScaler()
data = mm.fit_transform(data)
label = load_breast_cancer()['target']

In [3]:
train_X,test_X,train_y,test_y = train_test_split(data,label,test_size=0.2,random_state=5)
print(train_X.shape,  train_y.shape)
train_y, test_y = train_y.reshape((-1,1)), test_y.reshape((-1,1))
print(train_X.shape,  train_y.shape)

(455, 30) (455,)
(455, 30) (455, 1)


In [4]:
### 定义sigmoid函数
def sigmoid(x):
    return 1/(1+np.exp(-x))

###定义参数初始化函数
def initialize_param(dims):
    w = np.zeros((dims, 1))
    b = 0
    return w, b

In [5]:
### 定义对数几率回归模型主体
def logistic(X, y, w, b):
    '''
    输入:
    X：输入特征矩阵
    y：输出标签向量
    w：权重系数
    b：偏置参数
    输出：
    y_hat: 对数几率回归模型输出
    cost: 损失
    dw：权重梯度
    db：偏置梯度
    '''
    # 训练样本数量
    num_train = X.shape[0]
    # 训练特征数量
    num_feature = X.shape[1]
    # 对数几率回归模型输出
    y_hat = sigmoid(np.dot(X, w) + b)
    # 计算预测输出与实际标签之间的均方损失
    cost = -1/num_train*np.sum(y*np.log(y_hat)+(1-y)*np.log(1-y_hat))
    # 更新梯度
    dw = np.dot(X.T, (y_hat-y))/num_train
    db = np.sum((y_hat-y))/num_train
    cost = np.squeeze(cost)
    return y_hat, cost, dw, db

In [6]:
### 定义对数几率回归模型训练过程
def logistic_train(X, y, learning_rate=0.1, epochs=500):
    '''
    输入：
    X：输入变量矩阵
    y：输出标签向量
    learning_rate：学习率
    epochs：训练迭代次数
    输出：
    cost_list：损失列表
    params：优化后的参数字典
    grads：优化后的参数梯度字典
    '''
    # 记录训练损失的空列表
    cost_list= []
    # 初始化模型参数
    w, b = initialize_param(X.shape[1])
    # 迭代训练
    for i in range(1, epochs):
        # 计算当前迭代的预测值、损失和梯度
        y_hat, cost, dw, db = logistic(X, y, w, b)
        # 基于梯度下降的参数更新
        w += -learning_rate * dw
        b += -learning_rate * db
        # 记录当前迭代的损失
        if i % 100 == 0:
            cost_list.append(cost)
            print('epoch %d loss %f' % (i, cost))
        # 将当前迭代步优化后的参数保存到字典
        params = {
            'w': w,
            'b': b
        }
        # 将当前迭代步的梯度保存到字典
        grads = {
            'dw': dw,
            'db': db
        }     
    return cost_list, params, grads

In [7]:
def logis_predict(X, params):
    '''
    输入：
    X: 输入特征矩阵
    params: 训练好的模型参数
    输出：
    y_prediction: 转换后的模型预测值
    '''
    # 模型预测值
    
    
    y_prediction = sigmoid(np.dot(X, params['w']) + params['b'])
    # 基于分类阈值对概率预测值进行类别转换
    for i in range(len(y_prediction)):        
        if y_prediction[i] > 0.5:
            y_prediction[i] = 1
        else:
            y_prediction[i] = 0
            
    return y_prediction

In [8]:
cost_list, params, grads = logistic_train(train_X, train_y)

epoch 100 loss 0.502322
epoch 200 loss 0.407481
epoch 300 loss 0.351695
epoch 400 loss 0.314808


In [9]:
y_pre = logis_predict(test_X, params=params)
y_pre[:5]

array([[0.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [10]:
from sklearn.metrics import accuracy_score
score = accuracy_score(test_y, y_pre)
score

0.9298245614035088

## 使用sklearn中的逻辑回归

In [11]:
from sklearn.linear_model import LogisticRegression as lr
clf = lr(random_state=1).fit(train_X, train_y)
y_pred = clf.fit(test_X, test_y)
score = accuracy_score(test_y, y_pre)
score

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9298245614035088