In [None]:
''' 
y_pred = 推理函数
a = 学习率

——————————推理函数y_pred——————————
设
w = [w0,w1] (权重向量)
x = [1,x1] (特征向量 x0 = 1 代表截距)
y = wx =  w0*1 + w1*x1

p = sigmoid(y) = 1 / (1 + e^(-y))
将y(-∞,∞) 映射到0-1之间 表示概率 约等于二分类

推理函数
y_pred = p


——————————损失函数L——————————
y = 标签 0或1 (注意: 损失函数中的 y 表示真实标签; 上文 y = wx 是线性输出, 下文 dp/dy、dy/dw 中的 y 也指线性输出 wx, 两者不是同一个量)
交叉熵作为损失函数
L = y * log(1/y_pred) + (1-y) * log(1/(1-y_pred))


——————————损失函数对于w的偏导数dL——————————
dL/dw
1 L-p 的导数
2 p-y 的导数
3 y-w 的导数
4 L-w 的导数

——————————1 L-p 的导数————————————————
L = y * log(1/y_pred) + (1-y) * log(1/(1-y_pred))
L = y * log(1/p) + (1-y) * log(1/(1-p))
= -y * log(p) - (1-y) * log(1-p)

左边 = -y * log(p)
右边 = (1-y) * log(1-p)

左边求导
= -y * (1/p)

右边求导
= (1-y) * (1/(1-p)) * (-1)

合并 = 左边 - 右边
= -y * (1/p) - (1-y) * (1/(1-p)) * (-1)
= -y * (1/p) + (1-y) * (1/(1-p))
= (p-y)  /  (p*(1-p))

则
dL/dp = (p-y)  /  (p*(1-p))

——————————2 p-y 的导数 dp/dy————————————————
p = sigmoid(y) = 1 / (1 + e^(-y))
dp/dy = p * (1-p) 

——————————3 y-w 的导数 dy/dw————————————————
y = wx
dy/dw = x

——————————4 L-w 的导数 dL/dw————————————————
dL/dw = dL/dp * dp/dy * dy/dw
= (p-y)  /  (p*(1-p)) *  p * (1-p) * x
= (p-y) * x


——————————结论————————————————
dL/dw = (p-y) * x

——————————更新权重——————————
w = w - a * dL


'''

In [None]:
''' 
sigmoid 求导
dp/dy
p = sigmoid(y) = 1 / (1 + e^(-y))

s(y) = 1 / (1 + e^(-y))
1-s(y) = e^(-y) / (1 + e^(-y))


令 u = 1+e^(-y)
则
s(y) = 1/u
s'(y) = (1/u)' * u'

分开求导
(1/u)' 
= (u^-1)'
= -1 * u^-2 
= -1/u^2
= -1/(1+e^(-y))^2

u'
= (1+e^(-y))'
= 1' + e^(-y)'
= 0 + e^(-y)'
=  0 + (-y)' * e^(-y)
= -e^(-y)

合并
s'(y) = (1/u)' * u'
= e^(-y)  /  (1+e^(-y))^2

观察并化简
可分解成
a = e^(-y)/(1+e^(-y)) = 1-s(y)
b = 1/(1+e^(-y)) = s(y)

则
s'(y) = a * b
s'(y) = (1-s(y)) * s(y)

'''

In [None]:
''' 
e^x 求导

y = e^x
两边取对数 ln(y) = x
两边对x求导 1/y * y' = 1
y' = y
则
(e^x)' = e^x

'''

In [17]:
import numpy as np

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The model is ``p = sigmoid(x_aug @ w)`` where ``x_aug`` is the feature
    matrix with a leading column of ones (the intercept term).  Training
    minimises the mean cross-entropy loss, whose gradient with respect to
    ``w`` is ``x_aug.T @ (p - y) / m``.

    Parameters
    ----------
    l_rate : float
        Learning rate (step size) of each gradient-descent update.
    n_iterations : int
        Number of gradient-descent updates performed by ``fit``.

    Attributes
    ----------
    w : numpy.ndarray or None
        Weight column vector of shape ``(n_features + 1, 1)``; ``w[0]`` is the
        intercept.  ``None`` until ``fit`` has been called.
    """

    def __init__(self, l_rate=0.01, n_iterations=1000):
        self.l_rate = l_rate  # learning rate
        self.n_iterations = n_iterations  # number of gradient steps
        self.w = None  # weights, set by fit()

    @staticmethod
    def _add_intercept(x):
        """Return ``x`` as a float 2-D array with a leading column of ones."""
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            raise ValueError(
                f"x must be 2-D (n_samples, n_features), got ndim={x.ndim}"
            )
        return np.concatenate([np.ones((x.shape[0], 1)), x], axis=1)

    def sigmoid(self, y):
        """Logistic function ``1 / (1 + exp(-y))``, evaluated without overflow.

        ``exp`` is only ever applied to a non-positive argument, so large
        negative inputs no longer trigger an overflow warning:
        for ``y >= 0`` the value is ``1 / (1 + exp(-y))`` and for ``y < 0``
        it is the algebraically equal ``exp(y) / (1 + exp(y))``.
        """
        z = np.asarray(y, dtype=float)
        e = np.exp(-np.abs(z))  # always in (0, 1]
        return np.where(z >= 0, 1.0, e) / (1.0 + e)

    def fit(self, x, y):
        """Learn the weights from training data by gradient descent.

        Parameters
        ----------
        x : array-like of shape (m, n_features)
            Training features; an intercept column of ones is prepended
            internally, i.e. ``[x1, x2, ...] -> [1, x1, x2, ...]``.
        y : array-like with m elements
            Binary labels (0 or 1); reshaped to a column of shape ``(m, 1)``.

        Raises
        ------
        ValueError
            If ``x`` is not 2-D, has no samples, or the number of labels
            differs from the number of samples.
        """
        x = self._add_intercept(x)
        m, n = x.shape  # m samples, n features (including the intercept)
        if m == 0:
            raise ValueError("x must contain at least one sample")

        y = np.asarray(y, dtype=float).reshape(-1, 1)  # force shape (m, 1)
        if y.shape[0] != m:
            # Without this check a length-1 y would broadcast silently.
            raise ValueError(
                f"x has {m} samples but y has {y.shape[0]} labels"
            )

        self.w = np.zeros((n, 1))  # start from all-zero weights
        for _ in range(self.n_iterations):
            # Forward pass: predicted probability of class 1.
            y_pred = self.sigmoid(x @ self.w)
            # Gradient of the mean cross-entropy loss w.r.t. w.
            dL = x.T @ (y_pred - y) / m
            # Gradient-descent step.
            self.w = self.w - self.l_rate * dL

    def predict_prob(self, x, threshold=0.5):
        """Return the predicted probability of class 1, shape ``(m, 1)``.

        ``fit`` must have been called first.  ``threshold`` is accepted only
        for backward compatibility and is ignored.
        """
        x = self._add_intercept(x)
        return self.sigmoid(x @ self.w)

    def predict(self, x, threshold=0.5):
        """Return class labels (0 or 1), shape ``(m, 1)``.

        A sample is labelled 1 when its predicted probability is at least
        ``threshold``.
        """
        p = self.predict_prob(x)
        return (p >= threshold).astype(int)




In [18]:
# Demo: train the model on a small hand-made dataset and print its predictions.

# Seed NumPy's global RNG (no random numbers are drawn in this cell itself).
np.random.seed(42)

# Binary-classification toy data: 10 training samples with 2 features each.
x = np.array([
    [1, 2], [2, 1], [1, 3], [3, 2], [2, 3],
    [3, 3], [4, 2], [2, 4], [3, 4], [4, 4],
])
y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1])

# Show the data column by column, followed by the rule that generated the labels.
print("使用数据:")
for label, values in (("x1:", x[:, 0]), ("x2:", x[:, 1]), ("y:", y)):
    print(label, values)
print('数据规律 x1 + x2 >= 5 → y=1 else y=0')

# Train with gradient descent.
model = LogisticRegression(l_rate=0.1, n_iterations=1000)
model.fit(x, y)

# Predicted probabilities (values between 0 and 1), rounded to two decimals.
y_pred_prob = model.predict_prob(x)
# Predicted class labels (0 or 1).
y_pred = model.predict(x)
print(y_pred_prob.flatten().round(2))
print(y_pred.flatten())


使用数据:
x1: [1 2 1 3 2 3 4 2 3 4]
x2: [2 1 3 2 3 3 2 4 4 4]
y: [0 0 0 1 1 1 1 1 1 1]
数据规律 x1 + x2 >= 5 → y=1 else y=0
[0.18 0.39 0.32 0.9  0.75 0.95 0.98 0.87 0.98 1.  ]
[0 0 0 1 1 1 1 1 1 1]


In [None]:
''' 
对于logistic回归的补充


从熵的角度理解
p(A) + p(B) = 1 (二分类事件)

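对数概率 (即自信息的相反数; 自信息 I(A) = log(1/p(A)) = -log(p(A)))
n(A) = -log(1/p(A)) = log(p(A)) = -I(A)
n(B) = -log(1/p(B)) = log(p(B)) = -I(B)

y = n(A) - n(B) 用来衡量谁更有可能发生
n(A) - n(B)  > 0 -> p(A) > p(B) A的信息量更低 A更常见 A概率更大
n(A) - n(B)  = 0 -> 相等
n(A) - n(B)  < 0 -> p(A) < p(B) B的信息量更低 B更常见 B概率更大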

y = n(A) - n(B)
= (-log(1/p(A))) - (-log(1/p(B)))
= log(p(A)) - log(p(B))
= log(p(A)/p(B))
= log(p(A) / (1 - p(A)))

y = log(p/(1-p))
e^y = p/(1-p)
(1-p)e^y = p
e^y - e^y * p = p
e^y = p + e^y * p
e^y = p(1 + e^y)
p = e^y / (1 + e^y)
p = 1 / (1 + e^(-y))
p = sigmoid(y)
p = sigmoid(n(A) - n(B))
p = sigmoid(log(p/(1-p)))
又因为
y = wx 
所以
标准写法
p = sigmoid(wx)
其他写法
log(p / (1 - p)) = wx 
wx = n(A) - n(B)

'''