### 线性回归

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
import numpy as np

warnings.filterwarnings("ignore")

Boston数据集的13个属性信息如下：

CRIM：城镇人均犯罪率
ZN：住宅用地所占比例
INDUS：城镇中非住宅用地所占比例
CHAS：是否靠近查尔斯河（1表示靠近，0表示不靠近）
NOX：一氧化氮浓度
RM：房屋平均房间数
AGE：自住房屋中建于1940年前的房屋所占比例
DIS：距离5个波士顿就业中心的加权距离
RAD：距离绿色公园的辐射范围
TAX：每10,000美元的全额物业税率
PTRATIO：城镇中学生与教师的比例
B：黑人占比
MEDV：房价中位数（单位：千美元

In [3]:
feature_name = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','MEDV']

In [4]:
# 加载波士顿房屋数据集
feature_name = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','MEDV']
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# 数据准备
X = data
y = target

In [5]:
#dataset = load_boston()
df = pd.DataFrame(X,columns=feature_name)
df['price'] = y
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,MEDV,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [6]:
X = df['RM'].values
y = df['price'].values
X[:5], y[:5]

(array([6.575, 6.421, 7.185, 6.998, 7.147]),
 array([24. , 21.6, 34.7, 33.4, 36.2]))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [8]:
X_train =X_train.reshape((-1,1))
X_test = X_test.reshape((-1,1))
y_train = y_train.reshape((-1,1))
y_test = y_test.reshape((-1,1))

In [9]:
X_train.shape,  y_train.shape

((404, 1), (404, 1))

### 初始化模型参数
这段代码的主要功能是初始化线性回归模型的参数，具体如下：

1. **输入参数**：
   - `dims`：表示训练数据的特征维度。

2. **输出内容**：
   - `w`：初始化的权重参数，形状为 `(dims, 1)` 的零矩阵。
   - `b`：初始化的偏差参数，值为 0。

3. **代码逻辑**：
   - 使用 `np.zeros((dims, 1))` 创建一个形状为 `(dims, 1)` 的零矩阵作为权重参数 `w`。
   - 将偏差参数 `b` 初始化为标量 0。
   - 打印 `w` 的形状以确认初始化是否正确。

4. **用途**：
   - 在训练线性回归模型之前，需要对模型参数进行初始化。此函数提供了一种简单的方式，将权重和偏差参数初始化为零，以便后续通过梯度下降等优化算法进行更新。


In [10]:
### 初始化模型参数
def initialize_params(dims):
    '''
    输入：
    dims：训练数据变量维度
    输出：
    w：初始化权重参数值
    b：初始化偏差参数值
    '''
    # 初始化权重参数为零矩阵
    w = np.zeros((dims, 1))
    # 初始化偏差参数为零
    b = 0
    print('w.shape',w.shape)
    return w, b

In [11]:
### 定义模型主体部分
### 包括线性回归公式、均方损失和参数偏导三部分
def linear_loss(X, y, w, b):
    '''
    输入:
    X：输入变量矩阵
    y：输出标签向量
    w：变量参数权重矩阵
    b：偏差项
    输出：
    y_hat：线性模型预测输出
    loss：均方损失值
    dw：权重参数一阶偏导
    db：偏差项一阶偏导
    '''
    # 训练样本数量
    num_train = X.shape[0]
    # 训练特征数量
    num_feature = X.shape[1]
    # 线性回归预测输出
    y_hat = np.dot(X, w) + b
    # 计算预测输出与实际标签之间的均方损失
    loss = np.sum((y_hat-y)**2)/num_train
    # 基于均方损失对权重参数的一阶偏导数
    dw = np.dot(X.T, (y_hat-y)) /num_train
    # 基于均方损失对偏差项的一阶偏导数
    db = np.sum((y_hat-y)) /num_train
    return y_hat, loss, dw, db


In [12]:
### 定义线性回归模型训练过程
def linear_train(X, y, learning_rate=0.01, epochs=10000):
    '''
    输入：
    X：输入变量矩阵
    y：输出标签向量
    learning_rate：学习率
    epochs：训练迭代次数
    输出：
    loss_his：每次迭代的均方损失
    params：优化后的参数字典
    grads：优化后的参数梯度字典
    '''
    # 记录训练损失的空列表
    loss_his = []
    # 初始化模型参数
    w, b = initialize_params(X.shape[1])
    # 迭代训练
    for i in range(1, epochs):
        # 计算当前迭代的预测值、损失和梯度
        y_hat, loss, dw, db = linear_loss(X, y, w, b)
        # 基于梯度下降的参数更新
        w += -learning_rate * dw
        b += -learning_rate * db
        # 记录当前迭代的损失
        loss_his.append(loss)
        # 每1000次迭代打印当前损失信息
        if i % 1000 == 0:
            print('epoch %d loss %f' % (i, loss))
        # 将当前迭代步优化后的参数保存到字典
        params = {
            'w': w,
            'b': b
        }
        # 将当前迭代步的梯度保存到字典
        grads = {
            'dw': dw,
            'db': db
        }     
    return loss_his, params, grads


In [13]:
linear_train(X_train, y_train)

w.shape (1, 1)
epoch 1000 loss 54.673255
epoch 2000 loss 52.544814
epoch 3000 loss 50.857963
epoch 4000 loss 49.521084
epoch 5000 loss 48.461568
epoch 6000 loss 47.621871
epoch 7000 loss 46.956387
epoch 8000 loss 46.428972
epoch 9000 loss 46.010979


([588.0343564356435,
  243.58805442113618,
  122.70684288122406,
  80.28309345783622,
  65.39304088550571,
  60.16560630941734,
  58.329150981462085,
  57.68271921745599,
  57.45391226824732,
  57.37166614462048,
  57.340854221392505,
  57.32809296110642,
  57.32166681538127,
  57.317464354254795,
  57.314042721056,
  57.310895562600535,
  57.307845180263804,
  57.30482921275983,
  57.301825775045145,
  57.298827186723564,
  57.295830752347875,
  57.29283552587841,
  57.289841175208934,
  57.28684758368567,
  57.28385471026541,
  57.28086254043945,
  57.2778710690112,
  57.27488029405198,
  57.27189021478,
  57.268900830816,
  57.26591214192197,
  57.26292414790952,
  57.25993684860769,
  57.256950243851655,
  57.25396433347878,
  57.25097911732721,
  57.247994595235404,
  57.24501076704195,
  57.242027632585476,
  57.239045191704705,
  57.23606344423838,
  57.23308239002526,
  57.23010202890419,
  57.22712236071399,
  57.224143385293615,
  57.22116510248193,
  57.21818751211796,
  57.

In [14]:
from sklearn.linear_model import LinearRegression as LR
lr = LR()

In [15]:
model = lr.fit(X_train, y_train)
y_pre = model.predict(X_test)

In [16]:
model.coef_

array([[8.76050748]])

In [17]:
model.intercept_

array([-32.39552265])

In [18]:
from sklearn.metrics import r2_score
r2_score(y_pre,y_test)
#或者直接指定参数
r2_score(y_true = y_test,y_pred = y_pre)


0.5877214395051775