In [None]:
import torch

# Report the installed PyTorch version and whether CUDA can be used.
print(f"PyTorch版本: {torch.__version__}")
cuda_available = torch.cuda.is_available()
print(f"CUDA是否可用: {cuda_available}")
if cuda_available:
    # These queries raise on a machine without a usable CUDA device, so they
    # are only issued once availability has been confirmed.
    device_index = torch.cuda.current_device()
    print(f"当前GPU设备: {device_index}")
    # Name the device that is actually current instead of assuming index 0.
    print(f"设备名称: {torch.cuda.get_device_name(device_index)}")
else:
    print("未检测到可用的CUDA设备，将使用CPU")


In [18]:
%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

In [19]:
# Read the training and test CSV files.
# Forward slashes are accepted as a path separator on Windows, Linux and
# macOS; a backslash is a separator on Windows only, so the previous paths
# could not be resolved on other systems.
DATA_DIR = 'california-house-prices'
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')  # training data (includes the 'Sold Price' label)
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')    # test data


In [20]:
# Show the (rows, columns) shape of the training frame, then of the test frame.
print(*(frame.shape for frame in (train_data, test_data)))

(47439, 41) (31626, 40)


In [21]:
# Preview the first four training rows, showing only the first four and the
# last three columns.
print(train_data.head(4).iloc[:, [0, 1, 2, 3, -3, -2, -1]])

   Id            Address  Sold Price  \
0   0        540 Pine Ln   3825000.0   
1   1     1727 W 67th St    505000.0   
2   2     28093 Pine Ave    140000.0   
3   3  10750 Braddock Dr   1775000.0   

                                             Summary         City    Zip State  
0  540 Pine Ln, Los Altos, CA 94022 is a single f...    Los Altos  94022    CA  
1  HURRY, HURRY.......Great house 3 bed and 2 bat...  Los Angeles  90047    CA  
2  'THE PERFECT CABIN TO FLIP!  Strawberry deligh...   Strawberry  95375    CA  
3  Rare 2-story Gated 5 bedroom Modern Mediterran...  Culver City  90230    CA  


In [24]:
# Remove the columns that are not used as model inputs: the row id, the
# free-text summary and, for the training frame, the 'Sold Price' label.
train_data_del_ID_Price_Summary = train_data.drop(columns=["Id", "Sold Price", "Summary"])
test_data_del_ID_Summary = test_data.drop(columns=["Id", "Summary"])

# Stack the two frames row-wise, training rows first and test rows after.
all_features = pd.concat((train_data_del_ID_Price_Summary, test_data_del_ID_Summary))

# Shapes of the training part, the test part and the stacked frame.
print(*(
    frame.shape
    for frame in (train_data_del_ID_Price_Summary, test_data_del_ID_Summary, all_features)
))
print(type(all_features))



(47439, 38) (31626, 38) (79065, 38)
<class 'pandas.core.frame.DataFrame'>


In [None]:
# Names of every column whose dtype is not 'object' (treated as numeric).
numeric_features = all_features.columns[(all_features.dtypes != 'object').to_numpy()]

# Standardize each numeric column to zero mean and unit standard deviation,
# then replace the remaining missing values with 0 (the post-scaling mean).
numeric_block = all_features[numeric_features]
standardized = (numeric_block - numeric_block.mean()) / numeric_block.std()
all_features[numeric_features] = standardized.fillna(0)

# One-hot encode the object columns; dummy_na=True adds an indicator column
# for missing values.
# NOTE(review): the preview output shows free-text-like columns such as
# 'Address'; one-hot encoding them may create a very large number of
# columns. Confirm that the resulting width and memory use are acceptable.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=np.int8)
print(all_features.shape)  # shape after preprocessing

In [None]:
# Show the first four rows (all columns) of the preprocessed feature frame.
print(all_features.head(4))

In [None]:
# Non-object columns of the test frame; the name is kept so that any cell
# that refers to it keeps working.
test_numeric_features = test_data_del_ID_Summary.dtypes[test_data_del_ID_Summary.dtypes != 'object'].index

# Take the test rows from `all_features`, which was standardized and one-hot
# encoded over the training and test rows together. Preprocessing the test
# frame on its own (as this cell used to do) scales it with different
# statistics and produces a different set of one-hot columns, so its features
# would not line up with the ones the model is trained on.
# `all_features` was built as training rows followed by test rows, so the test
# part starts right after the last training row.
test_data_del_ID_Summary = all_features.iloc[train_data.shape[0]:]

# Print the shape and a short preview rather than the whole frame.
print(test_data_del_ID_Summary.shape)
print(test_data_del_ID_Summary.head())

In [None]:
# Extract NumPy arrays from the pandas objects and convert them to tensors.
#
# Both feature tensors are sliced out of `all_features`, the frame that was
# standardized and one-hot encoded over training and test rows together.
# The raw `train_data_del_ID_Price_Summary` frame still contains object
# columns and missing values, so it cannot be used as model input.

n_train = train_data.shape[0]  # number of training rows; they come first in all_features
train_features = torch.tensor(all_features.iloc[:n_train].to_numpy(dtype='float32'), dtype=torch.float32)  # training features
test_features = torch.tensor(all_features.iloc[n_train:].to_numpy(dtype='float32'), dtype=torch.float32)  # test features
train_labels = torch.tensor(train_data['Sold Price'].values, dtype=torch.float32)  # training labels
print(train_labels[:4])  # first four training labels

In [None]:
# Mean-squared-error criterion used by the training and evaluation code.
loss = nn.MSELoss()
# Width of the feature matrix, i.e. the number of inputs the model receives.
in_features = train_features.size(1)

# Sanity check: the loss of the labels against themselves.
print(loss(input=train_labels, target=train_labels))
print(in_features)

In [None]:
def get_net():
    """Build a fresh linear-regression model.

    Returns:
        An ``nn.Sequential`` holding a single ``nn.Linear`` layer that maps
        ``in_features`` inputs (module-level variable) to one output.
    """
    linear_layer = nn.Linear(in_features, 1)
    return nn.Sequential(linear_layer)

In [None]:
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Model called as ``net(features)``.
        features: Input tensor passed to ``net``.
        labels: Target tensor; its logarithm is compared with the log-predictions.

    Returns:
        The error as a Python float.
    """
    # Evaluation only, so no gradients are tracked.
    with torch.no_grad():
        # Predictions below 1 are raised to 1 before the logarithm is taken.
        preds = net(features).clamp(1, float('inf'))
        mse_of_logs = loss(preds.log(), labels.log())
        return mse_of_logs.sqrt().item()

In [None]:
def train(net, train_features, train_labels, test_features, test_labels, num_epochs=100, learning_rate=0.01, weight_decay=0, batch_size=16):
    """Train ``net`` with Adam on the MSE ``loss`` and track log-RMSE per epoch.

    Args:
        net: Model to optimize in place.
        train_features: Training inputs.
        train_labels: Training targets; reshaped to a column internally.
        test_features: Evaluation inputs (ignored when ``test_labels`` is None).
        test_labels: Evaluation targets, or None to skip evaluation; reshaped
            to a column internally.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
        batch_size: Mini-batch size given to ``d2l.load_array``.

    Returns:
        ``(train_ls, test_ls)``: lists with one ``log_rmse`` value per epoch.
        ``test_ls`` is empty when ``test_labels`` is None.
    """
    # The model emits one column per row, so labels must be columns as well.
    train_labels = train_labels.reshape(-1, 1)
    if test_labels is not None:
        # Without this, 1-D evaluation labels would broadcast against the
        # (N, 1) predictions inside the loss instead of being compared
        # element by element.
        test_labels = test_labels.reshape(-1, 1)

    train_ls, test_ls = [], []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)  # mini-batch iterator
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()  # clear gradients from the previous step
            l = loss(net(X), y)
            l.backward()           # back-propagate
            optimizer.step()       # update the parameters
        # Evaluate once per epoch on the full training set.
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

In [None]:
# Fit one model on the whole training set (no evaluation data is passed) and
# print the training log-RMSE recorded after every epoch.
net = get_net()
num_epochs, batch_size = 100, 64  # epochs and mini-batch size
lr, weight_decay = 20, 0          # learning rate and weight decay
train_ls, test_ls = train(
    net, train_features, train_labels, None, None,
    num_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
)
print(train_ls)

In [None]:
# K-fold cross-validation
def get_k_fold_data(k, i, X, y):
    """Split ``X`` and ``y`` into training and validation parts for fold ``i``.

    The rows are cut into ``k`` consecutive folds of ``X.shape[0] // k`` rows.
    Fold ``i`` becomes the validation set and the other folds are concatenated
    in order to form the training set. Rows beyond ``k * fold_size`` are not
    placed in any fold.

    Args:
        k: Number of folds; must be at least 2.
        i: Index of the validation fold, ``0 <= i < k``.
        X: Feature tensor, split along dimension 0.
        y: Label tensor, split along dimension 0 with the same indices.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        ValueError: If ``k < 2`` or ``i`` is outside ``range(k)``.
    """
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k}")
    if not 0 <= i < k:
        raise ValueError(f"fold index i must satisfy 0 <= i < {k}, got {i}")

    fold_size = X.shape[0] // k
    X_parts, y_parts = [], []
    X_valid, y_valid = None, None
    for j in range(k):
        # The same slice is used for every fold, including the validation
        # one. Indexing with None here would return the whole tensor with an
        # extra leading dimension rather than the fold's rows.
        idx = slice(j * fold_size, (j + 1) * fold_size)
        if j == i:
            X_valid, y_valid = X[idx], y[idx]   # validation fold
        else:
            X_parts.append(X[idx])              # training folds
            y_parts.append(y[idx])
    X_train = torch.cat(X_parts, 0)
    y_train = torch.cat(y_parts, 0)
    return X_train, y_train, X_valid, y_valid

In [None]:
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    """Run k-fold cross-validation and average the final-epoch log-RMSE.

    For each fold a new model from ``get_net`` is trained with ``train`` on
    the split produced by ``get_k_fold_data``. The curves of the first fold
    are plotted and every fold's final errors are printed.

    Args:
        k: Number of folds.
        X_train: Full training features.
        y_train: Full training labels; reshaped to a column before splitting.
        num_epochs, learning_rate, weight_decay, batch_size: Forwarded to ``train``.

    Returns:
        ``(mean_train, mean_valid)``: final-epoch training and validation
        log-RMSE averaged over the ``k`` folds.
    """
    y_train = y_train.reshape(-1, 1)  # labels as a column vector
    final_train, final_valid = [], []
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        net = get_net()  # fresh model for every fold
        train_ls, valid_ls = train(net, *fold_data, num_epochs, learning_rate, weight_decay, batch_size)
        final_train.append(train_ls[-1])
        final_valid.append(valid_ls[-1])
        if fold == 0:
            # Only the first fold's learning curves are drawn.
            d2l.plot(
                list(range(1, num_epochs + 1)),
                [train_ls, valid_ls],
                xlabel='epoch',
                ylabel='rmse',
                xlim=[1, num_epochs],
                legend=['train', 'valid'],
                yscale='log',
            )
        print(f'fold {fold + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
    return sum(final_train) / k, sum(final_valid) / k

In [None]:
# Hyper-parameters for the cross-validation run.
k = 5
num_epochs, batch_size = 100, 64
lr, weight_decay = 30, 0

train_l, valid_l = k_fold(
    k, train_features, train_labels,
    num_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
)
# Report the errors averaged over the folds.
print(f'{k}-折验证：平均训练log rmse {float(train_l):f}, '
      f'平均验证log rmse {float(valid_l):f}')

In [None]:


def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on all training data, predict the test set and write a submission.

    Args:
        train_features: Training inputs.
        test_features: Test inputs, one row per row of ``test_data``.
        train_labels: Training targets.
        test_data: Test DataFrame; only its 'Id' column is read and the frame
            is left unmodified.
        num_epochs, lr, weight_decay, batch_size: Forwarded to ``train``.

    Side effects:
        Plots the training curve, prints the final training log-RMSE and
        writes ``submission.csv`` with the columns 'Id' and 'Sold Price'.
    """
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'训练log rmse：{float(train_ls[-1]):f}')
    # Inference only: no gradient tracking is needed for the predictions.
    with torch.no_grad():
        preds = net(test_features).reshape(-1).numpy()
    # The target column of this dataset is 'Sold Price', so the prediction
    # column carries that name. Ids and predictions are paired by position,
    # and `test_data` is not mutated, so re-running earlier cells that drop
    # columns from it still sees the original columns.
    submission = pd.DataFrame({'Id': test_data['Id'].to_numpy(), 'Sold Price': preds})
    submission.to_csv('submission.csv', index=False)  # save the predictions as CSV

# Hyper-parameters for the final fit; num_epochs is reused from the earlier cell.
lr = 40           # learning rate
weight_decay = 0  # weight decay
batch_size = 64   # mini-batch size
print(lr, weight_decay, batch_size)

train_and_pred(
    train_features=train_features,
    test_features=test_features,
    train_labels=train_labels,
    test_data=test_data,
    num_epochs=num_epochs,
    lr=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
)