In [124]:
import pandas as pd
# Read the training and test CSV files from the current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [83]:
# Look at the data first: together with data-description.txt, the shapes
# show how many feature columns each split has.
print(train_data.shape, test_data.shape, sep='\n')

(1460, 81)
(1459, 80)


In [125]:
# Step one is preprocessing, which has to be applied to every feature of both
# splits at once. Drop the first column of each frame (presumably the Id) and
# the last column of the training frame (presumably the SalePrice label), then
# stack the training rows on top of the test rows.
all_features = pd.concat(
    [train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]],
    axis=0,
)

In [126]:
# Standardize every numeric column to zero mean and unit variance. The
# statistics are computed over the stacked train+test rows.
#
# Columns are selected by dtype family rather than with `dtypes != 'object'`:
# when pandas infers text columns as a dedicated string dtype instead of
# object, the old test would classify them as numeric and the arithmetic
# below would fail. 'bool' is listed so boolean columns are still treated as
# numeric, as they were before.
numeric_features = all_features.select_dtypes(include=['number', 'bool']).columns

all_features[numeric_features] = all_features[numeric_features].apply(
    lambda col: (col - col.mean()) / col.std())

# Handle missing values as well: after standardization each column has mean 0,
# so filling with 0 amounts to imputing the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [127]:
# One-hot encode the discrete (non-numeric) columns. dummy_na=True adds an
# explicit indicator column for missing values in every encoded column.
#
# dtype=float keeps the whole frame numeric. Recent pandas versions emit bool
# dummy columns by default; mixed with the float columns that turns `.values`
# into an object array, which torch.tensor(..., dtype=torch.float32) rejects.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
all_features.shape

(2919, 331)

In [129]:
# Sanity check: total number of missing entries left after preprocessing.
missing_per_column = all_features.isna().sum()
missing_per_column.sum()

0

In [130]:
# Convert the preprocessed DataFrame into torch tensors.
import torch

# The first n_train rows of all_features are the training rows, because the
# training frame was placed first when the two splits were concatenated.
n_train = len(train_data)

train_features = torch.tensor(
    all_features.iloc[:n_train].to_numpy(), dtype=torch.float32)
test_features = torch.tensor(
    all_features.iloc[n_train:].to_numpy(), dtype=torch.float32)

# Labels as a column vector of shape (n_train, 1).
train_labels = torch.tensor(
    train_data['SalePrice'].to_numpy().reshape(-1, 1), dtype=torch.float32)

In [131]:
from torch import nn
from torch.utils import data

# Input width of the model = number of columns in the training feature matrix.
features_num = train_features.shape[1]


def get_net():
    """Return a new model: one linear layer mapping features_num inputs to 1 output."""
    linear_layer = nn.Linear(in_features=features_num, out_features=1)
    return nn.Sequential(linear_layer)


# Module-level instance. The K-fold and final-training helpers further down
# build their own networks through get_net() rather than using this one.
net = get_net()


def init_weight(m):
    """Draw the weights of a plain ``nn.Linear`` module from N(0, 0.01**2), in place.

    Intended as the callback for ``Module.apply``. Any module whose type is
    not exactly ``nn.Linear`` (containers, activations, subclasses) is left
    untouched. The bias is not modified.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0.0, std=0.01)
# Run the custom initializer over every sub-module of the module-level network.
net.apply(fn=init_weight)


# Mean squared error (averaged over all elements); this is the objective the
# training loop minimizes.
loss = nn.MSELoss(reduction='mean')

def load_array(feature, label, batch_size, shuffle=True, num_workers=0):
    """Wrap feature/label tensors in a mini-batch ``DataLoader``.

    Args:
        feature: Tensor whose first dimension indexes samples.
        label: Tensor with the same first-dimension size as ``feature``.
        batch_size: Number of samples per mini-batch (the last batch may be
            smaller).
        shuffle: Reshuffle the samples on every pass over the loader.
            Defaults to True, matching the previous hard-coded behavior.
        num_workers: Number of loader worker processes. Defaults to 0 (load
            in the calling process): the data is already an in-memory tensor,
            so starting worker processes on each pass only adds overhead.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(feature_batch, label_batch)``.
    """
    dataset = data.TensorDataset(feature, label)
    return data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                           num_workers=num_workers)

def log_rmse(net, features, labels):
    """Root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Callable mapping ``features`` to predictions shaped like ``labels``.
        features: Input tensor passed to ``net``.
        labels: Target tensor; values are passed to ``torch.log`` as-is.

    Returns:
        The metric as a Python float.
    """
    # Pure evaluation: disable autograd so no graph is built for this
    # forward pass.
    with torch.no_grad():
        # To keep the logarithm stable, predictions below 1 are raised to 1.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        # Explicit MSE so the metric does not change if the module-level
        # training loss object is ever swapped for something else.
        mse = torch.nn.functional.mse_loss(torch.log(clipped_preds),
                                           torch.log(labels))
        rmse = torch.sqrt(mse)
    return rmse.item()

import d2l

def concreteModel(net,train_feature,train_label,test_feature,test_label,num_epoches,learning_rate,weight_decay,batch_size):
    """Train ``net`` with Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimize; its parameters are updated in place.
        train_feature, train_label: Training tensors.
        test_feature, test_label: Held-out tensors. When ``test_label`` is
            None no held-out metric is computed.
        num_epoches: Number of full passes over the training data.
        learning_rate, weight_decay: Passed to ``torch.optim.Adam``.
        batch_size: Mini-batch size for the training loader.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE lists. ``test_ls`` is
        empty when ``test_label`` is None.
    """
    optimizer = torch.optim.Adam(params=net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    batches = load_array(train_feature, train_label, batch_size)
    evaluate_held_out = test_label is not None

    train_history, held_out_history = [], []
    for _ in range(num_epoches):
        for batch_X, batch_y in batches:
            optimizer.zero_grad()
            batch_loss = loss(net(batch_X), batch_y)
            batch_loss.backward()
            optimizer.step()
        # Metrics are computed on the full tensors once the epoch is done.
        train_history.append(log_rmse(net, train_feature, train_label))
        if evaluate_held_out:
            held_out_history.append(log_rmse(net, test_feature, test_label))
    return train_history, held_out_history

In [132]:
def get_k_fold_data(k, i, X, y):
    """Split ``X``/``y`` into training and validation parts for fold ``i`` of ``k``.

    Rows are partitioned into ``k`` contiguous folds of ``n // k`` rows each.
    The last fold additionally takes the ``n % k`` leftover rows, so every
    row lands in exactly one validation fold and none is dropped. When ``n``
    is divisible by ``k`` the folds are all the same size.

    Args:
        k: Number of folds; must be greater than 1.
        i: Zero-based index of the fold used for validation.
        X: Feature tensor whose first dimension indexes samples.
        y: Label tensor with the same first-dimension size as ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``. The training tensors hold
        all rows outside fold ``i``, in their original order.

    Raises:
        AssertionError: If ``k <= 1``.
        ValueError: If ``i`` is outside ``[0, k)`` or there are fewer than
            ``k`` samples.
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError(f'fold index i={i} must satisfy 0 <= i < k={k}')
    n_samples = X.shape[0]
    if n_samples < k:
        raise ValueError(
            f'cannot split {n_samples} samples into k={k} non-empty folds')

    fold_size = n_samples // k
    start = i * fold_size
    # The last fold runs to the end so the remainder rows are not lost.
    stop = n_samples if i == k - 1 else start + fold_size

    X_valid, y_valid = X[start:stop], y[start:stop]
    X_train = torch.cat([X[:start], X[stop:]], 0)
    y_train = torch.cat([y[:start], y[stop:]], 0)
    return X_train, y_train, X_valid, y_valid

In [133]:
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run K-fold cross-validation and average the final-epoch log-RMSE.

    For each of the ``k`` folds a fresh network is created, trained on the
    remaining folds and evaluated on the held-out fold. One summary line per
    fold is printed.

    Returns:
        ``(mean_train_log_rmse, mean_valid_log_rmse)`` over the ``k`` folds,
        using the last epoch's value from each fold.
    """
    train_l_sum = 0
    valid_l_sum = 0
    for i in range(k):
        fold_X_train, fold_y_train, fold_X_valid, fold_y_valid = \
            get_k_fold_data(k, i, X_train, y_train)
        fold_net = get_net()
        train_ls, valid_ls = concreteModel(
            fold_net, fold_X_train, fold_y_train, fold_X_valid, fold_y_valid,
            num_epochs, learning_rate, weight_decay, batch_size)
        print(f'折{i + 1}，训练log rmse{float(train_ls[-1]):f}, '
              f'验证log rmse{float(valid_ls[-1]):f}')
        # Only the final epoch of each fold contributes to the averages.
        train_l_sum = train_l_sum + train_ls[-1]
        valid_l_sum = valid_l_sum + valid_ls[-1]
    return train_l_sum / k, valid_l_sum / k

In [134]:
# Hyperparameters for the K-fold run; the same names are passed to the final
# training call in a later cell.
k = 5
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 64

train_l, valid_l = k_fold(
    k, train_features, train_labels,
    num_epochs, lr, weight_decay, batch_size,
)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')

torch.Size([1460, 331])
torch.Size([1460, 1])
折1，训练log rmse0.169609, 验证log rmse0.156060
torch.Size([1460, 331])
torch.Size([1460, 1])
折2，训练log rmse0.161922, 验证log rmse0.188611
torch.Size([1460, 331])
torch.Size([1460, 1])
折3，训练log rmse0.164113, 验证log rmse0.168780
torch.Size([1460, 331])
torch.Size([1460, 1])
折4，训练log rmse0.167919, 验证log rmse0.154550
torch.Size([1460, 331])
torch.Size([1460, 1])
折5，训练log rmse0.162933, 验证log rmse0.182970
5-折验证: 平均训练log rmse: 0.165299, 平均验证log rmse: 0.170194


In [135]:
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set and write test predictions to ``submission.csv``.

    Args:
        train_features, train_labels: Tensors used for training.
        test_features: Tensor of test inputs, one row per row of ``test_data``.
        test_data: DataFrame providing the ``Id`` column of the submission.
            It is only read; no column is added to it.
        num_epochs, lr, weight_decay, batch_size: Training hyperparameters.

    Side effects:
        Prints the final training log-RMSE and writes ``submission.csv``
        (columns ``Id`` and ``SalePrice``) to the current working directory.
    """
    net = get_net()
    train_ls, _ = concreteModel(net, train_features, train_labels, None, None,
                                num_epochs, lr, weight_decay, batch_size)
    print(f'训练log rmse：{float(train_ls[-1]):f}')
    # Apply the network to the test set. Inference only, so autograd is off.
    with torch.no_grad():
        preds = net(test_features).numpy()
    # Build the export for Kaggle as a separate frame. Writing the predictions
    # back into `test_data` would add a SalePrice column that the preprocessing
    # cell picks up as a feature if it is re-run. Pairing Id and prediction
    # positionally also avoids relying on index alignment.
    submission = pd.DataFrame({'Id': test_data['Id'].to_numpy(),
                               'SalePrice': preds.reshape(-1)})
    submission.to_csv('submission.csv', index=False)


train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)

训练log rmse：0.162425
