In [1]:
import torch
import torch.nn as nn
import numpy as np

import pandas as pd

In [3]:
print(torch.__version__)
torch.set_default_tensor_type(torch.FloatTensor)

1.3.0


In [5]:
train_data = pd.read_csv("/document/2019/python/Data/housePrices/train.csv")
test_data = pd.read_csv("/document/2019/python/Data/housePrices/test.csv")

In [6]:
print(train_data.shape, test_data.shape)

(1460, 81) (1459, 80)


In [7]:
print(train_data.iloc[0:4, :])

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD         Normal     208500  
1   2007        WD         Normal     181500  
2   2008        WD         Normal     223500  
3   2006        WD        Abnorml  

In [8]:
print(test_data.iloc[0:4, :])

     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC  Fence MiscFeature  \
0         Lvl    AllPub  ...         120        0    NaN  MnPrv         NaN   
1         Lvl    AllPub  ...           0        0    NaN    NaN        Gar2   
2         Lvl    AllPub  ...           0        0    NaN  MnPrv         NaN   
3         Lvl    AllPub  ...           0        0    NaN    NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  
0       0      6    2010        WD         Normal  
1   12500      6    2010        WD         Normal  
2       0      3    2010        WD         Normal  
3       0      

In [36]:
# 由于数据的第一列为id，在模型训练不参与运算，因此需要将其删除
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

In [37]:
# 对数值类型的特征做标准化, 找出数值类型的列，对这些列做标准化处理：减去均值然后除以标准差
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# 缺省值设为0，即将所有NaN值改为0
all_features = all_features.fillna(0)
print(all_features)

      MSSubClass MSZoning  LotFrontage   LotArea Street Alley LotShape  \
0       0.067320       RL    -0.184443 -0.217841   Pave     0      Reg   
1      -0.873466       RL     0.458096 -0.072032   Pave     0      Reg   
2       0.067320       RL    -0.055935  0.137173   Pave     0      IR1   
3       0.302516       RL    -0.398622 -0.078371   Pave     0      IR1   
4       0.067320       RL     0.629439  0.518814   Pave     0      IR1   
...          ...      ...          ...       ...    ...   ...      ...   
1454    2.419286       RM    -2.069222 -1.043758   Pave     0      Reg   
1455    2.419286       RM    -2.069222 -1.049083   Pave     0      Reg   
1456   -0.873466       RL     3.884968  1.246594   Pave     0      Reg   
1457    0.655311       RL    -0.312950  0.034599   Pave     0      Reg   
1458    0.067320       RL     0.201080 -0.068608   Pave     0      Reg   

     LandContour Utilities LotConfig  ... ScreenPorch  PoolArea PoolQC  Fence  \
0            Lvl    AllPub    

In [38]:
# 将非数值型特征，变为数值型：如果type有name，age两种属性，则变为两个特征type_name:值为0或1；type_age:值为0或1
all_features = pd.get_dummies(all_features, dummy_na=True)
print(all_features.shape)

(2919, 354)


In [39]:
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float).view(-1, 1)

In [40]:
print(train_features[0], train_labels[0])

tensor([ 0.0673, -0.1844, -0.2178,  0.6461, -0.5072,  1.0461,  0.8967,  0.5230,
         0.5807, -0.2930, -0.9345, -0.4442, -0.7737,  1.2072, -0.1012,  0.4135,
         1.0865, -0.2498,  0.7812,  1.2324,  0.1699, -0.2077,  0.9867, -0.9242,
         0.9731,  0.3064,  0.3488, -0.7406,  0.2000, -0.3595, -0.1033, -0.2859,
        -0.0631, -0.0896, -1.5519,  0.1576,  0.0000,  0.0000,  0.0000,  0.0000,
         1.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.00

In [47]:
loss = nn.MSELoss()

# 定义模型，为单层全连接网络，输入为特征个数，输出为1；然后初始化模型参数
def get_net(features_num):
    net = nn.Linear(features_num, 1)
    for param in net.parameters():
        nn.init.normal_(param, mean=0, std=0.01)
    return net

In [42]:
# 定义损差函数，kaggle比赛对于房价数据集要求采用对数均方差
def log_rmse(net, features, labels):
    with torch.no_grad():
        clipped_preds = torch.max(net(features), torch.tensor(1.0))
        rmse = torch.sqrt(2*loss(clipped_preds.log(), labels.log()).mean())
    return rmse.item()

In [43]:
def train(net, train_features, train_labels, test_features, test_labels, 
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    
    # 这里里里使用用了了Adam优化算法
    optimizer = torch.optim.Adam(params=net.parameters(),
                                 lr=learning_rate, weight_decay=weight_decay)
    net = net.float()
    for epoch in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X.float()), y.float())
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

In [44]:
def get_k_fold_data(k, i, X, y):
    # 返回第i折交叉验证时所需要的训练和验证数据
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat((X_train, X_part), dim=0)
            y_train = torch.cat((y_train, y_part), dim=0)
    return X_train, y_train, X_valid, y_valid

In [45]:
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net(X_train.shape[1])
        train_ls, valid_ls = train(net, *data, num_epochs,
                                   learning_rate, weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
#         if i == 0:
#             d2l.semilogy(range(1, num_epochs + 1), train_ls, 
#                      'epochs', 'rmse', range(1, num_epochs + 1), valid_ls,  ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k

In [48]:
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))

fold 0, train rmse 0.240769, valid rmse 0.220868
fold 1, train rmse 0.229672, valid rmse 0.269309
fold 2, train rmse 0.232045, valid rmse 0.238595
fold 3, train rmse 0.236954, valid rmse 0.218382
fold 4, train rmse 0.231144, valid rmse 0.258857
5-fold validation: avg train rmse 0.234117, avg valid rmse 0.241202


In [49]:
def train_and_pred(train_features, test_features, train_labels, test_data, 
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net(train_features.shape[1])
    train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size)
#     d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).detach().numpy()
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'],
    test_data['SalePrice']], axis=1)
    submission.to_csv('./submission.csv', index=False)

train_and_pred(train_features, test_features, train_labels, 
               test_data, num_epochs, lr, weight_decay, batch_size)

train rmse 0.229641
