In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

import sys


from collections import defaultdict


%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
train = pd.read_csv('feature_pre.csv')
feature = train.iloc[:,:-1]
feature.head()
feature.shape
# test = pd.read_csv('./all/test.csv')

(87, 16)

In [3]:
train2 = pd.read_csv('feature3_pre.csv')
feature2 = train.iloc[:,:-1]
feature2.head()
feature2.shape
# test = pd.read_csv('./all/test.csv')
train_features2 = feature2.values.astype(np.float32)
train_features2 = torch.from_numpy(train_features2)


In [4]:
# 对预测的价格取 log
train['price'] = np.log(train['price'])

In [5]:
# 提取训练集和验证集的特征
train_features = feature.values.astype(np.float32)
train_features = torch.from_numpy(train_features)

# 提取训练集和验证集的label
train_labels = train['price'].values.astype(np.float32)
train_labels = torch.from_numpy(train_labels)


In [6]:
def get_model(feat_dim):
    net = nn.Sequential(
        nn.Linear(feat_dim, 1)
    )
    return net

net = get_model(4)
print(net)

Sequential(
  (0): Linear(in_features=4, out_features=1, bias=True)
)


In [7]:
def get_data(x, y, batch_size, shuffle):
    dataset = TensorDataset(x, y)
    return DataLoader(dataset, batch_size, shuffle=shuffle, num_workers=4)

def get_rmse(model, feature, label, use_gpu):
    if use_gpu:
        feature = feature.cuda()
        label = label.cuda()
    model.eval()
    mse_loss = nn.MSELoss()
    with torch.no_grad():
        pred = model(feature)
    # clipped_pred = pred.clamp(1, float('inf'))
    rmse = (mse_loss(pred, label)).sqrt()

    return rmse.item()

def pred(net, test_labels, test_features):
    net = net.eval()
    net = net.cpu()
    with torch.no_grad():
        preds = net(test_features)
    preds = np.exp(preds.numpy())
    test_labels = np.exp(test_labels.numpy())

    print(preds,test_labels)
    MAE = (np.abs(preds - test_labels)).sum(axis=0)/test_features.shape[0]
    MRE = MAE/(test_labels.sum(axis=0))
    return MRE[0], MAE[0]

def train_model(model, x_train, y_train, x_valid, y_valid, epochs, lr, weight_decay, batch_size, use_gpu = 0):
    if use_gpu:
        model = model.cuda()
    metric_log = defaultdict(list)

    train_data = get_data(x_train, y_train, batch_size, True)
    if x_valid is not None:
        valid_data = get_data(x_valid, y_valid, batch_size, False)
    else:
        valid_data = None

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.MSELoss()

    for e in range(epochs):
        # 训练模型
        model.train()
        for data in train_data:
            x, y = data
            if use_gpu:
                x = x.cuda()
                y = y.cuda()
            # forward
            out = model(x)
            loss = criterion(out, y)
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        metric_log['train_rmse'].append(get_rmse(model, x_train, y_train, use_gpu))
        metric_log['valid_rmse'] = []
        # 测试模型
        metric_log['valid_MAE']

        if x_valid is not None:
            metric_log['valid_rmse'].append(get_rmse(model, x_valid, y_valid, use_gpu))
        
            print_str = 'epoch: {}, train rmse: {:.3f}, valid rmse: {:.3f}' \
                .format(e + 1, metric_log['train_rmse'][-1], metric_log['valid_rmse'][-1])
        else:
            print_str = 'epoch: {}, train rmse: {:.3f}'.format(e + 1, metric_log['train_rmse'][-1])
    if (e + 1) % 10 == 0:
        print(print_str)
        print()
    print(metric_log['valid_rmse'])


    return metric_log['train_rmse'], metric_log['valid_rmse']


In [8]:
# K折交叉验证
def get_k_fold_data(k, i, X, y):
    # 返回第i折交叉验证时所需要的训练和验证数据
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat((X_train, X_part), dim=0)
            y_train = torch.cat((y_train, y_part), dim=0)
    return X_train, y_train, X_valid, y_valid

def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    MRE, MAE = 0, 0
    
    for i in range(k):
        a, b, c, d = get_k_fold_data(k, i, X_train, y_train)
        if i==4:
            net = get_model(X_train.shape[1])
            train_ls, valid_ls = train_model(net, a, b, c, d, num_epochs, learning_rate, weight_decay, batch_size)
            train_l_sum += train_ls[-1]
            valid_l_sum += valid_ls[-1]

            a,b = pred(net, d, c)

            MRE += a
            MAE += b
#         if i==0:
#             figsize = (10, 5)
#             fig = plt.figure(figsize=figsize)
#             #print(train_ls)
#             plt.plot(train_ls, color='red', label='train')
#             plt.plot(valid_ls, color='green', label='vaild')
#             plt.legend(loc='best')
#             plt.xlabel('epochs')
#             plt.ylabel('loss')
#             plt.show()

#         if i == 0:
#             d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
#                          range(1, num_epochs + 1), valid_ls,
#                          ['train', 'valid'])
       # print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k, MRE/k, MAE/k

# 模型选择
k, num_epochs, lr, weight_decay, batch_size = 5, 300, 0.05, 0, 64
train_l, valid_l, m1, m2 = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)

print('%d-fold validation: avg train rmse %f, avg valid rmse %f, avg valid MRE %f, avg valid MAE %f' % (k, train_l, valid_l, m1, m2))



  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


epoch: 300, train rmse: 0.606, valid rmse: 0.448

[0.4479233920574188]
[[3835.4673]
 [4415.3496]
 [3891.441 ]
 [3915.3174]
 [4328.3057]
 [5621.1025]
 [4648.1445]
 [3817.262 ]
 [4928.653 ]
 [3860.5684]
 [3879.1467]
 [3858.496 ]
 [3953.0051]
 [3857.2048]
 [3890.235 ]
 [3793.3867]
 [3898.127 ]] [3369.2617 4689.294  2209.4387 2796.036  8908.591  8684.764  4761.382
 2894.231  4562.3296 4612.7817 2745.2014 5317.6353 6840.2446 3758.952
 3306.8052 1913.8713 2738.2239]
5-fold validation: avg train rmse 0.121278, avg valid rmse 0.089585, avg valid MRE 0.002082, avg valid MAE 154.279578


In [9]:
k, num_epochs, lr, weight_decay, batch_size = 5, 300, 0.05, 0, 64
train_l, valid_l, m1, m2 = k_fold(k, train_features2, train_labels, num_epochs, lr, weight_decay, batch_size)

print('%d-fold validation: avg train rmse %f, avg valid rmse %f, avg valid MRE %f, avg valid MAE %f' % (k, train_l, valid_l, m1, m2))


epoch: 300, train rmse: 0.598, valid rmse: 0.444

[0.443930059671402]
[[3833.0503]
 [4316.15  ]
 [3876.1477]
 [3898.3945]
 [4269.044 ]
 [5349.849 ]
 [4510.4927]
 [3811.7363]
 [4760.633 ]
 [3850.2366]
 [3863.069 ]
 [3849.451 ]
 [3932.5425]
 [3847.2637]
 [3871.3157]
 [3791.788 ]
 [3884.952 ]] [3369.2617 4689.294  2209.4387 2796.036  8908.591  8684.764  4761.382
 2894.231  4562.3296 4612.7817 2745.2014 5317.6353 6840.2446 3758.952
 3306.8052 1913.8713 2738.2239]
5-fold validation: avg train rmse 0.119668, avg valid rmse 0.088786, avg valid MRE 0.001943, avg valid MAE 143.984314


In [None]:
# K折交叉验证
def get_k_fold_data(k, i, X, y):
    # 返回第i折交叉验证时所需要的训练和验证数据
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat((X_train, X_part), dim=0)
            y_train = torch.cat((y_train, y_part), dim=0)
    return X_train, y_train, X_valid, y_valid

def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    MRE, MAE = 0, 0
    for i in range(k):
        a, b, c, d = get_k_fold_data(k, i, X_train, y_train)
        
        net = get_model(X_train.shape[1])
        train_ls, valid_ls = train_model(net, a, b, c, d, num_epochs, learning_rate, weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        
        a,b = pred(net, d, c)

        MRE += a
        MAE += b
        if i==0:
            figsize = (10, 5)
            fig = plt.figure(figsize=figsize)
            print(train_ls)
            print(valid_ls)
            plt.plot(train_ls, color='red', label='train')
            plt.plot(valid_ls, color='green', label='vaild')
            plt.legend(loc='best')
            plt.xlabel('epochs')
            plt.ylabel('loss')
            plt.show()

#         if i == 0:
#             d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
#                          range(1, num_epochs + 1), valid_ls,
#                          ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k, MRE/k, MAE/k

# 模型选择
k, num_epochs, lr, weight_decay, batch_size = 5, 300, 0.05, 0, 64
train_l, valid_l, m1, m2 = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)

print('%d-fold validation: avg train rmse %f, avg valid rmse %f, avg valid MRE %f, avg valid MAE %f' % (k, train_l, valid_l, m1, m2))



%matplotlib inline
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import sys
sys.path.append("/home/kesci/input")
import d2lzh1981 as d2l
print(torch.__version__)
torch.set_default_tensor_type(torch.FloatTensor)


test_data = pd.read_csv("/home/kesci/input/houseprices2807/house-prices-advanced-regression-techniques/test.csv")
train_data = pd.read_csv("/home/kesci/input/houseprices2807/house-prices-advanced-regression-techniques/train.csv")

all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# 数据预处理
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# 标准化后，每个数值特征的均值变为0，所以可以直接用0来替换缺失值
all_features[numeric_features] = all_features[numeric_features].fillna(0)


# dummy_na=True将缺失值也当作合法的特征值并为其创建指示特征
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.shape

n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float).view(-1, 1)

# 训练模型
loss = torch.nn.MSELoss()

def get_net(feature_num):
    net = nn.Linear(feature_num, 1)
    for param in net.parameters():
        nn.init.normal_(param, mean=0, std=0.01)
    return net


def log_rmse(net, features, labels):
    with torch.no_grad():
        # 将小于1的值设成1，使得取对数时数值更稳定
        clipped_preds = torch.max(net(features), torch.tensor(1.0))
        rmse = torch.sqrt(2 * loss(clipped_preds.log(), labels.log()).mean())
    return rmse.item()


def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    # 这里使用了Adam优化算法
    optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate, weight_decay=weight_decay) 
    net = net.float()
    for epoch in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X.float()), y.float())
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls


# K折交叉验证
def get_k_fold_data(k, i, X, y):
    # 返回第i折交叉验证时所需要的训练和验证数据
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat((X_train, X_part), dim=0)
            y_train = torch.cat((y_train, y_part), dim=0)
    return X_train, y_train, X_valid, y_valid

def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net(X_train.shape[1])
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                         range(1, num_epochs + 1), valid_ls,
                         ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k

# 模型选择
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))


# 预测
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net(train_features.shape[1])
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).detach().numpy()
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('./submission.csv', index=False)
    # sample_submission_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")

train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)