In [1]:
import torch
import numpy as np
import pandas as pd
from torch import nn,optim
import matplotlib.pyplot as plt
import torchvision

In [2]:
train_data=pd.read_csv(r'F:\study\ml\DataSet\House_Prices\train.csv')
test_data=pd.read_csv(r'F:\study\ml\DataSet\House_Prices\test.csv')

In [3]:
train_data.shape

(1460, 81)

In [4]:
train_data.iloc[0:4,[0,1,2,3,4,-3,-2,-1]]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,WD,Normal,208500
1,2,20,RL,80.0,9600,WD,Normal,181500
2,3,60,RL,68.0,11250,WD,Normal,223500
3,4,70,RL,60.0,9550,WD,Abnorml,140000


In [5]:
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))

In [6]:
test_data.shape

(1459, 80)

In [7]:
all_features.shape

(2919, 79)

In [10]:
numeric_features=all_features.dtypes[all_features.dtypes != 'object'].index

In [11]:
len(numeric_features)

36

In [12]:
all_features[numeric_features]=all_features[numeric_features].apply(
lambda x: (x-x.mean())/x.std())

In [13]:
all_features=all_features.fillna(0)

In [14]:
all_features.shape

(2919, 79)

In [15]:
all_features=pd.get_dummies(all_features,dummy_na=True)

In [16]:
all_features.shape

(2919, 354)

In [22]:
n_train=train_data.shape[0]
train_features=torch.Tensor(all_features[:n_train].values)
test_features=torch.Tensor(all_features[n_train:].values)
train_labels=torch.Tensor(train_data.SalePrice.values).view(-1,1)

In [18]:
loss=torch.nn.MSELoss()

In [20]:
def get_net(feature_num):
    net=nn.Linear(feature_num,1)
    for p in net.parameters():
        nn.init.normal_(p,mean=0,std=0.01)
    return net

In [21]:
def log_rmse(net,features,labels):
    with torch.no_grad():
        clipped_preds=torch.max(net(features),torch.Tensor([1.0]))
        rmse=torch.sqrt(2*loss(clipped_preds.log(),labels.log()).mean())
    return rmse.item()

In [44]:
def train(net,train_features,train_labels,test_features,test_labels,
        num_epochs,learning_rate,weight_decay,batch_size ):
    train_ls,test_ls=[],[]
    dataset=torch.utils.data.TensorDataset(train_features,train_labels)
    train_iter=torch.utils.data.DataLoader(dataset,batch_size,shuffle=True)
    
    optimizer=torch.optim.Adam(net.parameters(),lr=learning_rate,weight_decay=weight_decay)
    net=net.float()
    for epoch in range(num_epochs):
        for x,y in train_iter:
            l=loss(net(x.float()),y.float())
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net,train_features,train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net,test_features,test_labels))
    return train_ls,test_ls
    

In [45]:
def get_k_fold_data(k,i,x,y):
    assert k>1
    fold_size=x.shape[0]//k
    x_train,y_train=None,None
    for j in range(k):
        idx=slice(j*fold_size,(j+1) * fold_size)
        x_part,y_part=x[idx,:],y[idx]
        if j==i:
            x_valid,y_valid=x_part,y_part
        elif x_train is None:
            x_train,y_train=x_part,y_part
        else:
            x_train=torch.cat((x_train,x_part),dim=0)
            y_train=torch.cat((y_train,y_part),dim=0)
    return x_train,y_train,x_valid,y_valid

In [46]:
def k_fold(k,x_train,y_train,num_epochs,learning_rate,weight_decay,batch_size):
    train_l_sum,valid_l_sum=0,0
    for i in range(k):
        data=get_k_fold_data(k,i,x_train,y_train)
        net=get_net(x_train.shape[1])
        train_ls,valid_ls=train(net,*data,num_epochs,learning_rate,
                               weight_decay,batch_size)
        train_l_sum+=train_ls[-1]
        valid_l_sum+=valid_ls[-1]
        print('fold %d,train rmse %f,valid rmse %f' % (i,train_ls[-1],valid_ls[-1]))
    return train_l_sum/k,valid_l_sum/k
        
            

In [48]:
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' %
      (k, train_l, valid_l))

fold 0,train rmse 0.239668,valid rmse 0.221256
fold 1,train rmse 0.229752,valid rmse 0.268001
fold 2,train rmse 0.231911,valid rmse 0.238269
fold 3,train rmse 0.237015,valid rmse 0.218228
fold 4,train rmse 0.230332,valid rmse 0.258358
5-fold validation: avg train rmse 0.233736, avg valid rmse 0.240822


In [None]:
def train_and_pred(train_features,test_features,train_labels,test_data,
                  num_epochs,lr,weight_decay,batch_size):
    net=get_net(train_features.shape[1])
    train_ls,_ = train(net,train_features,train_labels,None,None,num_epochs,
                      lr,weight_decay,batch_size)
    print('train rmse %f' % train_ls[-1])
    preds=net(test_features).detach().numpy()
    test_data['']