In [1]:
import torch
import torchvision
import numpy as np
from torch import nn,optim
import pandas as pd

In [2]:
train_data=pd.read_csv(r'F:\study\ml\DataSet\House_Prices\train.csv')
test_data=pd.read_csv(r'F:\study\ml\DataSet\House_Prices\test.csv')

In [3]:
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))

In [9]:
numeric_feature=all_features.dtypes[all_features.dtypes != 'object'].index
numeric_feature

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [10]:
all_features[numeric_feature]=all_features[numeric_feature].apply(lambda x : (x-x.mean())/x.std())

In [14]:
all_features=all_features.fillna(0)

In [15]:
all_features.shape

(2919, 79)

In [16]:
all_features=pd.get_dummies(all_features,dummy_na=True)

In [17]:
all_features.shape

(2919, 354)

In [33]:
n_train=train_data.shape[0]
train_features=torch.Tensor(all_features.iloc[:n_train,:].values)
test_features=torch.Tensor(all_features.iloc[n_train:,:].values)
train_labels=torch.Tensor(train_data.SalePrice.values).view(-1,1)

In [18]:
loss=nn.MSELoss()

In [19]:
def get_net(feature_num):
    net=nn.Linear(feature_num,1)
    return net

In [21]:
def log_rmse(net,features,labels):
    with torch.no_grad():
        clipped_pred=torch.max(net(features),torch.Tensor([1]))
        rmse=torch.sqrt(2*(loss(clipped_pred.log(),labels.log()).mean())).item()
    return rmse
    

In [22]:
def train(net,train_features,train_labels,test_features,test_labels,num_epochs,
         lr,weight_decay,batch_size):
    train_l,test_l=[],[]
    dataset=torch.utils.data.TensorDataset(train_features,train_labels)
    train_iter=torch.utils.data.DataLoader(dataset,batch_size,shuffle=True)
    optimizer=optim.Adam(net.parameters(),lr,weight_decay=weight_decay)
    
    for i in range(num_epochs):
        for x,y in train_iter:
            l=loss(net(x),y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        train_l.append(log_rmse(net,train_features,train_labels))
        if test_labels is not None:
            test_l.append(log_rmse(net,test_features,test_labels))
    return train_l,test_l
            
            

In [38]:
def get_k_fold_data(k,i,x,y):
    assert k>1
    fold_size=x.shape[0]//k
    row_list=list(range(x.shape[0]))
    np.random.shuffle(row_list)
    x_train,y_train=None,None
    for j in range(k):
        idx=slice(j*fold_size,(j+1)*fold_size)
        x_part,y_part=x[row_list[idx],:],y[row_list[idx]]
        
        if j==i:
            x_valid,y_valid=x_part,y_part
        elif x_train is None:
            x_train,y_train=x_part,y_part
        else:
            x_train=torch.cat((x_train,x_part))
            y_train=torch.cat((y_train,y_part))
    return x_train,y_train,x_valid,y_valid

In [39]:
def k_fold(k,x_train,y_train,num_epochs,lr,weight_decay,batch_size):
    train_l_sum,valid_l_sum=0,0
    for i in range(k):
        data=get_k_fold_data(k,i,x_train,y_train)
        net=get_net(x_train.shape[1])
        train_l,valid_l=train(net,*data,num_epochs,lr,weight_decay,batch_size)
        train_l_sum+=train_l[-1]
        valid_l_sum+=valid_l[-1]
        
        print('fold %d ,train rmse %f,valid %f ' % (
        i+1,train_l[-1],valid_l[-1]))
    return train_l_sum,valid_l_sum

In [40]:
k,num_epochs,lr,weight_decay,batch_size=5,100,5,0,64

In [41]:
train_l_sum,valid_l_sum=k_fold(k,train_features,train_labels,num_epochs,
                          lr,weight_decay,batch_size)
print('%d fold validation : avg train rmse %f, avg valid rmse %f' % (
k,train_l_sum/k,valid_l_sum/k))

fold 1 ,train rmse 0.242905,valid 0.232301 
fold 2 ,train rmse 0.234325,valid 0.237820 
fold 3 ,train rmse 0.232058,valid 0.236673 
fold 4 ,train rmse 0.238686,valid 0.225906 
fold 5 ,train rmse 0.235760,valid 0.226049 
5 fold validation : avg train rmse 0.236747, avg valid rmse 0.231750
