In [1]:
import torch
import torchvision
import numpy as np
from torch import nn,optim
import pandas as pd

In [2]:
train_data=pd.read_csv(r'F:\study\ml\DataSet\House_Prices\train.csv')
test_data=pd.read_csv(r'F:\study\ml\DataSet\House_Prices\test.csv')

pre-processing data

In [3]:
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))

In [4]:
numeric_features=all_features.dtypes[all_features.dtypes !='object'].index
numeric_features

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [5]:
all_features[numeric_features]=all_features[numeric_features].apply(
lambda x: (x-x.mean()) / x.std())

In [6]:
all_features.isnull().sum()

MSSubClass          0
MSZoning            4
LotFrontage       486
LotArea             0
Street              0
Alley            2721
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         24
MasVnrArea         23
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           81
                 ... 
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         1
TotRmsAbvGrd        0
Functional          2
Fireplaces          0
FireplaceQu      1420
GarageType        157
GarageYrBlt       159
GarageFinish      159
GarageCars          1
GarageArea          1
GarageQual        159
GarageCond

In [7]:
all_features=all_features.fillna(0)

In [8]:
all_features.shape

(2919, 79)

In [9]:
all_features=pd.get_dummies(all_features,dummy_na=True)

In [10]:
all_features.shape

(2919, 354)

dataframe -> tensor

In [11]:
n_train=train_data.shape[0]
train_features=torch.Tensor(all_features.iloc[:n_train,:].values)
test_features=torch.Tensor(all_features.iloc[n_train:,:].values)
train_labels=torch.Tensor(train_data.SalePrice.values).view(-1,1)

In [12]:
train_features.shape

torch.Size([1460, 354])

In [13]:
test_features.shape

torch.Size([1459, 354])

In [14]:
train_labels.shape

torch.Size([1460, 1])

define net and func

In [15]:
loss=nn.MSELoss()

In [16]:
def get_net(feature_num):
    net=nn.Linear(feature_num,1)
    for p in net.parameters():
        nn.init.normal_(p,mean=0,std=0.01)
    return net
    

In [38]:
def log_rmse(net,features,labels):
    with torch.no_grad():
        clipped_pred=torch.max(net(features),torch.Tensor([1.]))
#         print('clip_prede ',clipped_pred.shape)
#         print('labels ',labels.shape)
        rmse=torch.sqrt(2*(loss(clipped_pred.log(),labels.log())).mean())
    return rmse.item()

In [49]:
def train(net,train_features,train_labels,test_features,test_labels,
         num_epochs,lr,weight_decay,batch_size):
#     print('--------------train--------------')
#     print('train_features shape',train_features.shape)
#     print('train_labels shape',train_labels.shape)
#     print('test_features shape',test_features.shape)
#     print('test_labels shape',test_labels.shape)
#     print('---------------train-------------')
    train_l,test_l=[],[]
    dataset=torch.utils.data.TensorDataset(train_features,train_labels)
    train_iter=torch.utils.data.DataLoader(dataset,batch_size,shuffle=True)
    optimizer=optim.Adam(net.parameters(),lr,weight_decay=weight_decay)
#     net=net.float()
    for i in range(num_epochs):
        for x,y in train_iter:
#             l=loss(net(x).float(),y.float())
            l=loss(net(x),y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        train_l.append(log_rmse(net,train_features,train_labels))
        if test_labels is not None:
            test_l.append(log_rmse(net,test_features,test_labels))
    return train_l,test_l
            

In [50]:
def get_k_fold_data(k,i,x,y):
    assert k >1
    fold_size=x.shape[0]//k
    x_train,y_train=None,None
    for j in range(k):
        idx=slice(j*fold_size,(j+1)*fold_size)
        x_part,y_part=x[idx,:],y[idx]
        
        if j==i:
            x_valid,y_valid=x_part,y_part
        elif x_train is None:
            x_train,y_train=x_part,y_part
        else:
            x_train=torch.cat((x_train,x_part))
            y_train=torch.cat((y_train,y_part))
#     print('----------------------------')
#     print('x_train shape',x_train.shape)
#     print('y_train shape',y_train.shape)
#     print('x_valid shape',x_valid.shape)
#     print('y_valid shape',y_valid.shape)
#     print('----------------------------')
    return x_train,y_train,x_valid,y_valid


In [51]:
def k_fold(k,x_train,y_train,num_epochs,lr,weight_decay,batch_size):
    train_l_sum,valid_l_sum=0,0
    for i in range(k):
        data=get_k_fold_data(k,i,x_train,y_train)
        net=get_net(x_train.shape[1])
        train_l,valid_l=train(net,*data,num_epochs,lr,weight_decay,batch_size)
        train_l_sum+=train_l[-1]
        valid_l_sum+=valid_l[-1]
        
        print('fold %d,train rmse %f,valid %f' % (
        i+1,train_l[-1],valid_l[-1]))
    return train_l_sum,valid_l_sum
        
        

In [52]:
k,num_epochs,lr,weight_decay,batch_size=5,100,5,0,64

In [53]:
train_l_sum,valid_l_sum=k_fold(k,train_features,train_labels,num_epochs,
                          lr,weight_decay,batch_size)
print('%d fold validation : avg train rmse %f, avg valid rmse %f' % (
k,train_l_sum/k,valid_l_sum/k))

fold 1,train rmse 0.240412,valid 0.221117
fold 2,train rmse 0.228891,valid 0.266901
fold 3,train rmse 0.231674,valid 0.237498
fold 4,train rmse 0.238200,valid 0.219113
fold 5,train rmse 0.231010,valid 0.258768
5 fold validation : avg train rmse 0.234037, avg valid rmse 0.240679


In [54]:
def train_and_pred(train_features,test_features,train_labels,test_data,
                  num_epochs,lr,weight_decay,batch_size):
    net=get_net(train_features.shape[1])
    train_ls,_ = train(net,train_features,train_labels,None,None,
                      num_epochs,lr,weight_decay,batch_size)
    print('train rmse  %f ' % (train_ls[-1]))
    preds=net(test_features).detach().numpy()
    print('pred shape :',preds.shape)
    test_data['SalePrice']=pd.Series(preds.reshape(1,-1)[0])
    submission=pd.concat([test_data['Id'],test_data['SalePrice']],axis=1)
    submission.to_csv(r'F:\study\ml\DataSet\HousePrice\submission.csv',index=False)

In [55]:
train_and_pred(train_features, test_features, train_labels,test_data, num_epochs, lr, weight_decay, batch_size)

train rmse  0.229397 
pred shape : (1459, 1)


In [61]:
pd.Series(np.array([1,2,3,4,5,6]).reshape(1,-1)[0])

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int32

In [62]:
a=np.array([1,2,3,4,5,6]).reshape(-1,1)
a

array([[1],
       [2],
       [3],
       [4],
       [5],
       [6]])

In [63]:
a.flatten()

array([1, 2, 3, 4, 5, 6])