In [1]:
import numpy as np
import torch
import torch.utils.data
import math
import torch.nn.functional as F

In [2]:
npdata = np.genfromtxt(
    open("./data/house-train.csv", "rb"), 
    delimiter=",",
    dtype='unicode'
)
npdata

array([['Id', 'MSSubClass', 'MSZoning', ..., 'SaleType', 'SaleCondition',
        'SalePrice'],
       ['1', '60', 'RL', ..., 'WD', 'Normal', '208500'],
       ['2', '20', 'RL', ..., 'WD', 'Normal', '181500'],
       ...,
       ['1458', '70', 'RL', ..., 'WD', 'Normal', '266500'],
       ['1459', '20', 'RL', ..., 'WD', 'Normal', '142125'],
       ['1460', '20', 'RL', ..., 'WD', 'Normal', '147500']], dtype='<U13')

In [3]:
npdata[0]

array(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'Pav

In [4]:
npdata[0, [4, 38, 46, 62, 12, 15]]

array(['LotArea', 'TotalBsmtSF', 'GrLivArea', 'GarageArea',
       'Neighborhood', 'BldgType'], dtype='<U13')

In [5]:
npdata[0, [80]]

array(['SalePrice'], dtype='<U13')

In [6]:
neighborhoods = list(set(npdata[1:, 12]))
building_type = list(set(npdata[1:, 15]))

In [7]:
class HouseDataset(torch.utils.data.Dataset):
    
    def __init__(self):
        npdata = np.genfromtxt(
            open("./data/house-train.csv", "rb"), 
            delimiter=",",
            dtype='unicode'
        )
        
        np_inputs = npdata[1:, [4, 38, 46, 62, 12, 15]]
        
        for row in np_inputs:
            row[4] = neighborhoods.index(row[4])
            row[5] = building_type.index(row[5])
        
        np_inputs = np_inputs.astype(np.float32)
        
        np_outputs = npdata[1:, [80]].astype(np.float32)
        
        self.inputs = torch.from_numpy(np_inputs)
        self.outputs = torch.from_numpy(np_outputs)
        
    def __len__(self):
        return len(self.inputs)
        
    def __getitem__(self, idx):
        return (self.inputs[idx], self.outputs[idx])
        
        

In [8]:
dataset = HouseDataset()

In [9]:
dataset.__getitem__(0)

(tensor([8.4500e+03, 8.5600e+02, 1.7100e+03, 5.4800e+02, 2.1000e+01, 3.0000e+00]),
 tensor([208500.]))

In [10]:
train_size = math.floor(len(dataset) * 0.8)

In [11]:
val_size = len(dataset) - train_size

In [12]:
train_data, val_data = torch.utils.data.random_split(
    dataset, 
    [train_size, val_size]
)

In [13]:
len(train_data)

1168

In [14]:
len(val_data)

292

In [15]:
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=128,
    shuffle=True
)

In [16]:
val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size=128,
    shuffle=True
)

In [44]:
class Net(torch.nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        
        self.n_emb = torch.nn.Embedding(len(neighborhoods), 5)
        self.bt_emb = torch.nn.Embedding(len(building_type), 3)
        
        # 4 + 5 + 3 = 12
        
        self.layer1 = torch.nn.Linear(12, 16)
        self.layer2 = torch.nn.Linear(16, 1)
    
    def forward(self, x):
        
        neigh = x[:, 4].long()
        bt = x[:, 5].long()
        
        e_neigh = self.n_emb(neigh)
        e_bt = self.bt_emb(bt)
        
        all_inputs = torch.cat((x[:, [0, 1, 2, 3]], e_neigh, e_bt), dim=1)
        
        x = self.layer1(all_inputs)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        x = self.layer2(x)
        
        return x
        

In [45]:
net = Net()

In [55]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01, weight_decay=0.01)

In [56]:
total_pct_error = 0
total_count = 0

net.train()

for i in range(250):
    for j, data in enumerate(train_loader, 0):
        
        inputs = data[0]
        outputs = data[1]
        
        net.zero_grad()
        net_output = net(inputs)
        
        diff = (net_output - outputs).abs()
        pct_error = (diff / outputs * 100).mean().item()
        total_pct_error += (pct_error * len(inputs))
        total_count += len(inputs)
        
        loss = criterion(net_output, outputs)
        loss.backward()
        optimizer.step()
        
print("%.1f%%" % (total_pct_error / total_count))

19.9%


In [57]:
val_pct_error = 0
val_count = 0

net.eval()

for i, data in enumerate(val_loader, 0):
    inputs = data[0]
    prices = data[1]

    net_output = net(inputs)
    diff = (net_output - prices).abs()
    pct_diff = (diff / prices * 100).mean().item()

    val_pct_error += (pct_diff * len(inputs))
    val_count += len(inputs)

print("%0.1f%%" % (val_pct_error / val_count))

13.0%


In [46]:
torch.save({
    'model': net.state_dict(),
    'optimizer': optimizer.state_dict()
}, './house_prices.pt')