In [3]:
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import matplotlib.pyplot as plt

In [8]:
class PreprocessData:
    def __init__(self, csv):
        houses = pd.read_csv(csv)
        self.training_set = houses.sample(frac=0.75)
        self.validate_set = houses.sample(frac=0.25)

In [6]:
class HousePriceDataset(Dataset):
    def __init__(self, df, exclude=[]):
        houses = df.drop(exclude, axis=1)
        self.houses = houses
    
    def __len__(self):
        return len(self.houses)
    
    def __getitem__(self, idx):
        return self.houses.iloc[idx, :].values
    
    def get_labels(self):
        return list(self.houses)

In [9]:
exclude_fields = ['date', 'id', 'zipcode', 'lat', 'long', 'condition']

data = PreprocessData('./train.csv')
data.training_set
data.validate_set

train_df = HousePriceDataset(data.training_set, exclude_fields)
validate_df = HousePriceDataset(data.validate_set, exclude_fields)

print(train_df.houses.shape)
print(validate_df.houses.shape)
train_generator = DataLoader( \
    train_df, batch_size=50, shuffle=True, num_workers=1)
validate_generator = DataLoader( \
    validate_df, batch_size=50, shuffle=True, num_workers=1)

(11276, 15)
(3759, 15)


In [10]:
class OneLayerNet(nn.Module):
    def __init__(self, D_in, D_out):
        super(OneLayerNet, self).__init__()
        self.linear1 = nn.Linear(D_in, D_out)

    def forward(self, x):
        y_pred = self.linear1(x)
        return y_pred

In [11]:
class TwoLayerNet(nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = nn.Linear(D_in, H)
        self.linear2 = nn.Linear(H, D_out)

    def forward(self, x):
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

In [12]:
class FourLayerNet(nn.Module):
    def __init__(self, D_in, H, D_out):
        super(FourLayerNet, self).__init__()
        self.linear1 = nn.Sequential(nn.Linear(D_in, H, bias=False), nn.BatchNorm1d(H))
        self.linear2 = nn.Sequential(nn.Linear(H, H, bias=False), nn.BatchNorm1d(H))
        self.linear3 = nn.Sequential(nn.Linear(H, H, bias=False), nn.BatchNorm1d(H))
        self.linear4 = nn.Linear(H, D_out)

    def forward(self, x):
        h_relu = self.linear1(x).clamp(min=0)
        h_relu2 = self.linear2(h_relu).clamp(min=0)
        h_relu3 = self.linear3(h_relu2).clamp(min=0)
        y_pred = self.linear4(h_relu3)
        return y_pred

In [13]:
def weight_init(m):
    '''
    Usage:
        model = Model()
        model.apply(weight_init)
    '''
    if isinstance(m, nn.Conv2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.zeros_(m.bias.data)
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.zeros_(m.bias.data)

In [14]:
D_in, H, D_out = 14,100,1

In [40]:
model = FourLayerNet(D_in, H, D_out)
model.apply(weight_init)
# criterion = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=0.1)

In [20]:
def calc_loss(model, loader):
    loss = 0.
    totals = 0
    for (data, target) in loader:
        totals += len(data)
        loss += F.mse_loss(model(data), target, reduction='sum').item()
    return loss/totals

In [41]:
for t in range(50):
    model.train()
    train_loss = 0
    train_totals = 0
    for _, data in enumerate(train_generator):
        x, y = data[:,1:].float(), data[:, 0].float()
        train_totals += len(data)
        loss = F.mse_loss(model(x), y, reduction="sum")
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss = train_loss/train_totals * 1e-6
    
    print("TRAIN " + str(t) + ": ", train_loss)
    
    model.eval()
    valid_loss = 0
    valid_totals = 0
    for _, data in enumerate(validate_generator):
        x, y = data[:,1:].float(), data[:, 0].float()
        valid_totals += len(data)
        loss = F.mse_loss(model(x), y, reduction="sum")
        valid_loss += loss.item()
    valid_loss = valid_loss/valid_totals * 1e-6
    
    print("VALIDATE " + str(t) + ": ", valid_loss)
        
    print("------------------------------------------------------------")


TRAIN 0:  19218865.689024977
VALIDATE 0:  15006766.80527422
------------------------------------------------------------
TRAIN 1:  10921074.11765779
VALIDATE 1:  7446403.889914211
------------------------------------------------------------
TRAIN 2:  6991562.016812316
VALIDATE 2:  6827187.076483644
------------------------------------------------------------
TRAIN 3:  6674345.683066778
VALIDATE 3:  6786339.643934647
------------------------------------------------------------
TRAIN 4:  6677652.575598518
VALIDATE 4:  6782960.210269778
------------------------------------------------------------
TRAIN 5:  6687353.805095685
VALIDATE 5:  6787023.587603136
------------------------------------------------------------
TRAIN 6:  6684042.995003664
VALIDATE 6:  6790653.494804022
------------------------------------------------------------
TRAIN 7:  6684641.893705467
VALIDATE 7:  6790719.460039542
------------------------------------------------------------
TRAIN 8:  6680581.994161083
VALIDATE 8:

In [23]:
for t in range(20):
    model.eval()
    for _, data in enumerate(validate_generator):
        x, y = data[:,1:].float(), data[:, 0].float()
        y_pred = model(x)
        loss = criterion(y_pred, y)*1e-6
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(t, loss.item())

0 46653.546875
1 114894.890625
2 194340.78125
3 84300.9609375
4 82674.2890625
5 50256.91796875
6 93193.2421875
7 52049.9609375
8 29042.482421875
9 46815.1640625
10 135198.390625
11 92232.90625
12 433154.9375
13 38341.39453125
14 507893.25
15 69008.5234375
16 78224.359375
17 835537.875
18 36253.546875
19 77298.671875
