In [0]:
import torch
import pandas as pd

In [0]:
class OneHotEncoder():
  
  def __init__(self,series):
    unique_values = series.unique()
    self.ordinals = {c:i for i, c in enumerate(unique_values)}
    self.encoder = torch.eye(len(unique_values),len(unique_values))
    
  def __getitem__(self,value):
    return self.encoder[self.ordinals[value]]

In [3]:
look = pd.read_csv('./kc_house_data.csv')
look.iloc[0]

id                    7129300520
date             20141013T000000
price                     221900
bedrooms                       3
bathrooms                      1
sqft_living                 1180
sqft_lot                    5650
floors                         1
waterfront                     0
view                           0
condition                      3
grade                          7
sqft_above                  1180
sqft_basement                  0
yr_built                    1955
yr_renovated                   0
zipcode                    98178
lat                      47.5112
long                    -122.257
sqft_living15               1340
sqft_lot15                  5650
Name: 0, dtype: object

In [0]:
categorical = ['waterfront','view','condition','grade']

In [0]:
discard = ['id']

In [0]:
import dateutil

In [0]:
class DateEncoder():
  
  def __getitem__(self,datestring):
    parsed = dateutil.parser.parse(datestring)
    return torch.Tensor([parsed.year,parsed.month,parsed.day])

In [14]:
dates = ['date']
DateEncoder()['20141013T000000']

tensor([2014.,   10.,   13.])

In [0]:
from torch.utils.data import Dataset

In [0]:
class MixedCSV(Dataset):
  def __init__(self,datafile,output_series_name,date_series_names,categorical_series_names,ignore_series_names):
    self.dataset = pd.read_csv('./kc_house_data.csv')
    self.output_series_name = output_series_name
    self.encoders = {}
    for series_name in date_series_names:
      self.encoders[series_name] = DateEncoder()
    for series_name in categorical_series_names:
      self.encoders[series_name] = OneHotEncoder(self.dataset[series_name])
    self.ignore = ignore_series_names
   
  def __len__(self):
    return len(self.dataset)
   
  def __getitem__(self,idx):
    if type(idx) == torch.Tensor:
      idx = idx.item()
    sample = self.dataset.iloc[idx]
    output = torch.Tensor([sample[self.output_series_name]])
    
    input_components = []
    for name,value in sample.items():
      if name in self.ignore:
        continue
      elif name in self.encoders:
        input_components.append(self.encoders[name][value])
      else :
        input_components.append(torch.Tensor([value]))
     
    input = torch.cat(input_components)
    return input,output
  
      

In [25]:
houses = MixedCSV('./kc_house_data.csv','price',dates,categorical,discard)
houses[0]

(tensor([ 2.0140e+03,  1.0000e+01,  1.3000e+01,  2.2190e+05,  3.0000e+00,
          1.0000e+00,  1.1800e+03,  5.6500e+03,  1.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.1800e+03,  0.0000e+00,
          1.9550e+03,  0.0000e+00,  9.8178e+04,  4.7511e+01, -1.2226e+02,
          1.3400e+03,  5.6500e+03]), tensor([221900.]))

In [0]:
class Model(torch.nn.Module):
  
  def __init__(self,input_dimensions,size = 128):
    super().__init__()
    self.layer_one = torch.nn.Linear(input_dimensions,size)
    self.activation_one = torch.nn.ReLU()
    self.layer_two = torch.nn.Linear(size,size)
    self.activation_two = torch.nn.ReLU()
    self.shape_outputs = torch.nn.Linear(size,1)
    
  def forward(self,inputs):
    x = self.layer_one(inputs)
    x = self.activation_one(x)
    x = self.layer_two(x)
    x = self.activation_two(x)
    x = self.shape_outputs(x)
    return x

In [0]:
model = Model(houses[0][0].shape[0])
optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.MSELoss()

In [36]:
number_for_testing = int(len(houses)* 0.05)
number_for_training = len(houses) - number_for_testing
train,test = torch.utils.data.random_split(houses,[number_for_training,number_for_testing])
training = torch.utils.data.DataLoader(train,batch_size = 32,shuffle = True)
for epoch in range(16):
  for inputs,outputs in training:
    optimizer.zero_grad()
    results = model(inputs)
    loss = loss_function(results,outputs)
    loss.backward()
    optimizer.step()
  print('loss : {0}'.format(loss))

loss : 860235.9375
loss : 1674349.125
loss : 194048.671875
loss : 100432.5
loss : 952169.0
loss : 8119.04736328125
loss : 182728.25
loss : 2799585.75
loss : 928227.875
loss : 232133.15625
loss : 3273.12451171875
loss : 359160.1875
loss : 6005.0322265625
loss : 224687.90625
loss : 47495.67578125
loss : 684945.8125


In [38]:
actual = test[0][1]
predicted = model(test[0][0])
actual,predicted

(tensor([615000.]), tensor([614755.5625], grad_fn=<AddBackward0>))

In [0]:
import sklearn.metrics
import torch.utils.data


In [41]:
testing = torch.utils.data.DataLoader(test,batch_size = len(test),shuffle = False)
for inputs,outputs in testing:
  predicted = model(inputs).detach().numpy()
  actual = outputs.numpy()
  print(sklearn.metrics.r2_score(actual,predicted))

0.9999977580759927
