In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


class TabularDataset(Dataset):
    def __init__(self, data, cat_cols=None, output_col=None):
 

        self.n = data.shape[0]

        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            self.y =  np.zeros((self.n, 1))

        self.cat_cols = cat_cols if cat_cols else []
        self.cont_cols = [col for col in data.columns
                          if col not in self.cat_cols + [output_col]]

        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1))

        if self.cat_cols:
            self.cat_X = data[cat_cols].astype(np.int64).values
        else:
            self.cat_X =  np.zeros((self.n, 1))

    def __len__(self):
    """
    Denotes the total number of samples.
    """
        return self.n

    def __getitem__(self, idx):
    """
    Generates one sample of data.
    """
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]


class FeedForwardNN(nn.Module):

    def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
               output_size, emb_dropout, lin_layer_dropouts):

    """
    Parameters
    ----------
    emb_dims: List of two element tuples
      This list will contain a two element tuple for each
      categorical feature. The first element of a tuple will
      denote the number of unique values of the categorical
      feature. The second element will denote the embedding
      dimension to be used for that feature.
    no_of_cont: Integer
      The number of continuous features in the data.
    lin_layer_sizes: List of integers.
      The size of each linear layer. The length will be equal
      to the total number
      of linear layers in the network.
    output_size: Integer
      The size of the final output.
    emb_dropout: Float
      The dropout to be used after the embedding layers.
    lin_layer_dropouts: List of floats
      The dropouts to be used after each linear layer.
    """

        super().__init__()

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                         for x, y in emb_dims])

        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont

        # Linear Layers
        first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
                                    lin_layer_sizes[0])

        self.lin_layers =\
         nn.ModuleList([first_lin_layer] +\
              [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
               for i in range(len(lin_layer_sizes) - 1)])

        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)

        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1],
                                      output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                        for size in lin_layer_sizes])

        # Dropout Layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.droput_layers = nn.ModuleList([nn.Dropout(size)
                                      for size in lin_layer_dropouts])

    def forward(self, cont_data, cat_data):

        if self.no_of_embs != 0:
            x = [emb_layer(cat_data[:, i])
               for i,emb_layer in enumerate(self.emb_layers)]
            x = torch.cat(x, 1)
            x = self.emb_dropout_layer(x)

        if self.no_of_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)

            if self.no_of_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1) 
            else:
                x = normalized_cont_data

        for lin_layer, dropout_layer, bn_layer in\
            zip(self.lin_layers, self.droput_layers, self.bn_layers):

            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)

        x = self.output_layer(x)

        return x

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data['date_in'] = pd.to_datetime(data['date_in'])
data['year'] = data['date_in'].map(lambda x: x.strftime('%Y'))
data['month'] = data['date_in'].map(lambda x: x.strftime('%m'))
data['day'] = data['date_in'].map(lambda x: x.strftime('%d'))

In [4]:
dr = ['date_in','house_pk']

data = data.drop(dr,axis = 1)
data.head().T

Unnamed: 0,0,1,2,3,4
agency_id,90.0,90.0,90.0,90.0,90.0
price,532.0,588.0,588.0,588.0,588.0
dis_water_real,0.261,0.261,0.261,0.261,0.261
dis_shopping,3.0,3.0,3.0,3.0,3.0
no_bedrooms,3.0,3.0,3.0,3.0,3.0
max_persons,4.0,4.0,4.0,4.0,4.0
house_size,140.0,140.0,140.0,140.0,140.0
land_size,726.0,726.0,726.0,726.0,726.0
build_year,1953.0,1953.0,1953.0,1953.0,1953.0
renovation_year,2014.0,2014.0,2014.0,2014.0,2014.0


In [5]:
categorical_features = ['agency_id', 'apartment', 'indoor_pool', 'spa', 'internet', 'pets_allowed', 'water_view', 'fire_stove', 'year', 'month', 'day']
output_columns = 'price'

In [6]:
data.shape

(85195, 21)

In [7]:
label_encoders = {}
for cat_col in categorical_features:
        label_encoders[cat_col] = LabelEncoder()
        data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])

In [8]:
dataset = TabularDataset(data=data, cat_cols=categorical_features, output_col=output_columns)

In [9]:
batchsize = 64
dataloader = DataLoader(dataset, batchsize, shuffle=True)

In [10]:
cat_dims = [int(data[col].nunique()) for col in categorical_features]
cat_dims

[4, 2, 2, 2, 2, 2, 2, 2, 4, 12, 31]

In [11]:
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
emb_dims

[(4, 2),
 (2, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (4, 2),
 (12, 6),
 (31, 16)]

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
model = FeedForwardNN(emb_dims, no_of_cont=9, lin_layer_sizes=[50, 100],
                          output_size=1, emb_dropout=0.04,
                          lin_layer_dropouts=[0.001,0.01]).to(device)

In [14]:
print(model)

FeedForwardNN(
  (emb_layers): ModuleList(
    (0): Embedding(4, 2)
    (1): Embedding(2, 1)
    (2): Embedding(2, 1)
    (3): Embedding(2, 1)
    (4): Embedding(2, 1)
    (5): Embedding(2, 1)
    (6): Embedding(2, 1)
    (7): Embedding(2, 1)
    (8): Embedding(4, 2)
    (9): Embedding(12, 6)
    (10): Embedding(31, 16)
  )
  (lin_layers): ModuleList(
    (0): Linear(in_features=42, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=100, bias=True)
  )
  (output_layer): Linear(in_features=100, out_features=1, bias=True)
  (first_bn_layer): BatchNorm1d(9, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_layers): ModuleList(
    (0): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (emb_dropout_layer): Dropout(p=0.04)
  (droput_layers): ModuleList(
    (0): Dropout(p=0.001)
    (1): Dropout(p=0.01)
  )
)


In [15]:
epochs = 5

criterion = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr = 0.1)

for epoch in range(epochs):
    
    for y, cont_x, cat_x in dataloader:
        
        optimizer.zero_grad()
        
        cont_x = cont_x.to(device)
        cat_x = cat_x.to(device)
        y = y.to(device)
        
        output = model(cont_x, cat_x)
        
        loss = criterion(output, y)
        
        print('loss: {:.3f}'.format(loss.item()))
        
        loss.backward()
        
        optimizer.step()

loss: 372733.125
loss: 414318.125
loss: 450339.312
loss: 371294.000
loss: 298650.031
loss: 380865.938
loss: 413957.156
loss: 428347.750
loss: 339070.875
loss: 311659.562
loss: 294935.469
loss: 200001.344
loss: 235193.969
loss: 152916.484
loss: 143250.859
loss: 217440.094
loss: 156203.453
loss: 130534.094
loss: 132738.344
loss: 56379.609
loss: 81454.188
loss: 39026.027
loss: 76324.656
loss: 22570.779
loss: 33659.035
loss: 56022.523
loss: 30564.424
loss: 39445.141
loss: 48782.688
loss: 33481.246
loss: 49696.645
loss: 64311.863
loss: 48971.883
loss: 41842.715
loss: 31264.102
loss: 34848.176
loss: 25339.396
loss: 31726.633
loss: 21478.301
loss: 17477.660
loss: 40352.160
loss: 25923.250
loss: 31710.609
loss: 42272.734
loss: 14377.985
loss: 31015.713
loss: 27594.117
loss: 29184.539
loss: 56441.098
loss: 35275.109
loss: 57423.547
loss: 36998.004
loss: 57168.781
loss: 25049.277
loss: 35334.617
loss: 47338.949
loss: 31146.760
loss: 22143.041
loss: 20933.846
loss: 26805.371
loss: 44760.211
loss:

KeyboardInterrupt: 