In [23]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F
%matplotlib inline

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
house_pk,84561,84561,84561,84561,84561
agency_id,90,90,90,90,90
date_in,2016-08-27,2016-08-26,2016-08-25,2016-08-24,2016-08-23
price,532,588,588,588,588
dis_water_real,0.261,0.261,0.261,0.261,0.261
dis_shopping,3,3,3,3,3
no_bedrooms,3,3,3,3,3
max_persons,4,4,4,4,4
house_size,140,140,140,140,140
land_size,726,726,726,726,726


In [4]:
df['date_in'] = pd.to_datetime(df['date_in'])

In [5]:
df['year'] = df['date_in'].map(lambda x: x.strftime('%Y'))
df['month'] = df['date_in'].map(lambda x: x.strftime('%m'))
df['day'] = df['date_in'].map(lambda x: x.strftime('%d'))

In [7]:
dr = ['date_in','house_pk']

df = df.drop(dr,axis = 1)
df.head().T

Unnamed: 0,0,1,2,3,4
agency_id,90.0,90.0,90.0,90.0,90.0
price,532.0,588.0,588.0,588.0,588.0
dis_water_real,0.261,0.261,0.261,0.261,0.261
dis_shopping,3.0,3.0,3.0,3.0,3.0
no_bedrooms,3.0,3.0,3.0,3.0,3.0
max_persons,4.0,4.0,4.0,4.0,4.0
house_size,140.0,140.0,140.0,140.0,140.0
land_size,726.0,726.0,726.0,726.0,726.0
build_year,1953.0,1953.0,1953.0,1953.0,1953.0
renovation_year,2014.0,2014.0,2014.0,2014.0,2014.0


In [8]:
df.shape

(85195, 21)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85195 entries, 0 to 85194
Data columns (total 21 columns):
agency_id          85195 non-null int64
price              85195 non-null int64
dis_water_real     85195 non-null float64
dis_shopping       85195 non-null float64
no_bedrooms        85195 non-null int64
max_persons        85195 non-null int64
house_size         85195 non-null int64
land_size          85195 non-null int64
build_year         85195 non-null int64
renovation_year    85195 non-null int64
apartment          85195 non-null int64
indoor_pool        85195 non-null int64
spa                85195 non-null int64
internet           85195 non-null int64
pets_allowed       85195 non-null int64
water_view         85195 non-null int64
fire_stove         85195 non-null int64
agency_rating      85195 non-null float64
year               85195 non-null object
month              85195 non-null object
day                85195 non-null object
dtypes: float64(3), int64(15), object(3)
m

In [10]:
categorical_columns = ['agency_id', 'apartment', 'indoor_pool', 'spa', 'internet', 'pets_allowed', 'water_view', 'fire_stove', 'year', 'month', 'day']

In [11]:
output_columns = 'price'

In [12]:
from sklearn.preprocessing import LabelEncoder

In [14]:
label_encoders = {}

for cat_col in categorical_columns:
    label_encoders[cat_col] = LabelEncoder()
    df[cat_col] = label_encoders[cat_col].fit_transform(df[cat_col])

In [16]:
df.agency_id.unique()

array([0, 1, 3, 2], dtype=int64)

In [17]:
from torch.utils.data import Dataset, DataLoader

In [18]:
class TabularDataset(Dataset):
    def __init__(self, data, cat_cols = None, output_col=None):
        
        self.n = data.shape[0]
        
        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            self.y = np.zeros((self.n, 1))
            
        self.cat_cols = cat_cols if cat_cols else []
        
        self.cont_cols = [col for col in data.columns
                      if col not in self.cat_cols + [output_col]]
        
        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1))
            
        if self.cat_cols:
            self.cat_X = data[cat_cols].astype(np.int64).values
        else:
            self.cat_X = np.zeros((self.n, 1))
            
    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

In [20]:
dataset = TabularDataset(data=df, cat_cols=categorical_columns, output_col=output_columns)

In [24]:
class Test(nn.Module):
    
    def __init__(self, emb_dims, no_of_cont, lin_layer_size, output_size, emb_dropout, lin_layer_dropout):
        
        super(Test, self).__init__()
        
        self.emb_layers = nn.ModuleList([nn.Embedding(x,y)
                                        for x,y in emb_dims])
        
        no_of_embs = sum([y for x,y in emb_dims])
        
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont
        
        #linear layer
        
        first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont, lin_layer_size[0])
        
        self.lin_layers = nn.ModuleList([first_lin_layer] +
                                       [nn.Linear(lin_layer_size[i], lin_layer_size[i+1])
                                       for i in range(len(lin_layer_size) - 1)])
        
        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)
            
        #output layers
        
        self.output_layer = nn.Linear(lin_layer_size[-1], output_size)
        
        nn.init.kaiming_normal_(self.output_layer.weight.data)
        
        #batch norm layers
        
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                    for size in lin_layer_size])

        # Dropout Layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.droput_layers = nn.ModuleList([nn.Dropout(size)
                                  for size in lin_layer_dropout])
        
    def forward(self, cont_data, cat_data):
        
        if self.no_of_embs != 0:
            x = [emb_layers(cat_data[:,i])
                 for i, emb_layers in enumerate(self.emb_layers)]
            x = torch.cat(x,1)
            x = self.emb_dropout_layer(x)
            
        if self.no_of_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)
            
            if self.no_of_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1)
                
            else:
                x = normalized_cont_data
                
        for lin_layer, droput_layer, bn_layer in zip(self.lin_layers, self.droput_layers, self.bn_layers):
            
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = droput_layer(x)
            
        x = self.output_layer(x)
        
        return x

In [25]:
batchsize = 64
dataloader = DataLoader(dataset, batchsize, shuffle=True, num_workers=1)

In [27]:
cat_dims = [int(df[col].nunique()) for col in categorical_columns]
cat_dims

[4, 2, 2, 2, 2, 2, 2, 2, 4, 12, 31]

In [28]:
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
emb_dims

[(4, 2),
 (2, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (2, 1),
 (4, 2),
 (12, 6),
 (31, 16)]

In [30]:
model = Test(emb_dims, no_of_cont=9, lin_layer_size=[50, 100],
                          output_size=1, emb_dropout=0.04,
                          lin_layer_dropout=[0.001,0.01])

In [31]:
print(model)

Test(
  (emb_layers): ModuleList(
    (0): Embedding(4, 2)
    (1): Embedding(2, 1)
    (2): Embedding(2, 1)
    (3): Embedding(2, 1)
    (4): Embedding(2, 1)
    (5): Embedding(2, 1)
    (6): Embedding(2, 1)
    (7): Embedding(2, 1)
    (8): Embedding(4, 2)
    (9): Embedding(12, 6)
    (10): Embedding(31, 16)
  )
  (lin_layers): ModuleList(
    (0): Linear(in_features=42, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=100, bias=True)
  )
  (output_layer): Linear(in_features=100, out_features=1, bias=True)
  (first_bn_layer): BatchNorm1d(9, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_layers): ModuleList(
    (0): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (emb_dropout_layer): Dropout(p=0.04)
  (droput_layers): ModuleList(
    (0): Dropout(p=0.001)
    (1): Dropout(p=0.01)
  )
)


In [32]:
epochs = 5

criterion = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr = 0.1)

for epoch in range(epochs):
    
    for y, cont_x, cat_x in dataloader:
        
        optimizer.zero_grad()
        
        output = model(cont_x, cat_x)
        
        loss = criterion(output, y)
        
        print('loss: {:.3f}'.format(loss.item()))
        
        loss.backward()
        
        optimizer.step()

BrokenPipeError: [Errno 32] Broken pipe