In [3]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sb
sb.set_style('whitegrid')
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
%matplotlib inline

In [4]:
class TabularDataset(Dataset):
    """Expose a pandas DataFrame as a PyTorch ``Dataset`` of (target, continuous, categorical) rows.

    Columns are split into three groups:

    * the target column ``output_col`` (stored as float32, shape ``(n, 1)``),
    * the categorical columns ``cat_cols`` (stored as int64),
    * every remaining column, treated as continuous (stored as float32).

    A group that is absent is replaced by an all-zero placeholder of shape
    ``(n, 1)`` with the same dtype the real data would have had, so batches
    have consistent dtypes regardless of which groups are present.
    """

    def __init__(self, data, cat_cols=None, output_col=None):
        """Split ``data`` into target, continuous and categorical arrays.

        Parameters
        ----------
        data : pandas.DataFrame
            Source table; one sample per row.
        cat_cols : list of column labels, optional
            Columns holding categorical values. They are cast to int64, so
            they must already be integer-coded.
        output_col : column label, optional
            Target column. When omitted, the target is a zero placeholder.
        """
        self.n = data.shape[0]

        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            # float32 placeholder keeps the dtype identical to the real-target branch.
            self.y = np.zeros((self.n, 1), dtype=np.float32)

        self.cat_cols = list(cat_cols) if cat_cols else []

        # Build the exclusion list once instead of once per column.
        excluded = self.cat_cols + [output_col]
        self.cont_cols = [col for col in data.columns if col not in excluded]

        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1), dtype=np.float32)

        if self.cat_cols:
            self.cat_X = data[self.cat_cols].astype(np.int64).values
        else:
            # int64 placeholder: categorical values are integer codes.
            self.cat_X = np.zeros((self.n, 1), dtype=np.int64)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.n

    def __getitem__(self, idx):
        """Return ``[target, continuous_features, categorical_features]`` for row ``idx``."""
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]


In [2]:
class Test(nn.Module):
    """Fully connected regressor mapping 20 input features to a single output.

    Layer widths are 20 -> 50 -> 100 -> 50 -> 1, with ReLU activations, batch
    normalisation after the second and third linear layers, and dropout
    (p=0.2 and p=0.5) after the two batch-normalised layers.
    """

    def __init__(self):
        super(Test, self).__init__()

        # Attribute names skip "linear3" on purpose so parameter names stay unchanged.
        self.linear1 = nn.Linear(20, 50)
        self.linear2 = nn.Linear(50, 100)
        self.linear4 = nn.Linear(100, 50)
        self.linear5 = nn.Linear(50, 1)

        self.relu = nn.ReLU()
        self.batch_norm = nn.BatchNorm1d(100)
        self.batch_norm2 = nn.BatchNorm1d(50)

    def forward(self, x):
        """Compute the prediction for a batch ``x`` whose last dimension is 20.

        Returns a tensor with a last dimension of 1.
        """
        out = self.relu(self.linear1(x))
        out = self.relu(self.batch_norm(self.linear2(out)))
        # The functional dropout defaults to training=True; tie it to the module
        # mode so that model.eval() really disables dropout at inference time.
        out = nn.functional.dropout(out, 0.2, training=self.training)
        out = self.relu(self.batch_norm2(self.linear4(out)))
        out = nn.functional.dropout(out, 0.5, training=self.training)
        out = self.linear5(out)

        return out

In [5]:
# Instantiate the network, move its parameters to the selected device,
# and print the layer summary.
model = Test().to(device)
print(model)

Test(
  (linear1): Linear(in_features=20, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=100, bias=True)
  (linear4): Linear(in_features=100, out_features=50, bias=True)
  (linear5): Linear(in_features=50, out_features=1, bias=True)
  (relu): ReLU()
  (batch_norm): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [6]:
# Show which device was selected, then load the dataset from the
# relative path 'data.csv'.
print(device)
df = pd.read_csv('data.csv')

cuda:0


In [7]:
# Transposed preview of the first rows, so each column is displayed as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
house_pk,84561,84561,84561,84561,84561
agency_id,90,90,90,90,90
date_in,2016-08-27,2016-08-26,2016-08-25,2016-08-24,2016-08-23
price,532,588,588,588,588
dis_water_real,0.261,0.261,0.261,0.261,0.261
dis_shopping,3,3,3,3,3
no_bedrooms,3,3,3,3,3
max_persons,4,4,4,4,4
house_size,140,140,140,140,140
land_size,726,726,726,726,726


In [8]:
# Parse the date column and derive calendar features from it.
# The vectorised .dt accessor replaces a per-row Python lambda; the features
# remain zero-padded strings ('2016', '08', '27') exactly as before.
df['date_in'] = pd.to_datetime(df['date_in'])
df['year'] = df['date_in'].dt.strftime('%Y')
df['month'] = df['date_in'].dt.strftime('%m')
df['day'] = df['date_in'].dt.strftime('%d')

# The raw date and the house key are dropped once the calendar features exist.
dr = ['date_in', 'house_pk']

df = df.drop(columns=dr)
df.head().T

Unnamed: 0,0,1,2,3,4
agency_id,90.0,90.0,90.0,90.0,90.0
price,532.0,588.0,588.0,588.0,588.0
dis_water_real,0.261,0.261,0.261,0.261,0.261
dis_shopping,3.0,3.0,3.0,3.0,3.0
no_bedrooms,3.0,3.0,3.0,3.0,3.0
max_persons,4.0,4.0,4.0,4.0,4.0
house_size,140.0,140.0,140.0,140.0,140.0
land_size,726.0,726.0,726.0,726.0,726.0
build_year,1953.0,1953.0,1953.0,1953.0,1953.0
renovation_year,2014.0,2014.0,2014.0,2014.0,2014.0


In [9]:
# Columns treated as categorical (label-encoded later); every other column
# except the target is treated as continuous.
categorical_features = [
    'agency_id',
    # property amenities / flags
    'apartment',
    'indoor_pool',
    'spa',
    'internet',
    'pets_allowed',
    'water_view',
    'fire_stove',
    # calendar features
    'year',
    'month',
    'day',
    # construction history
    'build_year',
    'renovation_year',
]

# Name of the regression target column.
output_columns = 'price'

In [10]:
# Fit one LabelEncoder per categorical column and replace the column with its
# integer codes. The fitted encoders are kept, keyed by column name.
label_encoders = {}
for cat_col in categorical_features:
    encoder = LabelEncoder()
    label_encoders[cat_col] = encoder
    df[cat_col] = encoder.fit_transform(df[cat_col])

In [11]:
# Print the first rows of the encoded frame together with its type.
print(df.head(), type(df))
# Wrap the frame in a TabularDataset, passing the categorical column list
# and the target column name.
dataset = TabularDataset(data=df, cat_cols=categorical_features, output_col=output_columns)
print('len of dataset: {}'.format(len(dataset)))

   agency_id  price  dis_water_real  dis_shopping  no_bedrooms  max_persons  \
0          0    532           0.261           3.0            3            4   
1          0    588           0.261           3.0            3            4   
2          0    588           0.261           3.0            3            4   
3          0    588           0.261           3.0            3            4   
4          0    588           0.261           3.0            3            4   

   house_size  land_size  build_year  renovation_year  ...  indoor_pool  spa  \
0         140        726           4               21  ...            0    1   
1         140        726           4               21  ...            0    1   
2         140        726           4               21  ...            0    1   
3         140        726           4               21  ...            0    1   
4         140        726           4               21  ...            0    1   

   internet  pets_allowed  water_view  fire_

In [12]:
# Hold out 5% of the samples for testing; everything else is used for training.
batchsize = 64
number_for_testing = int(len(dataset) * 0.05)
number_for_training = len(dataset) - number_for_testing

# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without touching the global torch RNG state.
split_generator = torch.Generator().manual_seed(42)
train, test = torch.utils.data.random_split(
    dataset,
    [number_for_training, number_for_testing],
    generator=split_generator,
)

trainloader = DataLoader(train, batchsize, shuffle=True)
# Shuffling adds nothing at evaluation time; a fixed order keeps test batches
# comparable between runs.
testloader = DataLoader(test, batchsize, shuffle=False)