In [1]:
%matplotlib inline
import pathlib
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 8, 6
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', 500)
from collections import defaultdict

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

pd.options.mode.chained_assignment = None

from torch.nn import init
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
from torch.optim import lr_scheduler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')

In [2]:
def prepare_dataset(df):
    df['date_in'] = pd.to_datetime(df['date_in'])
    df['year'] = df['date_in'].map(lambda x: x.strftime('%Y'))
    df['month'] = df['date_in'].map(lambda x: x.strftime('%m'))
    df['day'] = df['date_in'].map(lambda x: x.strftime('%d'))
    df.dropna(inplace=True)
    y = np.log(df.price)    
    df.drop(['date_in','house_pk','price'], axis=1, inplace=True)
    
    return df, y

def split_features(df):
    catf = ['agency_id', 'apartment', 'indoor_pool', 'spa', 'internet', 'pets_allowed', 'water_view', 'fire_stove', 'year', 'month', 'day', 'build_year', 'renovation_year']
    

    numf = [col for col in df.columns if col not in catf]
    for c in catf: 
        df[c] = df[c].astype('category').cat.as_ordered()
        df[c] = df[c].cat.codes+1
    
    return catf, numf

def numericalize(df):
    df[name] = col.cat.codes+1

def split_dataset(df, y): return train_test_split(df, y, test_size=0.10, random_state=42)

def inv_y(y): return np.exp(y)

def get_numf_scaler(train): return preprocessing.StandardScaler().fit(train)

def scale_numf(df, num, scaler):
    cols = numf
    index = df.index
    scaled = scaler.transform(df[numf])
    scaled = pd.DataFrame(scaled, columns=cols, index=index)
    return pd.concat([scaled, df.drop(numf, axis=1)], axis=1)

In [3]:
class RegressionColumnarDataset(data.Dataset):
    def __init__(self, df, cats, y):
        self.dfcats = df[cats]
        self.dfconts = df.drop(cats, axis=1)
        
        self.cats = np.stack([c.values for n, c in self.dfcats.items()], axis=1).astype(np.int64)
        self.conts = np.stack([c.values for n, c in self.dfconts.items()], axis=1).astype(np.float32)
        self.y = y.values.astype(np.float32)
        
    def __len__(self): return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]
    
def rmse(targ, y_pred):
    return np.sqrt(mean_squared_error(inv_y(y_pred), inv_y(targ))) #.detach().numpy()

def emb_init(x):
    x = x.weight.data
    sc = 2/(x.size(1)+1)
    x.uniform_(-sc,sc)

In [4]:
class MixedInputModel(nn.Module):
    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops, y_range, use_bn=True):
        super().__init__()
        for i,(c,s) in enumerate(emb_szs): assert c > 1, f"cardinality must be >=2, got emb_szs[{i}]: ({c},{s})"
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        for emb in self.embs: emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont=n_emb, n_cont
        
        szs = [n_emb+n_cont] + szs
        self.lins = nn.ModuleList([nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        self.bns = nn.ModuleList([nn.BatchNorm1d(sz) for sz in szs[1:]])
        for o in self.lins: nn.init.kaiming_normal_(o.weight.data)
        self.outp = nn.Linear(szs[-1], out_sz)
        nn.init.kaiming_normal_(self.outp.weight.data)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn,self.y_range = use_bn,y_range

    def forward(self, x_cat, x_cont):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embs)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2
        for l,d,b in zip(self.lins, self.drops, self.bns):
            x = F.relu(l(x))
            if self.use_bn: x = b(x)
            x = d(x)
        x = self.outp(x)
        if self.y_range:
            x = torch.sigmoid(x)
            x = x*(self.y_range[1] - self.y_range[0])
            x = x+self.y_range[0]
        return x.squeeze()

In [5]:
df = pd.read_csv('data.csv')

In [6]:
df.head()

Unnamed: 0,house_pk,agency_id,date_in,price,dis_water_real,dis_shopping,no_bedrooms,max_persons,house_size,land_size,build_year,renovation_year,apartment,indoor_pool,spa,internet,pets_allowed,water_view,fire_stove,agency_rating
0,84561,90,2016-08-27,532,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0
1,84561,90,2016-08-26,588,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0
2,84561,90,2016-08-25,588,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0
3,84561,90,2016-08-24,588,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0
4,84561,90,2016-08-23,588,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0


In [7]:
df, y = prepare_dataset(df)
df.head()

Unnamed: 0,agency_id,dis_water_real,dis_shopping,no_bedrooms,max_persons,house_size,land_size,build_year,renovation_year,apartment,indoor_pool,spa,internet,pets_allowed,water_view,fire_stove,agency_rating,year,month,day
0,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,8,27
1,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,8,26
2,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,8,25
3,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,8,24
4,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,8,23


In [8]:
catf, numf = split_features(df)

len(catf)
catf

len(numf)
numf

13

['agency_id',
 'apartment',
 'indoor_pool',
 'spa',
 'internet',
 'pets_allowed',
 'water_view',
 'fire_stove',
 'year',
 'month',
 'day',
 'build_year',
 'renovation_year']

7

['dis_water_real',
 'dis_shopping',
 'no_bedrooms',
 'max_persons',
 'house_size',
 'land_size',
 'agency_rating']

In [9]:
X_train, X_test, y_train, y_test = split_dataset(df, y)

X_train.shape
X_test.shape

(76675, 20)

(8520, 20)

In [10]:
scaler = get_numf_scaler(X_train[numf])

X_train_sc = scale_numf(X_train, numf, scaler)
X_train_sc.std(axis=0)

  return self.partial_fit(X, y)


dis_water_real      1.000007
dis_shopping        1.000007
no_bedrooms         1.000007
max_persons         1.000007
house_size          1.000007
land_size           1.000007
agency_rating       1.000007
agency_id           1.155344
build_year         11.123847
renovation_year     6.108187
apartment           0.372211
indoor_pool         0.181366
spa                 0.499029
internet            0.397566
pets_allowed        0.498068
water_view          0.375947
fire_stove          0.473652
year                0.917424
month               3.172072
day                 8.814027
dtype: float64

In [11]:
X_test_sc = scale_numf(X_test, numf, scaler)

X_train_sc.shape
X_test_sc.shape
X_test_sc.std(axis=0)



(76675, 20)

(8520, 20)

dis_water_real      1.002875
dis_shopping        0.986489
no_bedrooms         1.007128
max_persons         0.998606
house_size          0.995486
land_size           0.986481
agency_rating       0.999960
agency_id           1.153273
build_year         11.195102
renovation_year     6.091708
apartment           0.372700
indoor_pool         0.177983
spa                 0.498771
internet            0.398519
pets_allowed        0.498349
water_view          0.373015
fire_stove          0.476215
year                0.923446
month               3.182723
day                 8.788179
dtype: float64

In [12]:
trainds = RegressionColumnarDataset(X_train_sc, catf, y_train)
valds = RegressionColumnarDataset(X_test_sc, catf, y_test)

In [13]:
params = {'batch_size': 128,
          'shuffle': True,
          'num_workers': 1}

traindl = data.DataLoader(trainds, **params)
valdl = data.DataLoader(valds, **params)

In [14]:
y_range = (0, y_train.max()*1.2)
y_range

cat_sz = [(c, df[c].max()+1) for c in catf]
cat_sz

emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]
emb_szs

(0, 9.475863241971648)

[('agency_id', 5),
 ('apartment', 3),
 ('indoor_pool', 3),
 ('spa', 3),
 ('internet', 3),
 ('pets_allowed', 3),
 ('water_view', 3),
 ('fire_stove', 3),
 ('year', 5),
 ('month', 13),
 ('day', 32),
 ('build_year', 40),
 ('renovation_year', 27)]

[(5, 3),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (5, 3),
 (13, 7),
 (32, 16),
 (40, 20),
 (27, 14)]

In [15]:
m = MixedInputModel(emb_szs=emb_szs, 
                    n_cont=len(df.columns)-len(catf), 
                    emb_drop=0.04, 
                    out_sz=1, 
                    szs=[1000,500,250], 
                    drops=[0.001,0.01,0.01], 
                    y_range=y_range).to(device)

opt = optim.Adam(m.parameters(), 1e-2)
lr_cosine = lr_scheduler.CosineAnnealingLR(opt, 1000)

lr = defaultdict(list)
tloss = defaultdict(list)
vloss = defaultdict(list)

In [16]:
print(m)

MixedInputModel(
  (embs): ModuleList(
    (0): Embedding(5, 3)
    (1): Embedding(3, 2)
    (2): Embedding(3, 2)
    (3): Embedding(3, 2)
    (4): Embedding(3, 2)
    (5): Embedding(3, 2)
    (6): Embedding(3, 2)
    (7): Embedding(3, 2)
    (8): Embedding(5, 3)
    (9): Embedding(13, 7)
    (10): Embedding(32, 16)
    (11): Embedding(40, 20)
    (12): Embedding(27, 14)
  )
  (lins): ModuleList(
    (0): Linear(in_features=84, out_features=1000, bias=True)
    (1): Linear(in_features=1000, out_features=500, bias=True)
    (2): Linear(in_features=500, out_features=250, bias=True)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): BatchNorm1d(250, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (outp): Linear(in_features=250, out_features=1, bias=True)
  (emb_drop): Dropout(p=0.04)
  (drops): ModuleList(

In [17]:
num_batch = len(traindl)
print(num_batch)

600


In [19]:
for epoch in tnrange(5):
    y_true_train = list()
    y_pred_train = list()
    total_loss_train = 0
    
    t = tqdm_notebook(iter(traindl), leave=False, total=num_batch)
    for cat, cont, y in t:
        t.set_description(f'Epoch {epoch}')
        opt.zero_grad()
        pred = model(cat, cont)
        loss = loss_fn(pred, y)
        
        print('loss: {}'.format(loss))
        loss.backward()
        opt.step()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




BrokenPipeError: [Errno 32] Broken pipe