In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils import data
from torch.optim import lr_scheduler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from collections import defaultdict
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')
%matplotlib inline

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
house_pk,84561,84561,84561,84561,84561
agency_id,90,90,90,90,90
date_in,2016-08-27,2016-08-26,2016-08-25,2016-08-24,2016-08-23
price,532,588,588,588,588
dis_water_real,0.261,0.261,0.261,0.261,0.261
dis_shopping,3,3,3,3,3
no_bedrooms,3,3,3,3,3
max_persons,4,4,4,4,4
house_size,140,140,140,140,140
land_size,726,726,726,726,726


In [4]:
df['date_in'] = pd.to_datetime(df['date_in'])

In [5]:
df['year'] = df['date_in'].map(lambda x: x.strftime('%Y'))
df['month'] = df['date_in'].map(lambda x: x.strftime('%m'))
df['day'] = df['date_in'].map(lambda x: x.strftime('%d'))

In [6]:
dr = ['date_in','house_pk']

df = df.drop(dr,axis = 1)
df.head().T

Unnamed: 0,0,1,2,3,4
agency_id,90.0,90.0,90.0,90.0,90.0
price,532.0,588.0,588.0,588.0,588.0
dis_water_real,0.261,0.261,0.261,0.261,0.261
dis_shopping,3.0,3.0,3.0,3.0,3.0
no_bedrooms,3.0,3.0,3.0,3.0,3.0
max_persons,4.0,4.0,4.0,4.0,4.0
house_size,140.0,140.0,140.0,140.0,140.0
land_size,726.0,726.0,726.0,726.0,726.0
build_year,1953.0,1953.0,1953.0,1953.0,1953.0
renovation_year,2014.0,2014.0,2014.0,2014.0,2014.0


In [7]:
df.shape

(85195, 21)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85195 entries, 0 to 85194
Data columns (total 21 columns):
agency_id          85195 non-null int64
price              85195 non-null int64
dis_water_real     85195 non-null float64
dis_shopping       85195 non-null float64
no_bedrooms        85195 non-null int64
max_persons        85195 non-null int64
house_size         85195 non-null int64
land_size          85195 non-null int64
build_year         85195 non-null int64
renovation_year    85195 non-null int64
apartment          85195 non-null int64
indoor_pool        85195 non-null int64
spa                85195 non-null int64
internet           85195 non-null int64
pets_allowed       85195 non-null int64
water_view         85195 non-null int64
fire_stove         85195 non-null int64
agency_rating      85195 non-null float64
year               85195 non-null object
month              85195 non-null object
day                85195 non-null object
dtypes: float64(3), int64(15), object(3)
m

In [9]:
categorical_columns = ['agency_id', 'apartment', 'indoor_pool', 'spa', 'internet', 'pets_allowed', 'water_view', 'fire_stove', 'year', 'month', 'day']

In [10]:
output_columns = 'price'

In [11]:
def split_features(df):
    catf = ['agency_id', 'apartment', 'indoor_pool', 'spa', 'internet', 'pets_allowed', 'water_view', 'fire_stove', 'year', 'month', 'day']

    numf = [col for col in df.columns if col not in catf]
    for c in catf: 
        df[c] = df[c].astype('category').cat.as_ordered()
        df[c] = df[c].cat.codes+1
    
    return catf, numf

In [12]:
def numericalize(df):
    df[name] = col.cat.codes+1

def split_dataset(df, y): return train_test_split(df, y, test_size=0.10, random_state=42)

def inv_y(y): return np.exp(y)

def get_numf_scaler(train): return preprocessing.StandardScaler().fit(train)

def scale_numf(df, num, scaler):
    cols = numf
    index = df.index
    scaled = scaler.transform(df[numf])
    scaled = pd.DataFrame(scaled, columns=cols, index=index)
    return pd.concat([scaled, df.drop(numf, axis=1)], axis=1)

In [13]:
def prepare_dataset(df):
    df.dropna(inplace=True)
    y = np.log(df.price)
    df.drop('price', axis=1, inplace=True)
    
    return df, y

In [14]:
df, y = prepare_dataset(df)

In [15]:
df

Unnamed: 0,agency_id,dis_water_real,dis_shopping,no_bedrooms,max_persons,house_size,land_size,build_year,renovation_year,apartment,indoor_pool,spa,internet,pets_allowed,water_view,fire_stove,agency_rating,year,month,day
0,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,27
1,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,26
2,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,25
3,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,24
4,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,23
5,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,22
6,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,21
7,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,20
8,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,19
9,90,0.261,3.0,3,4,140,726,1953,2014,0,0,1,1,0,1,1,3.0,2016,08,18


In [16]:
y.shape

(85195,)

In [17]:
df.head().T

Unnamed: 0,0,1,2,3,4
agency_id,90.0,90.0,90.0,90.0,90.0
dis_water_real,0.261,0.261,0.261,0.261,0.261
dis_shopping,3.0,3.0,3.0,3.0,3.0
no_bedrooms,3.0,3.0,3.0,3.0,3.0
max_persons,4.0,4.0,4.0,4.0,4.0
house_size,140.0,140.0,140.0,140.0,140.0
land_size,726.0,726.0,726.0,726.0,726.0
build_year,1953.0,1953.0,1953.0,1953.0,1953.0
renovation_year,2014.0,2014.0,2014.0,2014.0,2014.0
apartment,0.0,0.0,0.0,0.0,0.0


In [18]:
catf, numf = split_features(df)

In [19]:
type(catf)

list

In [20]:
numf

['dis_water_real',
 'dis_shopping',
 'no_bedrooms',
 'max_persons',
 'house_size',
 'land_size',
 'build_year',
 'renovation_year',
 'agency_rating']

In [21]:
X_train, X_test, y_train, y_test = split_dataset(df, y)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(76675, 20)
(8520, 20)
(76675,)


In [22]:
scaler = get_numf_scaler(X_train[numf])

  return self.partial_fit(X, y)


In [23]:
X_train_sc = scale_numf(X_train, numf, scaler)
X_train_sc.std(axis=0)

  del sys.path[0]


dis_water_real     1.000007
dis_shopping       1.000007
no_bedrooms        1.000007
max_persons        1.000007
house_size         1.000007
land_size          1.000007
build_year         1.000007
renovation_year    1.000007
agency_rating      1.000007
agency_id          1.155344
apartment          0.372211
indoor_pool        0.181366
spa                0.499029
internet           0.397566
pets_allowed       0.498068
water_view         0.375947
fire_stove         0.473652
year               0.917424
month              3.172072
day                8.814027
dtype: float64

In [24]:
X_train_sc.head()

Unnamed: 0,dis_water_real,dis_shopping,no_bedrooms,max_persons,house_size,land_size,build_year,renovation_year,agency_rating,agency_id,apartment,indoor_pool,spa,internet,pets_allowed,water_view,fire_stove,year,month,day
79075,-0.287593,-0.558123,-1.707244,-0.206952,-0.032606,-0.192836,0.059912,-2.234136,0.313485,4,1,1,1,1,2,2,2,2,6,23
63821,-0.238652,-0.617151,0.562047,-0.206952,-0.586476,0.659852,0.0303,-0.229709,-0.93051,1,1,1,1,2,2,1,2,2,9,14
44062,-0.27164,-0.262982,0.562047,-0.206952,0.858401,0.174843,0.415255,-0.730815,0.313485,2,1,1,2,2,1,2,2,2,10,20
14359,0.312819,-0.381039,-0.572598,-0.206952,-0.923613,-0.8714,-3.404686,0.897782,0.313485,2,2,1,1,2,1,1,1,2,1,13
43492,0.325258,-0.381039,-0.572598,-1.291206,-1.068101,-0.946131,-0.443491,0.020845,-0.93051,2,2,1,1,1,2,1,1,3,9,29


In [25]:
X_test_sc = scale_numf(X_test, numf, scaler)

X_train_sc.shape
X_test_sc.shape
X_test_sc.std(axis=0)

  del sys.path[0]


dis_water_real     1.002875
dis_shopping       0.986489
no_bedrooms        1.007128
max_persons        0.998606
house_size         0.995486
land_size          0.986481
build_year         1.014101
renovation_year    0.993906
agency_rating      0.999960
agency_id          1.153273
apartment          0.372700
indoor_pool        0.177983
spa                0.498771
internet           0.398519
pets_allowed       0.498349
water_view         0.373015
fire_stove         0.476215
year               0.923446
month              3.182723
day                8.788179
dtype: float64

In [26]:
class RegressionColumnarDataset(data.Dataset):
    def __init__(self, df, cats, y):
        self.dfcats = df[cats]
        self.dfconts = df.drop(cats, axis=1)
        
        self.cats = np.stack([c.values for n, c in self.dfcats.items()], axis=1).astype(np.int64)
        self.conts = np.stack([c.values for n, c in self.dfconts.items()], axis=1).astype(np.float32)
        self.y = y.values.astype(np.float32)
        
    def __len__(self): return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]

In [27]:
trainds = RegressionColumnarDataset(X_train_sc, catf, y_train)
valds = RegressionColumnarDataset(X_test_sc, catf, y_test)

In [28]:
params = {'batch_size': 128,
          'shuffle': True,
          'num_workers': 1}

traindl = data.DataLoader(trainds, **params)
valdl = data.DataLoader(valds, **params)

In [29]:
y_range = (0, y_train.max()*1.2)
y_range

cat_sz = [(c, df[c].max()+1) for c in catf]
cat_sz

emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]
emb_szs

[(5, 3),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (5, 3),
 (13, 7),
 (32, 16)]

In [30]:
def rmse(targ, y_pred):
    return np.sqrt(mean_squared_error(inv_y(y_pred), inv_y(targ))) #.detach().numpy()

def emb_init(x):
    x = x.weight.data
    sc = 2/(x.size(1)+1)
    x.uniform_(-sc,sc)

class MixedInputModel(nn.Module):
    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops, y_range, use_bn=True):
        super().__init__()
        for i,(c,s) in enumerate(emb_szs): assert c > 1, f"cardinality must be >=2, got emb_szs[{i}]: ({c},{s})"
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        for emb in self.embs: emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont=n_emb, n_cont
        
        szs = [n_emb+n_cont] + szs
        self.lins = nn.ModuleList([nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        self.bns = nn.ModuleList([nn.BatchNorm1d(sz) for sz in szs[1:]])
        for o in self.lins: nn.init.kaiming_normal_(o.weight.data)
        self.outp = nn.Linear(szs[-1], out_sz)
        nn.init.kaiming_normal_(self.outp.weight.data)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn,self.y_range = use_bn,y_range

    def forward(self, x_cat, x_cont):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embs)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2
        for l,d,b in zip(self.lins, self.drops, self.bns):
            x = F.relu(l(x))
            if self.use_bn: x = b(x)
            x = d(x)
        x = self.outp(x)
        if self.y_range:
            x = torch.sigmoid(x)
            x = x*(self.y_range[1] - self.y_range[0])
            x = x+self.y_range[0]
        return x.squeeze()

def fit(model, train_dl, val_dl, loss_fn, opt, scheduler, epochs=3):
    num_batch = len(train_dl)
    for epoch in tnrange(epochs):      
        y_true_train = list()
        y_pred_train = list()
        total_loss_train = 0          
        
        t = tqdm_notebook(iter(train_dl), leave=False, total=num_batch)
        for cat, cont, y in t:
            cat = cat.cuda()
            cont = cont.cuda()
            y = y.cuda()
            
            t.set_description(f'Epoch {epoch}')
            
            opt.zero_grad()
            pred = model(cat, cont)
            loss = loss_fn(pred, y)
            loss.backward()
            lr[epoch].append(opt.param_groups[0]['lr'])
            tloss[epoch].append(loss.item())
            scheduler.step()
            opt.step()
            
            t.set_postfix(loss=loss.item())
            
            y_true_train += list(y.cpu().data.numpy())
            y_pred_train += list(pred.cpu().data.numpy())
            total_loss_train += loss.item()
            
        train_acc = rmse(y_true_train, y_pred_train)
        train_loss = total_loss_train/len(train_dl)
        
        if val_dl:
            y_true_val = list()
            y_pred_val = list()
            total_loss_val = 0
            for cat, cont, y in tqdm_notebook(val_dl, leave=False):
                cat = cat.cuda()
                cont = cont.cuda()
                y = y.cuda()
                pred = model(cat, cont)
                loss = loss_fn(pred, y)
                
                y_true_val += list(y.cpu().data.numpy())
                y_pred_val += list(pred.cpu().data.numpy())
                total_loss_val += loss.item()
                vloss[epoch].append(loss.item())
            valacc = rmse(y_true_val, y_pred_val)
            valloss = total_loss_val/len(valdl)
            print(f'Epoch {epoch}: train_loss: {train_loss:.4f} train_rmse: {train_acc:.4f} | val_loss: {valloss:.4f} val_rmse: {valacc:.4f}')
        else:
            print(f'Epoch {epoch}: train_loss: {train_loss:.4f} train_rmse: {train_acc:.4f}')
    
    return lr, tloss, vloss

In [31]:
device

device(type='cpu')

In [32]:
m = MixedInputModel(emb_szs=emb_szs, 
                    n_cont=len(df.columns)-len(catf), 
                    emb_drop=0.04, 
                    out_sz=1, 
                    szs=[1000,500,250], 
                    drops=[0.001,0.01,0.01], 
                    y_range=y_range).to(device)

opt = optim.Adam(m.parameters(), 1e-2)
lr_cosine = lr_scheduler.CosineAnnealingLR(opt, 1000)

lr = defaultdict(list)
tloss = defaultdict(list)
vloss = defaultdict(list)

In [33]:
m

MixedInputModel(
  (embs): ModuleList(
    (0): Embedding(5, 3)
    (1): Embedding(3, 2)
    (2): Embedding(3, 2)
    (3): Embedding(3, 2)
    (4): Embedding(3, 2)
    (5): Embedding(3, 2)
    (6): Embedding(3, 2)
    (7): Embedding(3, 2)
    (8): Embedding(5, 3)
    (9): Embedding(13, 7)
    (10): Embedding(32, 16)
  )
  (lins): ModuleList(
    (0): Linear(in_features=52, out_features=1000, bias=True)
    (1): Linear(in_features=1000, out_features=500, bias=True)
    (2): Linear(in_features=500, out_features=250, bias=True)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): BatchNorm1d(250, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (outp): Linear(in_features=250, out_features=1, bias=True)
  (emb_drop): Dropout(p=0.04)
  (drops): ModuleList(
    (0): Dropout(p=0.001)
    (1): Dropout(p=0.01)
    

In [34]:
lr, tloss, vloss = fit(model=m, train_dl=traindl, val_dl=valdl, loss_fn=F.mse_loss, opt=opt, scheduler=lr_cosine, epochs=10)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




BrokenPipeError: [Errno 32] Broken pipe