In [2]:
#misc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import tqdm

In [3]:
# pytorch
from torch.utils.data import DataLoader as torch_dl
from torch.utils.data import Dataset
from torch import  nn
from torch import optim
from torch.nn.init import *
from torch.nn import functional as F

In [60]:
train = pd.read_csv('Route15.csv', index_col=0)
train['TT_next30'] = train['TT'].shift(-6)
train, val = train[288*(365+60):288*(365+90+100)], train[288*(365+90+100):-6]
#test = pd.read_csv('input/test.tsv', sep='\t')

In [42]:
def preprocess(data, cats):
    """
    map the string data to integer id for calculate embedding
    """
    data = data.fillna('missing') # replace nan with str "missing"
    data = data.replace('-99', 'missing')
    # map data[cat] to integer ids
    for cat in cats:
        data[cat] = data[cat].astype('category').cat.codes # cat: some utilities for categorical
    # for those who are already integer ids, don't have to change
    return data

In [6]:
def EmbeddingDataPreprocess(data, cats, inplace =True):
    ### Each categorical column should have indices as values 
    ### Which will be looked up at embedding matrix and used in modeling
    ### Make changes inplace
    if inplace:
        for c in cats:
            # same as data[c].replace({val:i for i, val in zip(range(len(data[c].unique())), data_copy[c].unique())})
            data[c].replace({val:i for i, val in enumerate(data[c].unique())}, inplace=True)
        return data
    else:
        data_copy = data.copy()
        for c in cats:
            data_copy[c].replace({val:i for i, val in enumerate(data_copy[c].unique())}, inplace=True)
        return data_copy

In [66]:
def get_embs_dims(data, cats):
    # get # unique categories of each category
    cat_sz = [len(data[c].unique()) for c in cats]
    return [(c, min(50, (c+1)//2)) for c in cat_sz] # we don't want the embedding vector too long (over 50) so use min(50, (c+1)//2)
def emb_init(x):
    x = x.weight.data
    sc = 2/(x.size(1)+1)
    x.uniform_(-sc,sc)

class EmbeddingDataset(Dataset):
    ### This dataset will prepare inputs cats, conts and output y 
    ### To be feed into our mixed input embedding fully connected NN model 
    ### Stacks numpy arrays to create nxm matrices where n = rows, m = columns
    ### Gives y 0 if not specified
    def __init__(self, cats, conts, y):
        n = len(cats[0]) if cats else len(conts[0])
        self.cats = np.stack(cats, 1).astype(np.int64) if cats else np.zeros((n,1))
        self.conts = np.stack(conts, 1).astype(np.float32) if conts else np.zeros((n,1))
        self.y = np.zeros((n,1)) if y is None else y[:,None].astype(np.float32)
        
    def __len__(self): return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]
    
    @classmethod
    def from_data_frames(cls, df_cat, df_cont, y=None):
        cat_cols = [c.values for n,c in df_cat.items()]
        cont_cols = [c.values for n,c in df_cont.items()]
        return cls(cat_cols, cont_cols, y)

    @classmethod
    def from_data_frame(cls, df, cat_flds, y=None):
        return cls.from_data_frames(df[cat_flds], df.drop(cat_flds, axis=1), y)        

In [67]:
### We will keep this for fastai compatibility
class ModelData():
    def __init__(self, path, trn_dl, val_dl, test_dl=None):
        self.path,self.trn_dl,self.val_dl,self.test_dl = path,trn_dl,val_dl,test_dl
        
class EmbeddingModelData(ModelData):
    ### This class provides training and validation dataloaders
    ### Which we will use in our model
    
    def __init__(self, path, trnx_ds, val_ds, bs, test_ds=None):
        test_dl = DataLoader(test_ds, bs, shuffle=False, num_workers=1) if test_ds is not None else None
        super().__init__(path, torch_dl(trnx_ds, batch_size=bs, shuffle=True, num_workers=1)
                         ,torch_dl(val_ds, batch_size=bs, shuffle=True, num_workers=1), test_ds)
    
    @classmethod
    def from_data_frames(cls, path, trn_df, val_df, trn_y, val_y, cat_flds, bs, test_df=None):
        test_ds = EmbeddingDataset.from_data_frame(test_df, cat_flds) if test_df is not None else None
        return cls(path, EmbeddingDataset.from_data_frame(trn_df, cat_flds, trn_y),
                    EmbeddingDataset.from_data_frame(val_df, cat_flds,val_y), bs, test_ds=test_ds)

    @classmethod
    def from_data_frame(cls, path, val_idxs, trn_idxs, df, y, cat_flds, bs, test_df=None):
        val_df, val_y = df.iloc[val_idxs], y[val_idxs]
        trn_df, trn_y = df.iloc[trn_idxs], y[trn_idxs]
        return cls.from_data_frames(path, trn_df, val_df, trn_y, val_y, cat_flds, bs, test_df)
    
class EmbeddingModel(nn.Module):
    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops, y_range=None, use_bn=False, classify=None):
        super().__init__() ## inherit from nn.Module parent class
        self.embs = nn.ModuleList([nn.Embedding(m, d) for m, d in emb_szs]) ## construct embeddings
        for emb in self.embs: emb_init(emb) ## initialize embedding weights
        n_emb = sum(e.embedding_dim for e in self.embs) ## get embedding dimension needed for 1st layer
        szs = [n_emb+n_cont] + szs ## add input layer to szs
        self.lins = nn.ModuleList([
            nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)]) ## create linear layers input, l1 -> l1, l2 ...
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in szs[1:]]) ## batchnormalization for hidden layers activations
        for o in self.lins: kaiming_normal(o.weight.data) ## init weights with kaiming normalization
        self.outp = nn.Linear(szs[-1], out_sz) ## create linear from last hidden layer to output
        kaiming_normal(self.outp.weight.data) ## do kaiming initialization
        
        self.emb_drop = nn.Dropout(emb_drop) ## embedding dropout, will zero out weights of embeddings
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops]) ## fc layer dropout
        self.bn = nn.BatchNorm1d(n_cont) # bacthnorm for continous data
        self.use_bn,self.y_range = use_bn,y_range 
        self.classify = classify
        
    def forward(self, x_cat, x_cont):
        x = [emb(x_cat[:, i]) for i, emb in enumerate(self.embs)] # takes necessary emb vectors 
        x = torch.cat(x, 1) ## concatenate along axis = 1 (columns - side by side) # this is our input from cats
        x = self.emb_drop(x) ## apply dropout to elements of embedding tensor
        x2 = self.bn(x_cont) ## apply batchnorm to continous variables
        x = torch.cat([x, x2], 1) ## concatenate cats and conts for final input
        for l, d, b in zip(self.lins, self.drops, self.bns):
            x = F.relu(l(x)) ## dotprod + non-linearity
            if self.use_bn: x = b(x) ## apply batchnorm activations
            x = d(x) ## apply dropout to activations
        x = self.outp(x) # we defined this externally just not to apply dropout to output
        if self.classify:
            x = F.sigmoid(x) # for classification
        elif y_range:
            x = F.sigmoid(x) ## scales the output between 0,1
            x = x*(self.y_range[1] - self.y_range[0]) ## scale output
            x = x + self.y_range[0] ## shift output
        return x    

In [61]:
train_test = pd.concat([train, val])[['holiday', 'weekday', 'hldy_seq', 'timeslot', 'TT_next30']]

In [62]:
cats = ['holiday', 'weekday', 'hldy_seq', 'timeslot']
train_test = preprocess(train_test, cats) # turn category into id
# drop=False: the origin index will be added to the dataframe, but we don't need, so use drop=True
train_test = train_test.reset_index(drop=True)
train_test = EmbeddingDataPreprocess(train_test, cats, inplace=True) # turn id to another id (meaningless imo...)
train_df = train_test.iloc[range(len(train))]
test_df = train_test.iloc[range(len(train),len(train_test))]

In [63]:
del train
test_id = test.index
del val
gc.collect()

475

In [72]:
train_input, train_y = train_df.drop('TT_next30', 1), np.log(train_df['TT_next30'] + 1)
test_input, test_y = test_df.drop('TT_next30', 1), np.log(test_df['TT_next30'] + 1)
y_range = (train_y.min(), train_y.max())
emb_szs = get_embs_dims(train_test, cats)

model_data = EmbeddingModelData.from_data_frames('./tmp', train_input, test_input, train_y, test_y, cats, bs=32) 
emb_model = EmbeddingModel(emb_szs, 1, 0.04, 1, [1000, 500], [0.001, 0.01], y_range = y_range, classify=None)
emb_model.cuda()

EmbeddingModel(
  (embs): ModuleList(
    (0): Embedding(9, 5)
    (1): Embedding(7, 4)
    (2): Embedding(9, 5)
    (3): Embedding(288, 50)
  )
  (lins): ModuleList(
    (0): Linear(in_features=65, out_features=1000)
    (1): Linear(in_features=1000, out_features=500)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True)
    (1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True)
  )
  (outp): Linear(in_features=500, out_features=1)
  (emb_drop): Dropout(p=0.04)
  (drops): ModuleList(
    (0): Dropout(p=0.001)
    (1): Dropout(p=0.01)
  )
  (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True)
)

In [69]:
def embedding_train(model, model_data, optimizer, criterion, epochs):    
    for epoch in range(epochs):
        for data in iter(model_data.trn_dl):
            
            # get inputs
            x_cats, x_conts, y = data

            # wrap with variable
            x_cats, x_conts, y = Variable(x_cats), Variable(x_conts), Variable(y)

            # zero the parameter gradients
            optimizer.zero_grad()
            
            # forward + backward + optimize
            outputs = model(x_cats, x_conts)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

In [73]:
# Training
opt = optim.SGD(emb_model.parameters(), lr = 1e-4, weight_decay=1e-4)
crit = F.mse_loss
epochs = 1
embedding_train(emb_model, model_data, opt, crit, 1)



BrokenPipeError: [Errno 32] Broken pipe

In [None]:
# Second training
opt = optim.SGD(emb_model.parameters(), lr = 5e-4, weight_decay=1e-4)
crit = F.mse_loss
epochs = 1
embedding_train(emb_model, model_data, opt, crit, 1)