In [1]:
import os 
os.chdir(os.path.pardir)
# load data from file 
import numpy as np 
save_file_name = ['fea_seq.npy', 'last_observation_seq.npy', 'label_seq.npy', 'masking_seq.npy',
                   'delta_seq.npy', 'train_valid_test_split.npy']
save_folder = 'data/raw'
saved_arrays = []
for file_name in save_file_name:
    saved_arrays.append(np.load(os.path.join(save_folder, file_name)))
[fea_seq, last_observation_seq, label_seq, masking_seq, delta_seq, train_valid_test_split] = saved_arrays

# train-test-split 
train_index = [k for k in range(train_valid_test_split[0])]
dev_index = [k for k in range(train_valid_test_split[0], 
                               train_valid_test_split[0] + train_valid_test_split[1])]
test_index = [k for k in range(train_valid_test_split[0] + train_valid_test_split[1],
              train_valid_test_split[0] + train_valid_test_split[1] + train_valid_test_split[2])]

def get_array_by_index_range(nparray_list, label_array, index_range):
    '''
    nparray_list: list of nparrays to select according to index range 
    label_array: select the labels from label array
    '''
    # get non-na index
    non_na_index = []
    for index in index_range:
        if not np.isnan(label_array[index]):
            non_na_index.append(index)
    
    return [k[non_na_index] for k in nparray_list], label_array[non_na_index].reshape(-1)

In [2]:
# normalize delta

delta_seq = (delta_seq - np.mean(delta_seq)) / np.std(delta_seq) 

In [4]:
# delta_seq[0]

In [5]:
# split set to train, test and dev sets 
# train set
[fea_train, last_train, masking_train, delta_train], label_train =  get_array_by_index_range([fea_seq,last_observation_seq, masking_seq, delta_seq
                                                                 ], label_seq, train_index)
# dev set 
[fea_dev, last_dev, masking_dev, delta_dev], label_dev =  get_array_by_index_range([fea_seq, last_observation_seq, masking_seq, delta_seq
                                                           ], label_seq, dev_index)
# test set 
[fea_test, last_test, masking_test, delta_test], label_test =  get_array_by_index_range([fea_seq, last_observation_seq, masking_seq, delta_seq
                                                              ], label_seq, test_index)

def normalize_feature(fea_train, array_list):
    """
    array_list: [fea_dev, fea_test, last_train, last_dev, last_test] to normalize 
    """
    train_mean = np.nanmean(fea_train, axis=0)
    train_std = np.nanstd(fea_train, axis=0)
    def norm_arr(nparr):
        return(nparr - train_mean)/train_std
    return (norm_arr(fea_train), [norm_arr(k) for k in array_list])

fea_train, [fea_dev, fea_test, last_train, last_dev, last_test] = normalize_feature(fea_train,
                                                                                   [fea_dev, fea_test, 
                                                                                    last_train, last_dev,
                                                                                    last_test])


In [6]:
# record mean after normalization 
x_mean_aft_nor = np.nanmean(fea_train, axis=0)

In [7]:
import torch
import torch.utils.data as utils
import torch.nn.functional as F
import torch.nn as nn
from torch.nn.parameter import Parameter
import torch.optim as optim
import random
from torch.autograd import Variable, grad
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math

In [8]:
class FilterLinear(nn.Module):
    def __init__(self, in_features, out_features, filter_square_matrix, bias=True):
        '''
        filter_square_matrix : filter square matrix, whose each elements is 0 or 1.
        '''
        super(FilterLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        
        use_gpu = torch.cuda.is_available()
        self.filter_square_matrix = None
        if use_gpu:
            self.filter_square_matrix = Variable(filter_square_matrix.cuda(), requires_grad=False)
        else:
            self.filter_square_matrix = Variable(filter_square_matrix, requires_grad=False)
        
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        
    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)
#         print(self.weight.data)
#         print(self.bias.data)

    def forward(self, input):
#         print(self.filter_square_matrix.mul(self.weight))
        return F.linear(input, self.filter_square_matrix.mul(self.weight), self.bias)

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'in_features=' + str(self.in_features) \
            + ', out_features=' + str(self.out_features) \
            + ', bias=' + str(self.bias is not None) + ')'

In [19]:
class GRUD(nn.Module):
    def __init__(self, input_size, hidden_size, X_mean, output_last = True, dropout=0):
        """
        Recurrent Neural Networks for Multivariate Times Series with Missing Values
        GRU-D: GRU exploit two representations of informative missingness patterns, i.e., masking and time interval.
        cell_size is the size of cell_state.
        
        GRU-D:
            input_size: variable dimension of each time
            hidden_size: dimension of hidden_state
            mask_size: dimension of masking vector
            X_mean: the mean of the historical input data
        """
        
        super(GRUD, self).__init__()
        
        self.hidden_size = hidden_size
        self.delta_size = input_size
        self.mask_size = input_size
        
        use_gpu = torch.cuda.is_available()
        if use_gpu:
            self.identity = torch.eye(input_size).cuda()
            self.zeros = Variable(torch.zeros(input_size).cuda())
            self.X_mean = Variable(torch.Tensor(X_mean).cuda())
        else:
            self.identity = torch.eye(input_size)
            self.zeros = Variable(torch.zeros(input_size))
            self.X_mean = Variable(torch.Tensor(X_mean))
        
        self.zl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size)
        self.rl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size)
        self.hl = nn.Linear(input_size + hidden_size + self.mask_size, hidden_size)
        
        self.gamma_x_l = FilterLinear(self.delta_size, self.delta_size, self.identity)
        
        self.gamma_h_l = nn.Linear(self.delta_size, self.delta_size)
        
        self.output_last = output_last
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(dropout)
        
    def step(self, x, x_last_obsv, x_mean, h, mask, delta):
        
        batch_size = x.shape[0]
        dim_size = x.shape[1]
        
        delta_x = torch.exp(-torch.max(self.zeros, self.gamma_x_l(delta)))
        delta_h = torch.exp(-torch.max(self.zeros, self.gamma_h_l(delta)))
        
        tmp = mask * x + (1 - mask) * (delta_x * x_last_obsv)
        x = mask * x + (1 - mask) * (delta_x * x_last_obsv + (1 - delta_x) * x_mean)
        h = delta_h * h
        
        combined = torch.cat((x, h, mask), 1)
        z = torch.sigmoid(self.zl(combined))
        r = torch.sigmoid(self.rl(combined))
        combined_r = torch.cat((x, r * h, mask), 1)
        h_tilde = torch.tanh(self.hl(combined_r))
        h = (1 - z) * h + z * h_tilde
        
        return h
    
    def forward(self, input):
        batch_size = input.size(0)
        type_size = input.size(1)
        step_size = input.size(2)
        spatial_size = input.size(3)
        
        Hidden_State = self.initHidden(batch_size)
        
        def squeeze_d1(matrix):
            return torch.squeeze(matrix, dim=1)
        X = squeeze_d1(input[:,0,:,:])
        X_last_obsv = squeeze_d1(input[:,1,:,:])
        Mask = squeeze_d1(input[:,2,:,:])
        Delta = squeeze_d1(input[:,3,:,:])
        
        outputs = None
        for i in range(step_size):
#             print("x_mean size: ")
#             print(self.X_mean.size())
            Hidden_State = self.step(squeeze_d1(X[:,i:i+1,:])\
                                     , squeeze_d1(X_last_obsv[:,i:i+1,:])\
                                     , squeeze_d1(self.X_mean[:,i:i+1,:])\
                                     , Hidden_State\
                                     , squeeze_d1(Mask[:,i:i+1,:])\
                                     , squeeze_d1(Delta[:,i:i+1,:]))
            if outputs is None:
                outputs = Hidden_State.unsqueeze(1)
            else:
                outputs = torch.cat((outputs, Hidden_State.unsqueeze(1)), 1)
                
        if self.output_last:
            last_hs = outputs[:,-1,:]
            output = F.relu(self.fc1(last_hs))
            output = self.dropout(output)
            output = self.fc2(output)
            return output
        else:
            raise Exception("Not output last")

    
    def initHidden(self, batch_size):
        use_gpu = torch.cuda.is_available()
        if use_gpu:
            Hidden_State = Variable(torch.zeros(batch_size, self.hidden_size).cuda())
            return Hidden_State
        else:
            Hidden_State = Variable(torch.zeros(batch_size, self.hidden_size))
            return Hidden_State

In [10]:
# get dataset for grud 
def dataset_aggregation(feature_array, last_obsv, mask, delta):
    # expand dimension of array
    def expd(arr):
        return np.expand_dims(arr, axis=1)
    return np.concatenate((expd(feature_array), expd(last_obsv), expd(mask), expd(delta)), axis = 1)

In [11]:
# dataset_aggregation for train, dev, test 
# train_aggr = dataset_aggregation(fea_train, last_train, masking_train, delta_train)
train_aggr = dataset_aggregation(last_train, last_train, masking_train, delta_train)

In [12]:
# dev_aggr = dataset_aggregation(fea_dev, last_dev, masking_dev, delta_dev)
# test_aggr = dataset_aggregation(fea_test, last_test, masking_test, delta_test)
dev_aggr = dataset_aggregation(last_dev, last_dev, masking_dev, delta_dev)
test_aggr = dataset_aggregation(last_test, last_test, masking_test, delta_test)

In [13]:
train_aggr[0:1,:].shape[3]

1

In [14]:
x_mean_aft_nor = np.expand_dims(x_mean_aft_nor, axis=0)

In [20]:
def train_grud(X_train, y_train, X_valid, y_valid, X_test, y_test, config, x_mean_aft_nor, dropout = 0):
    # no shuffle, keep original order 
    # swap axes for back propagation 
#     def swap_axes(nparr):
#         return nparr.swapaxes(0,1)
#     X_train = swap_axes(X_train)
#     X_valid = swap_axes(X_valid)
#     X_test = swap_axes(X_test)
    
    # model parameters
    input_size = X_train.shape[3]
    h = config["h"]
    t = X_train.shape[2]
    output_dim = 1
    dropout = config["drop"]
    
    model = GRUD(input_size, h, x_mean_aft_nor, output_last = True, dropout=dropout)
    
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    criterion = nn.MSELoss()
    
    device = torch.device('cpu')
    model = model.to(device)
    criterion = criterion.to(device)
    scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=10, factor=0.5, verbose=True)
    
    def train(model, batchsize, X_train, y_train, optimizer, criterion):
        epoch_loss = 0
        model.train()
        total_n = X_train.shape[0]
        num_batches = math.ceil(total_n / batchsize)
        for batch in range(num_batches):
            start = batch*batchsize
            end = (batch+1)*batchsize
            optimizer.zero_grad()
            batch_X = torch.Tensor(X_train[start:end, :])
            batch_y = torch.Tensor(y_train[start:end])
            predictions = model.forward(batch_X).squeeze(1)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / num_batches 
    
    def evaluate(model, X_valid, y_valid, criterion):
        epoch_loss = 0
        model.eval()
        with torch.no_grad():
            batch_X = torch.Tensor(X_valid)
            batch_y = torch.Tensor(y_valid)
            predictions = model.forward(batch_X).squeeze(1)
            epoch_loss = criterion(predictions, batch_y).item()
        return epoch_loss

    def predict(model, X_test):
        epoch_loss = 0
        model.eval()
        with torch.no_grad():
            batch_X = torch.Tensor(X_test)
            predictions = model.forward(batch_X).squeeze(1)
            predictions = predictions.cpu().data.numpy()
        return predictions

    # timing
#     start_time = time.time()
#     predictions = predict(model, X_test)
#     print(predictions.shape)
#     print(predictions)
#     end_time = time.time()
#     print(end_time-start_time)
#     assert False
     
    best_valid = 999999.0
    rand = random.randint(0,100000)
    print('epoch train_loss valid_loss')
    for epoch in range(config["num_epochs"]):
        train_loss = train(model, config["batchsize"], X_train, y_train, optimizer, criterion)
        valid_loss = evaluate(model, X_valid, y_valid, criterion)
        scheduler.step(valid_loss)
        if valid_loss <= best_valid:
            # save model
            best_valid = valid_loss
            print(epoch, train_loss, valid_loss, 'saving model')
            torch.save(model, 'models/lstm_%d.pt' %rand)
        else:
            print(epoch, train_loss, valid_loss)

    model = torch.load('models/lstm_%d.pt' %rand)

    predictions = predict(model, X_test)
    mae = np.mean(np.absolute(predictions-y_test))
    print("mae: ", mae)
    mse = np.mean((predictions - y_test)**2)
    print("mse: ", mse)
#     corr = np.corrcoef(predictions,y_test)[0][1]
#     print("corr: ", corr)
#     true_label = (y_test >= 0)
#     sys.stdout.flush()

In [21]:
x_mean_aft_nor.shape

(1, 7, 1)

In [25]:
config = {'h':64, 'lr':0.0001, 'num_epochs':50, 'batchsize':32, 'drop':0.5}
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_grud(train_aggr, label_train, dev_aggr, label_dev, test_aggr, label_test, config, x_mean_aft_nor)

epoch train_loss valid_loss
0 144.09268261137464 112.29701232910156 saving model
1 143.32800220307848 111.58708190917969 saving model
2 142.55936177571616 110.85371398925781 saving model
3 141.67873164585657 110.01898193359375 saving model
4 140.76136452811105 109.03749084472656 saving model
5 139.59604971749442 107.8071060180664 saving model
6 138.01105753580728 106.09915161132812 saving model
7 136.02139572870163 103.64372253417969 saving model
8 132.81409672328405 99.7139663696289 saving model
9 127.34080069405692 92.69762420654297 saving model
10 118.0801511492048 79.47148895263672 saving model
11 100.42741811843146 58.703277587890625 saving model
12 77.838927314395 39.956932067871094 saving model
13 58.35975156511579 29.307415008544922 saving model
14 46.32283465067545 23.608463287353516 saving model
15 39.20863233293806 20.46603775024414 saving model
16 36.831469127110076 18.812332153320312 saving model
17 32.338939757574174 18.036155700683594 saving model
18 30.797796476454963 1

In [24]:
config = {'h':64, 'lr':0.0001, 'num_epochs':50, 'batchsize':32, 'drop':0.2}
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_grud(train_aggr, label_train, dev_aggr, label_dev, test_aggr, label_test, config, x_mean_aft_nor)

epoch train_loss valid_loss
0 144.09613000778924 112.28630065917969 saving model
1 143.30677759079705 111.55638122558594 saving model
2 142.51056598481676 110.79736328125 saving model
3 141.60945311046783 109.93473815917969 saving model
4 140.5901391165597 108.89582061767578 saving model
5 139.3742937360491 107.51197052001953 saving model
6 137.61573973156158 105.55892181396484 saving model
7 135.25978379022507 102.72268676757812 saving model
8 131.56709362211683 98.08820343017578 saving model
9 125.22598520914714 89.62700653076172 saving model
10 113.65036828177315 73.9071044921875 saving model
11 93.41203689575195 52.07622146606445 saving model
12 70.14385414123535 35.50328826904297 saving model
13 52.85699953351702 26.624557495117188 saving model
14 42.01252442314511 21.877925872802734 saving model
15 35.05150236402239 19.378318786621094 saving model
16 32.28873820531936 18.186214447021484 saving model
17 28.633030528113956 17.746469497680664 saving model
18 28.058009510948544 17.71

In [23]:
config = {'h':32, 'lr':0.0001, 'num_epochs':50, 'batchsize':32, 'drop':0}
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_grud(train_aggr, label_train, dev_aggr, label_dev, test_aggr, label_test, config, x_mean_aft_nor)

epoch train_loss valid_loss
0 144.98113614036924 113.31004333496094 saving model
1 144.58987499418714 112.9237289428711 saving model
2 144.17743610200426 112.51087188720703 saving model
3 143.74056970505487 112.07484436035156 saving model
4 143.27923184349424 111.61736297607422 saving model
5 142.7927031744094 111.13607788085938 saving model
6 142.27810632614862 110.62649536132812 saving model
7 141.7314689272926 110.08468627929688 saving model
8 141.14879208519346 109.50629425048828 saving model
9 140.52348145984467 108.88167572021484 saving model
10 139.84443846203033 108.1917953491211 saving model
11 139.09578850155785 107.42391967773438 saving model
12 138.26453145345053 106.56607055664062 saving model
13 137.33827609107607 105.60845947265625 saving model
14 136.302249000186 104.53165435791016 saving model
15 135.13734109061105 103.32125854492188 saving model
16 133.82503509521484 101.95484161376953 saving model
17 132.3396987915039 100.39956665039062 saving model
18 130.6502943493

In [22]:
config = {'h':64, 'lr':0.0001, 'num_epochs':50, 'batchsize':32, 'drop':0}
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_grud(train_aggr, label_train, dev_aggr, label_dev, test_aggr, label_test, config, x_mean_aft_nor)

epoch train_loss valid_loss
0 144.08581906273253 112.28168487548828 saving model
1 143.2817153930664 111.54412841796875 saving model
2 142.4728273664202 110.77467346191406 saving model
3 141.59358433314733 109.9035415649414 saving model
4 140.56763930547805 108.83810424804688 saving model
5 139.2681179954892 107.39895629882812 saving model
6 137.4923328218006 105.37459564208984 saving model
7 135.01947203136626 102.42082214355469 saving model
8 131.2140372140067 97.54923248291016 saving model
9 124.59526025681268 88.57099151611328 saving model
10 112.18622716267903 72.04300689697266 saving model
11 91.30404772077289 50.12971496582031 saving model
12 67.83551270621163 34.187705993652344 saving model
13 50.75630832853771 25.747900009155273 saving model
14 40.43479456220354 21.32763671875 saving model
15 34.212643895830425 19.054880142211914 saving model
16 30.385031336829776 18.025062561035156 saving model
17 28.04489312853132 17.704607009887695 saving model
18 26.653306189037504 17.7532

In [26]:
config = {'h':128, 'lr':0.0001, 'num_epochs':50, 'batchsize':32, 'drop':0}
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_grud(train_aggr, label_train, dev_aggr, label_dev, test_aggr, label_test, config, x_mean_aft_nor)

epoch train_loss valid_loss
0 145.50146702357702 113.42267608642578 saving model
1 144.22535414922805 112.1179428100586 saving model
2 142.71865554082962 110.42972564697266 saving model
3 140.66197313581193 107.91881561279297 saving model
4 137.26863388788132 103.06591033935547 saving model
5 129.83041054861886 91.03114318847656 saving model
6 109.74551464262463 57.65034484863281 saving model
7 66.62122354053315 20.20960235595703 saving model
8 35.84184605734689 18.501569747924805 saving model
9 28.93174916221982 19.746389389038086
10 27.04877049582345 19.373741149902344
11 26.153835932413738 19.11964225769043
12 25.66512530190604 19.05242347717285
13 25.376947266714915 19.03957176208496
14 25.206623894827707 19.03276252746582
15 25.104502587091353 19.026987075805664
16 25.035078366597492 19.017995834350586
17 24.98800159635998 19.012653350830078
18 24.950860023498535 18.98025894165039
Epoch    20: reducing learning rate of group 0 to 5.0000e-05.
19 24.932546252296085 18.98865699768066

In [None]:
fea_train[0]

In [None]:
last_train[0]

In [None]:
masking_train[0]

In [None]:
delta_train[0]