In [1]:
import os

import numpy as np

# Step up one directory; presumably the relative data path below is meant to be
# resolved from there (TODO confirm against the repository layout).
# NOTE: this changes the working directory every time the cell is executed, so
# re-running it without restarting the kernel moves up one more level.
os.chdir(os.path.pardir)

# load data from file
save_folder = 'data/raw/pol_temp_rh'
save_file_name = [
    'fea_seq.npy',
    'last_observation_seq.npy',
    'label_seq.npy',
    'masking_seq.npy',
    'delta_seq.npy',
    'train_valid_test_split.npy',
]

# One array per file, in the same order as save_file_name.
saved_arrays = [np.load(os.path.join(save_folder, name)) for name in save_file_name]
(fea_seq,
 last_observation_seq,
 label_seq,
 masking_seq,
 delta_seq,
 train_valid_test_split) = saved_arrays

In [3]:
# train-test-split
# The first three entries of train_valid_test_split are used as the sizes of the
# train, dev and test partitions; the partitions are consecutive index ranges.
n_train = train_valid_test_split[0]
n_dev = train_valid_test_split[1]
n_test = train_valid_test_split[2]

train_index = list(range(n_train))
dev_index = list(range(n_train, n_train + n_dev))
test_index = list(range(n_train + n_dev, n_train + n_dev + n_test))

In [4]:
def get_array_by_index_range(nparray_list, label_array, index_range):
    """
    Select the rows of an index range whose label is not NaN.

    nparray_list: list of nparrays; each one is indexed with the kept row indices
    label_array: array of labels, checked for NaN at every index of index_range
    index_range: iterable of candidate row indices

    Returns a pair (selected_arrays, labels): the list of row-selected arrays
    (same order as nparray_list) and the kept labels flattened to 1-D.
    """
    # keep only the indices whose label is present
    kept = [idx for idx in index_range if not np.isnan(label_array[idx])]

    selected_arrays = [arr[kept] for arr in nparray_list]
    selected_labels = label_array[kept].reshape(-1)
    return selected_arrays, selected_labels

In [5]:
# split set to train, test and dev sets
# Every call keeps only the samples of the given index range whose label is not NaN.
seq_arrays = [fea_seq, last_observation_seq]

# train set
(fea_train, last_train), label_train = get_array_by_index_range(
    seq_arrays, label_seq, train_index)
# dev set
(fea_dev, last_dev), label_dev = get_array_by_index_range(
    seq_arrays, label_seq, dev_index)
# test set
(fea_test, last_test), label_test = get_array_by_index_range(
    seq_arrays, label_seq, test_index)

In [6]:
def normalize_feature(fea_train, array_list):
    """
    Standardize arrays with the NaN-aware mean/std of the training features.

    fea_train: training feature array; its mean and standard deviation along
        axis 0 (ignoring NaNs) define the normalization.
    array_list: [fea_dev, fea_test, last_train, last_dev, last_test] to normalize
        with the same training statistics.

    Returns (normalized fea_train, list of normalized arrays in input order).

    Entries whose training standard deviation is exactly 0 (constant features)
    are scaled by 1 instead, so they come out as 0 rather than NaN/inf.
    """
    train_mean = np.nanmean(fea_train, axis=0)
    train_std = np.nanstd(fea_train, axis=0)
    # Guard against division by zero for constant features.
    safe_std = np.where(train_std == 0, 1.0, train_std)

    def norm_arr(nparr):
        return (nparr - train_mean) / safe_std

    return (norm_arr(fea_train), [norm_arr(k) for k in array_list])

In [7]:
# Scale every split with statistics computed from the training features only.
# NOTE: fea_train (and the other arrays) are rebound to their normalized versions,
# so executing this cell a second time normalizes already-normalized data.
arrays_to_scale = [fea_dev, fea_test, last_train, last_dev, last_test]
fea_train, scaled_arrays = normalize_feature(fea_train, arrays_to_scale)
fea_dev, fea_test, last_train, last_dev, last_test = scaled_arrays

In [8]:
# record mean after normalization
# NaN-aware mean along axis 0 of fea_train, which at this point holds the
# normalized training features returned by normalize_feature.
x_mean_aft_nor = np.nanmean(fea_train, axis=0)

In [9]:
# control experiment using last observed value for missing data imputation 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from torch.autograd import Variable, grad
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math

In [10]:
class LSTM(nn.Module):
    """
    LSTM regressor: an ``nn.LSTMCell`` unrolled over time, followed by a
    two-layer head (Linear -> ReLU -> Dropout -> Linear) applied to the hidden
    state of the last time step.
    """

    def __init__(self, input_size, hidden_size, output_dim, dropout):
        """
        input_size - the number of expected features in the input x
        hidden_size - the number of hidden units in state h
        output_dim - the number of values predicted per sample
        dropout - dropout probability applied between the two linear layers
        """
        super(LSTM, self).__init__()
        self.h = hidden_size
        self.lstm = nn.LSTMCell(input_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        x: tensor of shape (time_step, batch, n_features)

        Returns a tensor of shape (batch, output_dim).
        Raises ValueError if x has no time steps.
        """
        t = x.shape[0]
        n = x.shape[1]
        if t == 0:
            raise ValueError("LSTM.forward needs at least one time step")

        # Allocate the initial states with the input's dtype and device so the
        # model also works after .double() or on a non-CPU device.
        hx = x.new_zeros(n, self.h)
        cx = x.new_zeros(n, self.h)

        # iterate through cells; only the most recent state is kept
        for i in range(t):
            hx, cx = self.lstm(x[i], (hx, cx))

        # The final states stay available as attributes, as they were before.
        self.hx, self.cx = hx, cx

        # the last hidden state is n * h
        output = F.relu(self.fc1(hx))
        output = self.dropout(output)
        output = self.fc2(output)
        return output
    
def train_lstm(X_train, y_train, X_valid, y_valid, X_test, y_test, config):
    """
    Train an LSTM regressor, keep the weights with the lowest validation loss
    and print MAE / MSE on the test set.

    X_train, X_valid, X_test: numpy arrays indexed (sample, time_step, feature);
        the first two axes are swapped before they are fed to the model.
    y_train, y_valid, y_test: one target value per sample.
    config: dict with the keys "h" (hidden size), "drop" (dropout probability),
        "lr" (Adam learning rate), "batchsize" and "num_epochs".

    Side effects: prints one line per epoch plus the final metrics, and writes
    the best model so far to 'models/lstm_<random int>.pt'.
    Returns None.
    Raises ValueError if X_train contains no samples.
    """
    # no shuffle, keep original order
    # swap axes so that time is the leading axis expected by the model
    def swap_axes(nparr):
        return nparr.swapaxes(0,1)
    X_train = swap_axes(X_train)
    X_valid = swap_axes(X_valid)
    X_test = swap_axes(X_test)

    if X_train.shape[1] == 0:
        raise ValueError("X_train contains no samples")

    # model parameters
    input_size = X_train.shape[2]
    h = config["h"]
    output_dim = 1
    dropout = config["drop"]

    model = LSTM(input_size, h, output_dim, dropout)

    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    criterion = nn.MSELoss()

    device = torch.device('cpu')
    model = model.to(device)
    criterion = criterion.to(device)
    scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=10, factor=0.5, verbose=True)

    # Convert every array once instead of once per batch and epoch.
    def to_tensor(arr):
        return torch.Tensor(arr).to(device)
    X_train_t, y_train_t = to_tensor(X_train), to_tensor(y_train)
    X_valid_t, y_valid_t = to_tensor(X_valid), to_tensor(y_valid)
    X_test_t = to_tensor(X_test)

    def train(model, batchsize, X_train, y_train, optimizer, criterion):
        """Run one epoch of mini-batch updates; return the mean batch loss."""
        epoch_loss = 0
        model.train()
        total_n = X_train.shape[1]
        num_batches = math.ceil(total_n / batchsize)
        for batch in range(num_batches):
            start = batch*batchsize
            end = (batch+1)*batchsize
            optimizer.zero_grad()
            # samples live on axis 1 after the swap
            batch_X = X_train[:, start:end]
            batch_y = y_train[start:end]
            predictions = model(batch_X).squeeze(1)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / num_batches

    def evaluate(model, X_valid, y_valid, criterion):
        """Return the loss on the whole validation set (single batch, no grad)."""
        model.eval()
        with torch.no_grad():
            predictions = model(X_valid).squeeze(1)
            epoch_loss = criterion(predictions, y_valid).item()
        return epoch_loss

    def predict(model, X_test):
        """Return the model predictions for X_test as a numpy array."""
        model.eval()
        with torch.no_grad():
            predictions = model(X_test).squeeze(1)
            predictions = predictions.cpu().data.numpy()
        return predictions

    best_valid = float("inf")
    # In-memory copy of the best weights. Restoring from this copy avoids
    # unpickling a whole module via torch.load, whose behavior depends on the
    # installed PyTorch version.
    best_state = None
    rand = random.randint(0,100000)
    checkpoint_path = 'models/lstm_%d.pt' %rand
    # torch.save does not create missing directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    print('epoch train_loss valid_loss')
    for epoch in range(config["num_epochs"]):
        train_loss = train(model, config["batchsize"], X_train_t, y_train_t, optimizer, criterion)
        valid_loss = evaluate(model, X_valid_t, y_valid_t, criterion)
        scheduler.step(valid_loss)
        if valid_loss <= best_valid:
            # save model
            best_valid = valid_loss
            print(epoch, train_loss, valid_loss, 'saving model')
            best_state = {name: value.detach().clone()
                          for name, value in model.state_dict().items()}
            torch.save(model, checkpoint_path)
        else:
            print(epoch, train_loss, valid_loss)

    # Restore the best weights; if no epoch ever improved (e.g. zero epochs or
    # NaN losses) the current weights are evaluated instead of failing.
    if best_state is not None:
        model.load_state_dict(best_state)

    predictions = predict(model, X_test_t)
    mae = np.mean(np.absolute(predictions-y_test))
    print("mae: ", mae)
    mse = np.mean((predictions - y_test)**2)
    print("mse: ", mse)
#     corr = np.corrcoef(predictions,y_test)[0][1]
#     print("corr: ", corr)
#     true_label = (y_test >= 0)
#     sys.stdout.flush()

In [18]:
# last_train[:,:,1:]

In [14]:
# Seed numpy and torch before building the model.
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

config = {
    'h': 32,
    'lr': 0.0001,
    'num_epochs': 150,
    'batchsize': 32,
    'drop': 0.5,
}

# Train on the last-observation arrays with index 0 of the last axis dropped.
feature_slice = slice(1, None)
train_lstm(
    last_train[:, :, feature_slice], label_train,
    last_dev[:, :, feature_slice], label_dev,
    last_test[:, :, feature_slice], label_test,
    config,
)

epoch train_loss valid_loss
0 146.69658733549574 114.76644897460938 saving model
1 146.245848156157 114.44473266601562 saving model
2 145.9383356003534 114.11944580078125 saving model
3 145.57897258940199 113.78263854980469 saving model
4 145.13750821068174 113.42277526855469 saving model
5 144.74981435139975 113.03050231933594 saving model
6 144.3395254952567 112.59529876708984 saving model
7 143.816528683617 112.10729217529297 saving model
8 143.3779580252511 111.56178283691406 saving model
9 142.69454556419737 110.94979095458984 saving model
10 141.87390500023253 110.23274993896484 saving model
11 141.12169174920945 109.38721466064453 saving model
12 140.1775138491676 108.37999725341797 saving model
13 138.97293127150763 107.15682220458984 saving model
14 137.56603349958147 105.64766693115234 saving model
15 135.86024765741257 103.73005676269531 saving model
16 133.3707057407924 101.1859359741211 saving model
17 129.87890407017298 97.73564147949219 saving model
18 126.33740779331752

In [17]:
# Seed numpy and torch before building the model.
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

config = {
    'h': 32,
    'lr': 0.0001,
    'num_epochs': 150,
    'batchsize': 32,
    'drop': 0.5,
}

# Train on the last-observation arrays restricted to index 0 of the last axis
# (the slice keeps that axis, with length 1).
feature_slice = slice(0, 1)
train_lstm(
    last_train[:, :, feature_slice], label_train,
    last_dev[:, :, feature_slice], label_dev,
    last_test[:, :, feature_slice], label_test,
    config,
)

epoch train_loss valid_loss
0 143.39490799676804 112.08809661865234 saving model
1 142.88226572672525 111.64618682861328 saving model
2 142.45459856305803 111.20196533203125 saving model
3 142.03448631649925 110.74302673339844 saving model
4 141.53744325183686 110.25714874267578 saving model
5 140.9085736955915 109.73889923095703 saving model
6 140.32575334821428 109.16926574707031 saving model
7 139.70548502604166 108.6005859375 saving model
8 139.0303475516183 107.97180938720703 saving model
9 138.4253892444429 107.21587371826172 saving model
10 137.373413449242 106.23810577392578 saving model
11 136.09661356608072 104.85802459716797 saving model
12 134.3394041515532 102.77561950683594 saving model
13 131.76061321440199 99.80361938476562 saving model
14 127.65883273170108 95.62702178955078 saving model
15 122.50021798270089 89.80530548095703 saving model
16 116.0691379365467 82.45384979248047 saving model
17 106.96993310110909 74.1333236694336 saving model
18 97.15976987566266 65.793

In [19]:
# Display the test-split labels; NaN labels were already dropped when the split
# was built by get_array_by_index_range.
label_test

array([13. , 10.8, 12.6,  4. ,  8.4, 10.6, 15.3,  3.8, 13.1,  4.5, 12. ,
        6.8,  5.1,  6.3,  4.3,  5.8,  9. , 15. ,  5.2,  9.2,  2.8, 12.3,
       11.3,  3.4, 13.5, 10.2,  5.1, 13.3,  8.4,  3.2, 18.2,  6.4,  3.9,
        8.8, 11.9,  7.1,  5.3,  1.8,  8.3,  4.8, 12.6, 13.8, 10. , 15.5,
       17.6,  4.5, 10.6,  3. ,  6.4,  3.1,  5.7,  6.7, 13. , 12.7,  9. ,
       10.5, 12.6,  6.7,  8. ,  6.1,  5.9, 12.9,  8.3, 10.9, 12.9,  9.4,
       11.9,  7.4,  8.3,  9.6,  9. ,  5. , 10.8,  7.1, 12.5, 18. ,  5.8,
        8.2, 12.9, 12.5,  4.6,  4. ,  6.7,  7. ,  5.9,  4.4, 11.2,  8.7,
        6.6,  8.4,  8.9, 15.5,  6.2,  3.5,  9.8,  3.6,  9.9,  9.9,  2.3,
        6. ,  5.4,  4.7, 10. ,  4.7,  3.5,  7.2, 22.9, 12.8, 10.5,  8.8,
        6. ,  3.2, 15.1,  4.8, 12.7,  2.2, 10.5,  6.3, 17.8,  4.2,  4.2])