In [1]:
import os
import numpy as np

# Step up to the parent directory; the data folder below is given relative to it.
# NOTE: this is not idempotent - every re-run of the cell climbs one more level.
os.chdir(os.path.pardir)

# load data from file
save_folder = 'data/raw/pol_temp_rh'
save_file_name = ['fea_seq.npy', 'last_observation_seq.npy', 'label_seq.npy', 'masking_seq.npy',
                  'delta_seq.npy', 'train_valid_test_split.npy']
# one array per file, in the same order as save_file_name
saved_arrays = [np.load(os.path.join(save_folder, name)) for name in save_file_name]
(fea_seq, last_observation_seq, label_seq,
 masking_seq, delta_seq, train_valid_test_split) = saved_arrays

In [2]:
# train-test-split: three consecutive index ranges whose lengths are the
# first three entries of train_valid_test_split
train_end = train_valid_test_split[0]
dev_end = train_end + train_valid_test_split[1]
test_end = dev_end + train_valid_test_split[2]

train_index = list(range(train_end))
dev_index = list(range(train_end, dev_end))
test_index = list(range(dev_end, test_end))

In [5]:
def get_array_by_index_range(nparray_list, label_array, index_range):
    '''
    Keep only the indices of index_range whose label is not NaN.

    nparray_list: list of nparrays, each indexed along its first axis
    label_array: nparray of labels, indexed the same way
    index_range: iterable of candidate indices

    Returns (selected_arrays, selected_labels): every array of nparray_list
    restricted to the kept indices, and the matching labels flattened to 1-D.
    '''
    # indices whose label is present
    keep = [idx for idx in index_range if not np.isnan(label_array[idx])]

    selected_arrays = [arr[keep] for arr in nparray_list]
    selected_labels = label_array[keep].reshape(-1)
    return selected_arrays, selected_labels

In [6]:
# split set to train, test and dev sets
# the same two sequence arrays are sliced for every split
seq_arrays = [fea_seq, last_observation_seq]

# train set
(fea_train, last_train), label_train = get_array_by_index_range(seq_arrays, label_seq, train_index)
# dev set
(fea_dev, last_dev), label_dev = get_array_by_index_range(seq_arrays, label_seq, dev_index)
# test set
(fea_test, last_test), label_test = get_array_by_index_range(seq_arrays, label_seq, test_index)

In [15]:
def normalize_feature(fea_train, array_list):
    """
    Standardise arrays with statistics computed from the training features.

    The mean and standard deviation are taken over axis 0 of fea_train with
    NaNs ignored, and the same statistics are applied to every other array.

    fea_train: nparray of training features (may contain NaN)
    array_list: list of nparrays to normalize with the training statistics,
        e.g. [fea_dev, fea_test, last_train, last_dev, last_test]; each must
        broadcast against the per-position statistics of fea_train

    Returns (normalized_fea_train, [normalized arrays, same order as array_list]).
    NaN entries stay NaN.
    """
    train_mean = np.nanmean(fea_train, axis=0)
    train_std = np.nanstd(fea_train, axis=0)
    # A feature that is constant in the training set has std == 0; dividing by
    # it would produce inf/NaN. Scale such features by 1 so they map to 0.
    safe_std = np.where(train_std == 0, 1.0, train_std)

    def norm_arr(nparr):
        return (nparr - train_mean) / safe_std

    return (norm_arr(fea_train), [norm_arr(k) for k in array_list])

In [16]:
# Normalise every split with the statistics of fea_train.
# NOTE: fea_train and the other arrays are overwritten, so running this cell
# a second time normalises already-normalised data.
arrays_to_normalize = [fea_dev, fea_test, last_train, last_dev, last_test]
fea_train, normalized_arrays = normalize_feature(fea_train, arrays_to_normalize)
[fea_dev, fea_test, last_train, last_dev, last_test] = normalized_arrays

In [19]:
# record mean after normalization (taken over axis 0 of fea_train, NaNs ignored)
x_mean_aft_nor = np.nanmean(fea_train, 0)

In [21]:
# control experiment using last observed value for missing data imputation 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from torch.autograd import Variable, grad
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math

In [22]:
class LSTM(nn.Module):
    """
    Single-layer LSTM (unrolled with nn.LSTMCell) followed by a two-layer
    fully connected head applied to the last hidden state.
    """

    def __init__(self, input_size, hidden_size, output_dim, dropout):
        """
        input_size - the number of expected features in the input x
        hidden_size - the number of hidden units in state h
        output_dim - size of the final linear layer's output
        dropout - dropout probability applied between the two linear layers
        """
        super(LSTM, self).__init__()
        self.h = hidden_size
        self.lstm = nn.LSTMCell(input_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        x: tensor of shape (time_step, batch, n_features)

        Returns a tensor of shape (batch, output_dim) computed from the hidden
        state after the last time step.

        Raises IndexError if x has no time steps.
        """
        t = x.shape[0]
        n = x.shape[1]
        if t == 0:
            raise IndexError("LSTM.forward needs at least one time step")

        # Zero initial state created from x so that it follows x's device and
        # dtype instead of always being a CPU float32 tensor.
        hx = x.new_zeros(n, self.h)
        cx = x.new_zeros(n, self.h)

        # iterate through cells; only the final state is needed afterwards
        for i in range(t):
            hx, cx = self.lstm(x[i], (hx, cx))

        # The final state stays reachable as attributes, as it was before.
        self.hx, self.cx = hx, cx

        # hx is n * h
        output = F.relu(self.fc1(hx))
        output = self.dropout(output)
        output = self.fc2(output)
        return output
    
def train_lstm(X_train, y_train, X_valid, y_valid, X_test, y_test, config):
    """
    Train an LSTM regressor, keep the weights with the lowest validation loss
    and report MAE / MSE on the test set.

    X_train, X_valid, X_test: nparrays of shape (n_samples, time_step, n_features)
    y_train, y_valid, y_test: nparrays of targets, one per sample (flattened to 1-D)
    config: dict with keys
        "h"          - hidden size of the LSTM
        "drop"       - dropout probability
        "lr"         - Adam learning rate
        "batchsize"  - mini-batch size used for training
        "num_epochs" - number of training epochs

    Prints one line per epoch plus the final metrics, writes the best model to
    'models/lstm_<random int>.pt', and returns {"mae": ..., "mse": ...}.
    """
    import copy  # local import: only used for the in-memory best-weights snapshot

    # no shuffle, keep original order
    # swap axes so that time is the leading axis, as LSTM.forward iterates over it
    def swap_axes(nparr):
        return nparr.swapaxes(0, 1)
    X_train = swap_axes(X_train)
    X_valid = swap_axes(X_valid)
    X_test = swap_axes(X_test)

    # Flatten targets so that an (n, 1) label array cannot silently broadcast
    # against the (n,) predictions in the loss or in the metrics.
    y_train = np.asarray(y_train).reshape(-1)
    y_valid = np.asarray(y_valid).reshape(-1)
    y_test = np.asarray(y_test).reshape(-1)

    # model parameters
    input_size = X_train.shape[2]
    h = config["h"]
    output_dim = 1
    dropout = config["drop"]

    model = LSTM(input_size, h, output_dim, dropout)

    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    criterion = nn.MSELoss()

    device = torch.device('cpu')
    model = model.to(device)
    criterion = criterion.to(device)
    # `verbose=True` is deprecated in recent PyTorch releases; learning-rate
    # reductions are reported explicitly in the epoch loop instead.
    scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=10, factor=0.5)

    def train(model, batchsize, X_train, y_train, optimizer, criterion):
        """Run one epoch of mini-batch training; return the mean batch loss."""
        epoch_loss = 0
        model.train()
        total_n = X_train.shape[1]
        num_batches = math.ceil(total_n / batchsize)
        for batch in range(num_batches):
            start = batch * batchsize
            end = (batch + 1) * batchsize
            optimizer.zero_grad()
            batch_X = torch.Tensor(X_train[:, start:end]).to(device)
            batch_y = torch.Tensor(y_train[start:end]).to(device)
            predictions = model(batch_X).squeeze(1)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / num_batches

    def evaluate(model, X_valid, y_valid, criterion):
        """Return the loss on the whole validation set, evaluated as one batch."""
        model.eval()
        with torch.no_grad():
            batch_X = torch.Tensor(X_valid).to(device)
            batch_y = torch.Tensor(y_valid).to(device)
            predictions = model(batch_X).squeeze(1)
            epoch_loss = criterion(predictions, batch_y).item()
        return epoch_loss

    def predict(model, X_test):
        """Return the model's predictions for X_test as a 1-D nparray."""
        model.eval()
        with torch.no_grad():
            batch_X = torch.Tensor(X_test).to(device)
            predictions = model(batch_X).squeeze(1)
            predictions = predictions.cpu().data.numpy()
        return predictions

    best_valid = float('inf')
    best_state = None
    rand = random.randint(0, 100000)
    model_path = 'models/lstm_%d.pt' % rand
    # torch.save does not create missing directories
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    print('epoch train_loss valid_loss')
    for epoch in range(config["num_epochs"]):
        train_loss = train(model, config["batchsize"], X_train, y_train, optimizer, criterion)
        valid_loss = evaluate(model, X_valid, y_valid, criterion)

        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(valid_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print('Epoch %5d: reducing learning rate of group 0 to %.4e.' % (epoch, lr_after))

        if valid_loss <= best_valid:
            # save model
            best_valid = valid_loss
            print(epoch, train_loss, valid_loss, 'saving model')
            # Snapshot the weights in memory so restoring the best epoch does
            # not depend on unpickling a whole module from disk.
            best_state = copy.deepcopy(model.state_dict())
            torch.save(model, model_path)
        else:
            print(epoch, train_loss, valid_loss)

    # Restore the best weights; if no epoch improved (e.g. zero epochs or NaN
    # validation loss) the model is evaluated as it stands.
    if best_state is not None:
        model.load_state_dict(best_state)

    predictions = predict(model, X_test)
    mae = np.mean(np.absolute(predictions - y_test))
    print("mae: ", mae)
    mse = np.mean((predictions - y_test) ** 2)
    print("mse: ", mse)
    return {"mae": mae, "mse": mse}

In [27]:
# run: hidden size 32, dropout 0.5, trained on the last-observation arrays
config = dict(h=32, lr=0.0001, num_epochs=50, batchsize=32, drop=0.5)
seed = 123
# seed numpy and torch (CPU and CUDA generators) with the same value
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)
train_lstm(last_train, label_train, last_dev, label_dev, last_test, label_test, config)

epoch train_loss valid_loss
0 148.51093546549478 116.60440063476562 saving model
1 148.29742358979723 116.32820129394531 saving model
2 147.93506440662202 116.04935455322266 saving model
3 147.6263173421224 115.76716613769531 saving model
4 147.3431425548735 115.47895050048828 saving model
5 147.0048344930013 115.17292022705078 saving model
6 146.71435219900948 114.84748077392578 saving model
7 146.2334703717913 114.49068450927734 saving model
8 145.88888004847936 114.0997314453125 saving model
9 145.56317283993675 113.64879608154297 saving model
10 144.92964317685082 113.11221313476562 saving model
11 144.3488039289202 112.47896575927734 saving model
12 143.73105802990142 111.7404556274414 saving model
13 143.0209448678153 110.87417602539062 saving model
14 141.92852892194475 109.79824829101562 saving model
15 140.88487352643693 108.43879699707031 saving model
16 139.21480996268136 106.72991180419922 saving model
17 137.60929470970518 104.52569580078125 saving model
18 135.17953709193

In [26]:
# run: hidden size 32, dropout 0.2, trained on the last-observation arrays
config = dict(h=32, lr=0.0001, num_epochs=50, batchsize=32, drop=0.2)
seed = 123
# seed numpy and torch (CPU and CUDA generators) with the same value
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)
train_lstm(last_train, label_train, last_dev, label_dev, last_test, label_test, config)

epoch train_loss valid_loss
0 148.5558355422247 116.59574127197266 saving model
1 148.21519942510696 116.31136322021484 saving model
2 147.90960003080824 116.02476501464844 saving model
3 147.5915716262091 115.7329330444336 saving model
4 147.309083484468 115.42832946777344 saving model
5 146.9377688453311 115.10294342041016 saving model
6 146.59620848156158 114.75729370117188 saving model
7 146.1917495727539 114.37591552734375 saving model
8 145.75304049537294 113.94268035888672 saving model
9 145.31788925897507 113.43444061279297 saving model
10 144.7667995634533 112.83334350585938 saving model
11 144.1587346394857 112.13513946533203 saving model
12 143.35314360119048 111.29439544677734 saving model
13 142.49611009870256 110.27655029296875 saving model
14 141.31615193684897 109.0082778930664 saving model
15 140.0712429228283 107.42472076416016 saving model
16 138.28850846063523 105.38589477539062 saving model
17 136.15179298037575 102.67196655273438 saving model
18 133.11468178885323

In [25]:
# run: hidden size 64, dropout 0.7, trained on the last-observation arrays
config = dict(h=64, lr=0.0001, num_epochs=50, batchsize=32, drop=0.7)
seed = 123
# seed numpy and torch (CPU and CUDA generators) with the same value
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)
train_lstm(last_train, label_train, last_dev, label_dev, last_test, label_test, config)

epoch train_loss valid_loss
0 145.86148470924013 113.95573425292969 saving model
1 145.37368956066314 113.41845703125 saving model
2 144.76148441859655 112.84800720214844 saving model
3 144.02312869117372 112.22523498535156 saving model
4 143.5881878080822 111.51858520507812 saving model
5 142.65486689976282 110.6669692993164 saving model
6 141.84251912434897 109.58648681640625 saving model
7 140.47974504743303 108.09255981445312 saving model
8 138.62728046235583 105.80789184570312 saving model
9 135.8606422061012 102.10462188720703 saving model
10 131.5003851027716 95.84728240966797 saving model
11 123.5914768037342 85.19403076171875 saving model
12 109.90644491286506 69.47418975830078 saving model
13 92.20011175246466 51.453765869140625 saving model
14 72.81600225539435 35.94103240966797 saving model
15 56.37120810009184 25.953584671020508 saving model
16 46.952756518409366 20.673221588134766 saving model
17 41.3256456284296 18.431781768798828 saving model
18 39.91825566973005 17.769

In [24]:
# run: hidden size 64, dropout 0.2, trained on the last-observation arrays
config = dict(h=64, lr=0.0001, num_epochs=50, batchsize=32, drop=0.2)
seed = 123
# seed numpy and torch (CPU and CUDA generators) with the same value
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)
train_lstm(last_train, label_train, last_dev, label_dev, last_test, label_test, config)

epoch train_loss valid_loss
0 145.8687250046503 113.93003845214844 saving model


  "type " + obj.__name__ + ". It won't be checked "


1 145.2564250401088 113.3586196899414 saving model
2 144.67695762997582 112.74771118164062 saving model
3 143.9765813918341 112.06239318847656 saving model
4 143.2735868181501 111.26143646240234 saving model
5 142.30578104654947 110.25984954833984 saving model
6 141.22912379673548 108.90699005126953 saving model
7 139.58508882068452 106.89680480957031 saving model
8 137.15960729689826 103.6801986694336 saving model
9 133.09564862932478 98.20018768310547 saving model
10 125.97628566196987 88.63983154296875 saving model
11 113.95888991582962 73.68605041503906 saving model
12 96.03153610229492 55.171356201171875 saving model
13 74.71212423415412 38.2042121887207 saving model
14 55.68081038338797 26.46379280090332 saving model
15 42.18146242414202 20.259355545043945 saving model
16 34.25321951366606 18.00419044494629 saving model
17 30.38073171888079 17.69477081298828 saving model
18 28.301763307480584 18.02503776550293
19 27.380786214556014 18.455280303955078
20 26.239308947608585 18.7534

In [3]:
# Display the first sequence of the last-observation array. In the output
# below, the positions that are NaN in fea_seq[0] hold the most recent
# earlier value of the same column.
last_observation_seq[0]

array([[ 14.3      ,  90.       ,  62.8125   ,  64.       ,  51.5416665],
       [ 14.3      ,  83.       ,  68.       ,  70.       ,  57.75     ],
       [ 14.3      ,  98.       ,  73.875    ,  67.       ,  58.       ],
       [  9.7      , 100.       ,  68.7083335,  67.       ,  49.416667 ],
       [  9.7      ,  98.       ,  71.6041665,  75.       ,  60.645833 ],
       [  9.7      ,  99.       ,  78.1875   ,  74.       ,  64.5833335],
       [  7.4      ,  99.       ,  71.2083335,  65.       ,  53.4375   ]])

In [4]:
# Display the first raw feature sequence; missing entries appear as NaN
# (compare with last_observation_seq[0], where those entries are filled).
fea_seq[0]

array([[ 14.3      ,  90.       ,  62.8125   ,  64.       ,  51.5416665],
       [        nan,  83.       ,  68.       ,  70.       ,  57.75     ],
       [        nan,  98.       ,  73.875    ,  67.       ,  58.       ],
       [  9.7      , 100.       ,  68.7083335,  67.       ,  49.416667 ],
       [        nan,  98.       ,  71.6041665,  75.       ,  60.645833 ],
       [        nan,  99.       ,  78.1875   ,  74.       ,  64.5833335],
       [  7.4      ,  99.       ,  71.2083335,  65.       ,  53.4375   ]])