In [1]:
import os

import numpy as np

# Folder (relative to the working directory) and file names of the saved arrays.
save_folder = 'data/raw/predict-one-day-diff/pol-met-search'
save_file_name = ['fea_seq.npy', 'last_observation_seq.npy', 'label_seq.npy', 'masking_seq.npy',
                   'delta_seq.npy', 'train_valid_test_split.npy']

# Step up to the parent directory only when the data folder is not already
# reachable from the current working directory. An unconditional chdir moves
# one level further up every time this cell is re-run, which breaks the
# relative paths below on the second execution.
if not os.path.isdir(save_folder):
    os.chdir(os.path.pardir)

# load data from file
saved_arrays = []
for file_name in save_file_name:
    saved_arrays.append(np.load(os.path.join(save_folder, file_name)))
[fea_seq, last_observation_seq, label_seq, masking_seq, delta_seq, train_valid_test_split] = saved_arrays

In [2]:
# Consecutive index ranges for the train / dev / test partitions; the three
# partition sizes are read from train_valid_test_split[0], [1] and [2].
train_index = list(range(train_valid_test_split[0]))
dev_index = list(range(train_valid_test_split[0], sum(train_valid_test_split[:2])))
test_index = list(range(sum(train_valid_test_split[:2]), sum(train_valid_test_split[:3])))

In [3]:
def get_array_by_index_range(nparray_list, label_array, index_range):
    '''
    Select the entries of index_range whose label is not NaN.

    nparray_list: list of nparrays, each indexed along its first axis
    label_array: array of labels; entries that are NaN are dropped
    index_range: iterable of candidate indices

    Returns (list of selected arrays, selected labels flattened to 1-D).
    '''
    # indices whose label is present
    non_na_index = [idx for idx in index_range if not np.isnan(label_array[idx])]

    selected_arrays = [arr[non_na_index] for arr in nparray_list]
    selected_labels = label_array[non_na_index].reshape(-1)
    return selected_arrays, selected_labels

In [4]:
# Build the train / dev / test subsets of the feature and last-observation
# arrays, keeping only the samples whose label is not NaN.
# train set
(fea_train, last_train), label_train = get_array_by_index_range(
    [fea_seq, last_observation_seq], label_seq, train_index)
# dev set
(fea_dev, last_dev), label_dev = get_array_by_index_range(
    [fea_seq, last_observation_seq], label_seq, dev_index)
# test set
(fea_test, last_test), label_test = get_array_by_index_range(
    [fea_seq, last_observation_seq], label_seq, test_index)

In [5]:
def normalize_feature(fea_train, array_list):
    """
    Standardize arrays using NaN-aware statistics of the training features.

    The mean and standard deviation are computed over axis 0 of fea_train
    (ignoring NaNs) and the same statistics are applied to every array.

    fea_train: training feature array the statistics are computed from
    array_list: [fea_dev, fea_test, last_train, last_dev, last_test] to normalize

    Returns (normalized fea_train, list of normalized arrays in the order given).
    """
    train_mean = np.nanmean(fea_train, axis=0)
    train_std = np.nanstd(fea_train, axis=0)
    # A feature that is constant over the training set has std == 0; dividing
    # by it would produce inf/NaN. Divide by 1 instead so such entries simply
    # become (value - mean).
    safe_std = np.where(train_std == 0, np.ones_like(train_std), train_std)

    def norm_arr(nparr):
        return (nparr - train_mean) / safe_std

    return (norm_arr(fea_train), [norm_arr(k) for k in array_list])

In [6]:
# Standardize every split with the statistics of the (un-normalized) training features.
# NOTE(review): fea_train and the other arrays are overwritten with their
# normalized versions, so re-running this cell normalizes already-normalized data.
(fea_train,
 [fea_dev, fea_test, last_train, last_dev, last_test]) = normalize_feature(
    fea_train, [fea_dev, fea_test, last_train, last_dev, last_test])

# per-position mean of the training features after normalization
x_mean_aft_nor = np.nanmean(fea_train, axis=0)

In [7]:
# control experiment using last observed value for missing data imputation 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from torch.autograd import Variable, grad
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math

In [8]:
class LSTM(nn.Module):
    """
    Single LSTMCell unrolled over time, followed by a two-layer head
    (Linear -> ReLU -> Dropout -> Linear) applied to the last hidden state.
    """

    def __init__(self, input_size, hidden_size, output_dim, dropout):
        """
        input_size - the number of expected features in the input x
        hidden_size - the number of hidden units in state h
        output_dim - size of the final linear layer's output
        dropout - dropout probability applied between the two linear layers
        """
        super(LSTM, self).__init__()
        self.h = hidden_size
        self.lstm = nn.LSTMCell(input_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        x: shape (time_step, batch, n_features)

        Returns a tensor of shape (batch, output_dim) computed from the hidden
        state after the last time step. The final hidden and cell states are
        also kept on the module as self.hx / self.cx.
        """
        t = x.shape[0]
        n = x.shape[1]
        # Create the initial state with the dtype and device of the input so the
        # module also works when it is not running on the default float32/CPU setup.
        hx = x.new_zeros(n, self.h)
        cx = x.new_zeros(n, self.h)
        # iterate through cells
        for i in range(t):
            hx, cx = self.lstm(x[i], (hx, cx))
        # keep the final states available as attributes, as before
        self.hx, self.cx = hx, cx
        # last hidden state hx is n * h
        output = F.relu(self.fc1(hx))
        output = self.dropout(output)
        output = self.fc2(output)
        return output
    
def train_lstm(X_train, y_train, X_valid, y_valid, X_test, y_test, config):
    """
    Train an LSTM regressor, checkpoint on the best validation loss and print
    the test MAE / MSE of the best checkpoint.

    X_train, X_valid, X_test: numpy arrays indexed (sample, time_step, feature);
        they are swapped to (time_step, sample, feature) before being fed to the model.
    y_train, y_valid, y_test: target arrays aligned with the sample axis.
    config: dict with keys "h" (hidden size), "lr", "num_epochs", "batchsize"
        and "drop" (dropout probability).

    Side effects: prints one line per epoch plus the final metrics, and writes
    the best weights to models/lstm_<random int>.pt. Returns None.
    """
    import os  # local import: only needed to create the checkpoint folder

    # no shuffle, keep original order
    # swap axes so that time is the leading axis expected by the model
    def swap_axes(nparr):
        return nparr.swapaxes(0, 1)
    X_train = swap_axes(X_train)
    X_valid = swap_axes(X_valid)
    X_test = swap_axes(X_test)

    # model parameters
    input_size = X_train.shape[2]
    h = config["h"]
    output_dim = 1
    dropout = config["drop"]

    model = LSTM(input_size, h, output_dim, dropout)

    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    criterion = nn.MSELoss()

    device = torch.device('cpu')
    model = model.to(device)
    criterion = criterion.to(device)
    # `verbose` is deprecated on the LR schedulers in newer PyTorch releases;
    # fall back to the plain constructor if the argument is rejected.
    try:
        scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=10, factor=0.5, verbose=True)
    except TypeError:
        scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=10, factor=0.5)

    def train(model, batchsize, X_train, y_train, optimizer, criterion):
        """Run one pass over the training data in order; return the mean batch loss."""
        epoch_loss = 0
        model.train()
        total_n = X_train.shape[1]
        num_batches = math.ceil(total_n / batchsize)
        for batch in range(num_batches):
            start = batch * batchsize
            end = (batch + 1) * batchsize
            optimizer.zero_grad()
            batch_X = torch.Tensor(X_train[:, start:end])
            batch_y = torch.Tensor(y_train[start:end])
            # call the module (not .forward) so registered hooks are honoured
            predictions = model(batch_X).squeeze(1)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / num_batches

    def evaluate(model, X_valid, y_valid, criterion):
        """Return the loss on the whole validation set, evaluated as a single batch."""
        model.eval()
        with torch.no_grad():
            batch_X = torch.Tensor(X_valid)
            batch_y = torch.Tensor(y_valid)
            predictions = model(batch_X).squeeze(1)
            epoch_loss = criterion(predictions, batch_y).item()
        return epoch_loss

    def predict(model, X_test):
        """Return the model predictions for X_test as a 1-D numpy array."""
        model.eval()
        with torch.no_grad():
            batch_X = torch.Tensor(X_test)
            predictions = model(batch_X).squeeze(1)
            predictions = predictions.cpu().data.numpy()
        return predictions

    best_valid = 999999.0
    rand = random.randint(0, 100000)
    checkpoint_path = 'models/lstm_%d.pt' % rand
    # make sure the checkpoint folder exists before the first save
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    checkpoint_saved = False
    print('epoch train_loss valid_loss')
    for epoch in range(config["num_epochs"]):
        train_loss = train(model, config["batchsize"], X_train, y_train, optimizer, criterion)
        valid_loss = evaluate(model, X_valid, y_valid, criterion)
        scheduler.step(valid_loss)
        if valid_loss <= best_valid:
            # save model
            best_valid = valid_loss
            print(epoch, train_loss, valid_loss, 'saving model')
            # Save only the weights: pickling the whole module ties the file to
            # this notebook's class definition and triggers serialization warnings.
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            print(epoch, train_loss, valid_loss)

    if checkpoint_saved:
        # restore the weights with the best validation loss
        model.load_state_dict(torch.load(checkpoint_path))
    else:
        # e.g. num_epochs == 0 or a validation loss that never compared <= best_valid (NaN)
        print('no checkpoint was saved; evaluating the current weights')

    predictions = predict(model, X_test)
    mae = np.mean(np.absolute(predictions - y_test))
    print("mae: ", mae)
    mse = np.mean((predictions - y_test) ** 2)
    print("mse: ", mse)

In [9]:
# Shape check of the [1:5] slice along the last axis of the last-observation
# training array (the slice passed to train_lstm in the "met" experiment).
last_train[:,:,1:5].shape

(664, 7, 4)

In [13]:
# Experiment "met": train on columns 1..4 of the last-observation arrays.
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
config = dict(h=32, lr=0.0001, num_epochs=150, batchsize=32, drop=0.5)
train_lstm(
    last_train[:, :, 1:5], label_train,
    last_dev[:, :, 1:5], label_dev,
    last_test[:, :, 1:5], label_test,
    config,
)

epoch train_loss valid_loss
0 41.23611295790899 33.683197021484375 saving model
1 41.24767230805897 33.67363739013672 saving model
2 41.266623996552966 33.66393280029297 saving model
3 41.19815717424665 33.654537200927734 saving model
4 41.26398240952265 33.64583969116211 saving model
5 41.23711159115746 33.636905670166016 saving model
6 41.21532349359421 33.627437591552734 saving model
7 41.1984483628046 33.61874771118164 saving model
8 41.2486314319429 33.60905075073242 saving model
9 41.16646385192871 33.598541259765625 saving model
10 41.18278367178781 33.58843231201172 saving model
11 41.15946515401205 33.5783576965332 saving model
12 41.1622321719215 33.56774139404297 saving model
13 41.168164116995676 33.55644989013672 saving model
14 41.14789099920364 33.54535675048828 saving model
15 41.12988553728376 33.535030364990234 saving model
16 41.14571807498024 33.52366638183594 saving model
17 41.14968681335449 33.510929107666016 saving model
18 41.16890407743908 33.499019622802734 s

In [9]:
# Experiment "met and search": train on every column except column 0
# of the last-observation arrays.
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
config = dict(h=32, lr=0.0001, num_epochs=150, batchsize=32, drop=0.5)
train_lstm(
    last_train[:, :, 1:], label_train,
    last_dev[:, :, 1:], label_dev,
    last_test[:, :, 1:], label_test,
    config,
)

epoch train_loss valid_loss
0 41.329455511910574 33.67776107788086 saving model


  "type " + obj.__name__ + ". It won't be checked "


1 41.22793061392648 33.67509841918945 saving model
2 41.21859432402111 33.67281723022461 saving model
3 41.24243645440964 33.670467376708984 saving model
4 41.19531086512974 33.668338775634766 saving model
5 41.17391386486235 33.665775299072266 saving model
6 41.20284407479422 33.66217803955078 saving model
7 41.211346399216424 33.6595573425293 saving model
8 41.2637502579462 33.657501220703125 saving model
9 41.18513225373768 33.65495300292969 saving model
10 41.19043486458914 33.65225601196289 saving model
11 41.226323354811896 33.64967346191406 saving model
12 41.2124034336635 33.64716720581055 saving model
13 41.16431862967355 33.645179748535156 saving model
14 41.215364183698384 33.64227294921875 saving model
15 41.14261954171317 33.63984298706055 saving model
16 41.07734071640741 33.63759231567383 saving model
17 41.1756964183989 33.63459777832031 saving model
18 41.16886838277181 33.631126403808594 saving model
19 41.135512851533434 33.627906799316406 saving model
20 41.13559087

In [10]:
# Experiment "pol only": train on column 0 of the last-observation arrays
# (sliced as 0:1 so the feature axis is kept).
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
config = dict(h=32, lr=0.0001, num_epochs=150, batchsize=32, drop=0.5)
train_lstm(
    last_train[:, :, 0:1], label_train,
    last_dev[:, :, 0:1], label_dev,
    last_test[:, :, 0:1], label_test,
    config,
)

epoch train_loss valid_loss
0 41.33254668826149 33.734954833984375 saving model
1 41.28713162740072 33.675148010253906 saving model
2 41.2317925407773 33.61861801147461 saving model
3 41.19173722040085 33.562835693359375 saving model
4 41.1420465196882 33.5089225769043 saving model
5 41.031910669235955 33.4533576965332 saving model
6 40.97483653113956 33.39518737792969 saving model
7 40.908630189441496 33.334381103515625 saving model
8 40.86536852518717 33.27157974243164 saving model
9 40.79624048868815 33.20006561279297 saving model
10 40.72873115539551 33.12499237060547 saving model
11 40.57214182899112 33.040409088134766 saving model
12 40.52426038469587 32.944793701171875 saving model
13 40.43034435453869 32.84569549560547 saving model
14 40.299276261102584 32.73468780517578 saving model
15 40.1606638772147 32.61459732055664 saving model
16 40.10060037885393 32.485809326171875 saving model
17 39.98446478162493 32.337337493896484 saving model
18 39.78516633169992 32.16497802734375 s

In [11]:
# Experiment "pol and met": train on columns 0..4 of the last-observation arrays,
# with a larger hidden size, learning rate and dropout than the other runs.
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
config = dict(h=128, lr=0.0005, num_epochs=150, batchsize=32, drop=0.7)
train_lstm(
    last_train[:, :, 0:5], label_train,
    last_dev[:, :, 0:5], label_dev,
    last_test[:, :, 0:5], label_test,
    config,
)

epoch train_loss valid_loss
0 40.763465518043155 32.6501350402832 saving model
1 39.394938060215544 30.905275344848633 saving model
2 36.971732911609465 27.10590362548828 saving model
3 32.51917225973947 21.174152374267578 saving model
4 27.750157537914458 19.8355655670166 saving model
5 25.691828954787482 18.996641159057617 saving model
6 25.176917030697776 19.870403289794922
7 23.888593719119118 19.426408767700195
8 24.50892816271101 19.239788055419922
9 23.789821034386044 19.07443618774414
10 23.68046696980794 19.18096160888672
11 23.88068589710054 19.172433853149414
12 22.73766276949928 18.87061882019043 saving model
13 23.794909250168573 19.032934188842773
14 23.320554369971866 18.814205169677734 saving model
15 22.309762273515975 18.818405151367188
16 23.535151118323917 18.973939895629883
17 23.155351956685383 18.98465347290039
18 22.207816169375466 18.78868293762207 saving model
19 23.20317486354283 19.060285568237305
20 21.98801853543236 19.106218338012695
21 22.787301653907413