In [1]:
import os
import numpy as np

# Work from the parent directory so the relative data folder below resolves.
os.chdir(os.path.pardir)

# load data from file
save_folder = 'data/raw/predict-one-day-diff/pol-met-search'
save_file_name = [
    'fea_seq.npy',
    'last_observation_seq.npy',
    'label_seq.npy',
    'masking_seq.npy',
    'delta_seq.npy',
    'train_valid_test_split.npy',
]
# One array per file, in the same order as save_file_name.
saved_arrays = [np.load(os.path.join(save_folder, file_name)) for file_name in save_file_name]
(fea_seq, last_observation_seq, label_seq,
 masking_seq, delta_seq, train_valid_test_split) = saved_arrays

In [2]:
# train-test-split
# train_valid_test_split holds the sizes of the three consecutive segments:
# train first, then dev, then test.
train_index = list(range(train_valid_test_split[0]))
dev_index = list(range(
    train_valid_test_split[0],
    train_valid_test_split[0] + train_valid_test_split[1],
))
test_index = list(range(
    train_valid_test_split[0] + train_valid_test_split[1],
    train_valid_test_split[0] + train_valid_test_split[1] + train_valid_test_split[2],
))

In [3]:
def get_array_by_index_range(nparray_list, label_array, index_range):
    '''
    Select the entries of index_range whose label is not NaN.

    nparray_list: list of nparrays, each indexed along its first axis
    label_array: labels; used for the NaN check and returned flattened to 1-D
    index_range: iterable of candidate indices
    returns: (list with the selected rows of every array in nparray_list,
              selected labels reshaped to 1-D)
    '''
    # keep only the positions that actually have a label
    kept = [pos for pos in index_range if not np.isnan(label_array[pos])]

    selected_arrays = [arr[kept] for arr in nparray_list]
    selected_labels = label_array[kept].reshape(-1)
    return selected_arrays, selected_labels

In [4]:
# split set to train, test and dev sets
# Each call keeps only the samples of that split whose label is not NaN.
# train set
(fea_train, last_train), label_train = get_array_by_index_range(
    [fea_seq, last_observation_seq], label_seq, train_index)
# dev set
(fea_dev, last_dev), label_dev = get_array_by_index_range(
    [fea_seq, last_observation_seq], label_seq, dev_index)
# test set
(fea_test, last_test), label_test = get_array_by_index_range(
    [fea_seq, last_observation_seq], label_seq, test_index)

In [5]:
def normalize_feature(fea_train, array_list):
    """
    Standardize arrays with the NaN-aware mean / std of the training features.

    The statistics are computed over axis 0 of fea_train only, and the same
    statistics are applied to every array in array_list.

    fea_train: training feature array; NaNs are ignored when computing statistics
    array_list: [fea_dev, fea_test, last_train, last_dev, last_test] to normalize
    returns: (normalized fea_train, [normalized arrays, same order as array_list])
    """
    train_mean = np.nanmean(fea_train, axis=0)
    train_std = np.nanstd(fea_train, axis=0)
    # A feature that is constant over the training set has std == 0; dividing by
    # it would produce NaN/inf. Use a unit scale there so the feature maps to 0.
    safe_std = np.where(train_std == 0, 1.0, train_std)

    def norm_arr(nparr):
        return (nparr - train_mean) / safe_std

    return (norm_arr(fea_train), [norm_arr(k) for k in array_list])

In [6]:
# Normalize every split with the statistics of the training features.
fea_train, (fea_dev, fea_test, last_train, last_dev, last_test) = normalize_feature(
    fea_train,
    [fea_dev, fea_test, last_train, last_dev, last_test],
)
# record mean after normalization
x_mean_aft_nor = np.nanmean(fea_train, axis=0)

In [7]:
# control experiment using last observed value for missing data imputation 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from torch.autograd import Variable, grad
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math

In [8]:
class LSTM(nn.Module):
    """
    Recurrent regressor: a recurrent cell unrolled over time, followed by a
    two-layer fully connected head applied to the last hidden state.

    NOTE: despite the class / attribute name, the recurrent unit is an
    nn.GRUCell. The names are kept so existing callers and saved models
    keep working.
    """
    def __init__(self, input_size, hidden_size, output_dim, dropout):
        """
        input_size - the number of expected features in the input x
        hidden_size - the number of hidden units in state h
        output_dim - size of the final output per sample
        dropout - dropout probability applied between the two linear layers
        """
        super(LSTM, self).__init__()
        self.h = hidden_size
        self.lstm = nn.GRUCell(input_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        x: shape (time_step, batch, n_features)
        returns: shape (batch, output_dim), computed from the last hidden state
        raises: ValueError if x has no time steps
        """
        t = x.shape[0]
        n = x.shape[1]
        if t == 0:
            raise ValueError("LSTM.forward needs at least one time step, got x with shape %s"
                             % (tuple(x.shape),))
        # Start from a zero state that lives on the same device and has the same
        # dtype as the input, so the cell never sees mismatched tensors.
        self.hx = x.new_zeros((n, self.h))
        # iterate through cells; only the most recent hidden state is needed
        for i in range(t):
            self.hx = self.lstm(x[i], self.hx)
        # last hidden state is n * h
        output = F.relu(self.fc1(self.hx))
        output = self.dropout(output)
        output = self.fc2(output)
        return output
    
def train_lstm(X_train, y_train, X_valid, y_valid, X_test, y_test, config):
    """
    Train the LSTM model with Adam and MSE loss, checkpoint whenever the
    validation loss does not get worse, then print test MAE / MSE using the
    best checkpointed weights.

    X_train, X_valid, X_test: arrays indexed (sample, time_step, feature);
        axes 0 and 1 are swapped before they are fed to the model
    y_train, y_valid, y_test: targets, one value per sample (flattened to 1-D here)
    config: dict with keys "h", "drop", "lr", "batchsize", "num_epochs"
    returns: None (progress and metrics are printed)

    Side effect: writes the best model to 'models/lstm_<random int>.pt'.
    """
    import copy
    import os

    # no shuffle, keep original order
    # the model iterates over the leading axis, so put time first
    X_train = X_train.swapaxes(0, 1)
    X_valid = X_valid.swapaxes(0, 1)
    X_test = X_test.swapaxes(0, 1)

    # Predictions are squeezed to shape (n,). Flatten the targets as well so an
    # (n, 1) label array cannot silently broadcast to (n, n) in the loss/metrics.
    y_train = np.asarray(y_train).reshape(-1)
    y_valid = np.asarray(y_valid).reshape(-1)
    y_test = np.asarray(y_test).reshape(-1)

    # model parameters
    input_size = X_train.shape[2]
    h = config["h"]
    output_dim = 1
    dropout = config["drop"]

    model = LSTM(input_size, h, output_dim, dropout)

    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    criterion = nn.MSELoss()

    device = torch.device('cpu')
    model = model.to(device)
    criterion = criterion.to(device)
    # NOTE(review): `verbose` is reported as deprecated in recent PyTorch
    # releases - confirm against the installed version before upgrading.
    scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=10, factor=0.5, verbose=True)

    def train(model, batchsize, X_train, y_train, optimizer, criterion):
        """Run one epoch of mini-batch updates; return the mean batch loss."""
        epoch_loss = 0
        model.train()
        total_n = X_train.shape[1]
        num_batches = math.ceil(total_n / batchsize)
        for batch in range(num_batches):
            start = batch*batchsize
            end = (batch+1)*batchsize
            optimizer.zero_grad()
            batch_X = torch.Tensor(X_train[:, start:end])
            batch_y = torch.Tensor(y_train[start:end])
            predictions = model(batch_X).squeeze(1)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / num_batches

    def evaluate(model, X_valid, y_valid, criterion):
        """Return the loss on the whole validation set (single batch, no gradients)."""
        model.eval()
        with torch.no_grad():
            batch_X = torch.Tensor(X_valid)
            batch_y = torch.Tensor(y_valid)
            predictions = model(batch_X).squeeze(1)
            epoch_loss = criterion(predictions, batch_y).item()
        return epoch_loss

    def predict(model, X_test):
        """Return the model predictions for X_test as a 1-D numpy array."""
        model.eval()
        with torch.no_grad():
            batch_X = torch.Tensor(X_test)
            predictions = model(batch_X).squeeze(1)
            predictions = predictions.cpu().numpy()
        return predictions

    best_valid = 999999.0
    # In-memory copy of the best weights. Restoring from this instead of
    # torch.load() avoids unpickling a whole module and still works when no
    # epoch improved (or num_epochs is 0) and no checkpoint file was written.
    best_state = None
    rand = random.randint(0,100000)
    checkpoint_path = 'models/lstm_%d.pt' %rand
    # torch.save does not create missing directories
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    print('epoch train_loss valid_loss')
    for epoch in range(config["num_epochs"]):
        train_loss = train(model, config["batchsize"], X_train, y_train, optimizer, criterion)
        valid_loss = evaluate(model, X_valid, y_valid, criterion)
        scheduler.step(valid_loss)
        if valid_loss <= best_valid:
            # save model
            best_valid = valid_loss
            print(epoch, train_loss, valid_loss, 'saving model')
            torch.save(model, checkpoint_path)
            best_state = copy.deepcopy(model.state_dict())
        else:
            print(epoch, train_loss, valid_loss)

    # restore the best weights; fall back to the final weights if none were recorded
    if best_state is not None:
        model.load_state_dict(best_state)

    predictions = predict(model, X_test)
    mae = np.mean(np.absolute(predictions-y_test))
    print("mae: ", mae)
    mse = np.mean((predictions - y_test)**2)
    print("mse: ", mse)

In [9]:
# sanity check: shape of the feature-column slice 1:5 used for the first run below
np.shape(last_train[:, :, 1:5])

(664, 7, 4)

In [11]:
# met: feature columns 1:5 only
seed = 123
config = dict(h=128, lr=0.001, num_epochs=150, batchsize=32, drop=0.7)
# seed numpy and torch before each run
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_lstm(
    last_train[:, :, 1:5], label_train,
    last_dev[:, :, 1:5], label_dev,
    last_test[:, :, 1:5], label_test,
    config,
)

epoch train_loss valid_loss
0 41.09159106299991 33.07618713378906 saving model
1 40.80134546189081 32.35593032836914 saving model
2 40.069341750372026 31.087417602539062 saving model
3 38.569817860921226 29.09696388244629 saving model
4 36.835134097508025 27.24900245666504 saving model
5 35.7380188533238 26.383819580078125 saving model
6 35.17970557439895 25.530658721923828 saving model
7 34.20080352964855 25.587596893310547
8 33.50944137573242 25.346126556396484 saving model
9 33.144762311662944 24.983318328857422 saving model
10 32.8745824268886 24.91716766357422 saving model
11 32.41738478342692 24.789276123046875 saving model
12 32.537544159662154 24.948949813842773
13 31.979722204662504 24.50566864013672 saving model
14 32.244703928629555 24.574337005615234
15 31.264505522591726 24.66516876220703
16 31.065198580423992 24.321176528930664 saving model
17 30.451274099804106 24.574617385864258
18 30.8044954481579 24.771865844726562
19 30.84473428272066 24.65963363647461
20 30.86480281

In [12]:
# met and search: every feature column from 1 onward
seed = 123
config = dict(h=128, lr=0.001, num_epochs=150, batchsize=32, drop=0.7)
# seed numpy and torch before each run
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_lstm(
    last_train[:, :, 1:], label_train,
    last_dev[:, :, 1:], label_dev,
    last_test[:, :, 1:], label_test,
    config,
)

epoch train_loss valid_loss
0 41.226154781523206 33.508792877197266 saving model
1 40.854309263683504 33.313995361328125 saving model
2 40.62112553914388 32.999267578125 saving model
3 39.925202551342196 32.49979019165039 saving model
4 39.056315013340544 32.13646697998047 saving model
5 35.947140103294736 31.79549217224121 saving model
6 31.169902710687545 32.542999267578125
7 26.50968869527181 34.175052642822266
8 23.220348176502046 34.171390533447266
9 18.813694045657204 33.30949783325195
10 15.745402699425107 35.11029815673828
11 13.23677971249535 33.449249267578125
12 11.91861867904663 33.66128921508789
13 9.694603216080438 34.45226287841797
14 10.093519142695836 34.142948150634766
15 8.599857103256952 33.89802551269531
Epoch    17: reducing learning rate of group 0 to 5.0000e-04.
16 7.089729729152861 34.558223724365234
17 6.2807410671597435 35.77287292480469
18 4.65983399890718 35.13926696777344
19 4.835804786000933 34.67116165161133
20 4.223460180418832 34.79456329345703
21 4.62

In [13]:
# pol only: feature column 0
seed = 123
config = dict(h=128, lr=0.001, num_epochs=150, batchsize=32, drop=0.7)
# seed numpy and torch before each run
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_lstm(
    last_train[:, :, 0:1], label_train,
    last_dev[:, :, 0:1], label_dev,
    last_test[:, :, 0:1], label_test,
    config,
)

epoch train_loss valid_loss
0 39.545395397004626 29.60646629333496 saving model
1 32.19648465656099 20.465513229370117 saving model
2 27.154795374189103 20.411623001098633 saving model
3 26.70655595688593 19.806772232055664 saving model
4 25.66730526515416 19.702316284179688 saving model
5 25.613938513256254 19.571622848510742 saving model
6 25.043526467822847 19.318038940429688 saving model
7 25.299606414068315 19.432289123535156
8 25.253958066304524 19.438507080078125
9 25.96671099889846 19.20871353149414 saving model
10 24.311976841517858 19.317289352416992
11 24.9241973786127 19.489521026611328
12 25.539459773472377 19.364763259887695
13 25.92112681979225 19.33141326904297
14 24.435290291195823 19.275493621826172
15 24.8737089520409 19.43120765686035
16 25.087226913088845 19.16067886352539 saving model
17 24.985576629638672 19.35442352294922
18 24.06506429399763 19.205854415893555
19 24.048133623032342 19.488351821899414
20 24.351695651099796 19.322265625
21 25.19601276942662 19.18

In [14]:
# pol and met: feature columns 0:5
seed = 123
config = dict(h=128, lr=0.001, num_epochs=150, batchsize=32, drop=0.7)
# seed numpy and torch before each run
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_lstm(
    last_train[:, :, 0:5], label_train,
    last_dev[:, :, 0:5], label_dev,
    last_test[:, :, 0:5], label_test,
    config,
)

epoch train_loss valid_loss
0 39.67372976030622 29.619243621826172 saving model
1 34.4580473672776 21.728351593017578 saving model
2 27.089752469744003 21.53615379333496 saving model
3 24.696594283694314 19.295978546142578 saving model
4 23.30074846176874 19.840543746948242
5 23.87181722550165 18.851125717163086 saving model
6 23.003882453555153 18.976682662963867
7 22.80754225594657 19.034927368164062
8 22.72718702043806 19.237028121948242
9 22.94171991802397 18.40932273864746 saving model
10 22.388197263081867 18.758708953857422
11 22.492624964032853 18.938552856445312
12 22.287170909699938 18.31076431274414 saving model
13 20.707391057695663 18.631088256835938
14 21.157987594604492 18.076000213623047 saving model
15 21.196017174493697 18.58933448791504
16 20.174860522860573 19.09343910217285
17 20.67708728426979 17.642457962036133 saving model
18 20.06414558773949 18.677982330322266
19 20.194218998863583 18.120304107666016
20 20.13174243200393 17.89269256591797
21 20.09222039722261 