In [1]:
import numpy as np
import pandas as pd
import csv

from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
train_len = 25253
validation_len = 9471
test_len = 13794

def load_features(filename,
                  skip_header=True,
                  skip_instname=True,
                  delim=' ',
                  num_lines=0):
    if num_lines == 0:
        num_lines = get_num_lines(filename, skip_header)

    data = np.empty(
        (num_lines, 25), float)

    with open(filename, 'r') as csv_file:
        if skip_header:
            next(csv_file)
        c = 0
        for line in tqdm(csv_file):
            offset = 0
            if skip_instname:
                offset = line.find(delim) + 1
            data[c, :] = np.fromstring(line[offset:], dtype=float, sep=delim)
            c += 1

    return data

def load_batch_features(filename, start_index=0, amount=0):
    delim = ' '
    
    data = np.empty((amount, 25), float)
    
    with open(filename, 'r') as csv_file:
        for i, line in tqdm(enumerate(csv_file)):
            if i < start_index:
                continue
            if i >= start_index + amount:
                break
            index = i - start_index
            data[index, :] = np.fromstring(line, dtype=float, sep=delim)
    return data
    
def load_batch_labels(filename, start_index=1, amount=0):
    labels = np.empty((amount, 3), float)
    delim = ','
    
    with open(filename, 'r') as csv_file:
        for i, line in tqdm(enumerate(csv_file)):
            if i < start_index:
                continue
            if i >= start_index + amount:
                break
            cols = np.fromstring(line, dtype=float, sep=delim)
            index = i - start_index
            labels[index, :] = cols[1:]
    return labels
    
def get_num_lines(filename, skip_header):
    with open(filename, 'r') as csv_file:
        if skip_header:
            next(csv_file)
        c = 0
        for line in csv_file:
            c += 1
    return c

def load_labels(filename, col_labels=1, gen_headers=True, delim=','):
    headers = []
    labels = []
    
    with open(filename, 'r') as csv_file:
        for i, line in tqdm(enumerate(csv_file)):            
            cols = np.fromstring(line, dtype=float, sep=delim)
            if i == 0:
                headers = line.rstrip().split(delim)[col_labels:]                
                continue
            
            labels.append(cols[col_labels:])
    if gen_headers:
        return np.array(labels), headers
    else:
        return np.array(labels)
    
    
def get_scaler(x, y):
    x_scaler = StandardScaler()
    x_scaler.fit(x)
    y_scaler = StandardScaler()
    y_scaler.fit(y)
  
    return x_scaler, y_scaler

def scale_data(scaler, data):
    if data.ndim > 2:
        data = data.reshape(-1, data.shape[2])
    scaled = scaler.transform(data)
    
    return scaled

# def get_scaler(x, y):
#     scaler = StandardScaler()
#     y_flatten = np.empty((len(x) * 399, 3))
#     index = 0
#     for i in range(0, len(x)):
#         while index < 399 * (i + 1):
#           y_flatten[index, 0] = y[i, 0]
#           y_flatten[index, 1] = y[i, 1]
#           y_flatten[index, 2] = y[i, 2]
#           index += 1

#     concatenated = np.concatenate((x.reshape(-1, x.shape[2]), y_flatten), axis=1)
#     scaler.fit(concatenated)
  
#     return scaler

# def scale_data(scaler, x, y):

#     x_flatten = x.reshape(-1, x.shape[2])
#     y_flatten = np.empty((x_flatten.shape[0], 3))
#     index = 0
#     for i in range(0, len(x)):
#         while index < 399 * (i + 1):
#             y_flatten[index, 0] = y[i, 0]
#             y_flatten[index, 1] = y[i, 1]
#             y_flatten[index, 2] = y[i, 2]
#             index += 1

#     concatenated = np.concatenate((x_flatten, y_flatten), axis=1)
#     scaled = scaler.fit_transform(concatenated)
#     x_rev = scaled[:, :25].reshape(x.shape)

#     index = 0  
#     while index < len(y_flatten):
#         y[index, 0] = scaled[index * 399, 25]
#         y[index, 1] = scaled[index * 399, 26]
#         y[index, 2] = scaled[index * 399, 27]
#         index += 1


#     return x_rev, y


### Batch loading to train LSTM-RNN

- First, load all data to get scalers that covers for each partition data
- Batching the data to train
- Batching the data to validation

In [3]:
data_path = './Functional_features/'

# load all data to get a scaler that covers all data
print("Loading training samples...")
x_train = load_features(data_path+'train.txt', skip_header=False, skip_instname=False)
y_train, headers = load_labels(data_path+'train_labels.txt', gen_headers=True)

x_train_scaler, y_train_scaler = get_scaler(x_train, y_train)
# x_train = x_train.reshape((25253, 399, 25))
x_train = 0
y_train = 0

print("Loading validation samples...")
x_validation = load_features(data_path+'validation.txt', skip_header = False, skip_instname=False)
# x_validation = x_validation.reshape((9471, 399, 25))
y_validation = load_labels(data_path+'validation_labels.txt', gen_headers=False)

x_validation_scaler, y_validation_scaler = get_scaler(x_validation, y_validation)
x_validation = 0
y_validation = 0

# # print("Loading testing samples...")
# # x_test = load_features(data_path+'test.txt', skip_header = False, skip_instname=False)
# # x_test = x_test.reshape((13794, 399, 25))
# # y_test = load_labels(data_path+'test_labels.txt', gen_headers=False)

# print('x_train shape:', x_train.shape)
# print('y_train shape:', y_train.shape)
# print('x_validation shape:', x_validation.shape)
# print('y_validation shape:', y_validation.shape)
# # print('x_test shape:', x_test.shape)
# # print('y_test shape:', y_test.shape)

Loading training samples...


10075947it [01:59, 84549.79it/s]
  cols = np.fromstring(line, dtype=float, sep=delim)
25254it [00:00, 173985.31it/s]


Loading validation samples...


3778929it [00:45, 83388.69it/s]
  cols = np.fromstring(line, dtype=float, sep=delim)
9472it [00:00, 146780.88it/s]


### Building RNN-LSTM model


In [4]:
import keras.backend as K
from keras.models import Model, save_model, load_model, Sequential
from keras.layers import Input, Dense, Masking, LSTM, Dropout, TimeDistributed, Bidirectional
from tensorflow.keras.optimizers import RMSprop, Adam

from numpy.random import seed
from tensorflow.keras.utils import set_random_seed


2022-05-11 01:34:40.235262: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-11 01:34:40.235298: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [17]:
def create_model(num_units_1=64, num_units_2=32):
    model = Sequential()
    model.add(LSTM(units=num_units_1, input_dim=25, activation='tanh', return_sequences=True))
    model.add(Dropout(0.1))
    model.add(LSTM(units=num_units_2, return_sequences=False))
    model.add(Dropout(0.1))
    model.add(Dense(3))
    rms = RMSprop(learning_rate=0.001)
    model.compile(loss='mse', optimizer=rms)
    return model

In [18]:
model = create_model()

In [13]:
batch = 5
start_index = 0
label_start = 1
time_step = 399
n_features = 25
epochs = 50

chunk = round(train_len / batch)
chunks = []
copied_len = train_len
for i in range(batch):
    if i == batch - 1:
        chunks.append(copied_len)
        break
    chunks.append(chunk)
    copied_len -= chunk
chunks

[5051, 5051, 5051, 5051, 5049]

In [19]:
for i in range(batch):
    x_train = load_batch_features(data_path+'train.txt', start_index, chunks[i] * time_step)
    x_train = x_train.reshape(chunks[i], time_step, 25)
    x_scaled = scale_data(x_train_scaler, x_train)   
    x_scaled = x_scaled.reshape(chunks[i], time_step, n_features)
    y_train = load_batch_labels(data_path+'train_labels.txt', label_start, chunks[i])
    y_scaled = scale_data(y_train_scaler, y_train)
    
    start_index += chunks[i]
    label_start += chunks[i]
    epoch = 1
    
    model.fit(x_scaled, y_scaled, epochs=epochs)
        


2025451it [00:24, 82316.79it/s]
15154it [00:00, 205616.84it/s]


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

KeyboardInterrupt: 