# Prep

In [1]:
'''
CONSTANTS
'''

# --- Data source -----------------------------------------------------------
# CSV file the datasets are read from.
FILENAME = "data/QUANDL.csv"
# Positional indices of the CSV columns that are loaded (column 0 is skipped).
USECOLS = [1, 2, 3, 4, 5, 6, 7]

# --- Windowing / batching --------------------------------------------------
# Number of consecutive rows that form one input sequence.
TIMESTEP_SIZE = 30
# Number of sequences per batch (also baked into the model's input shape).
BATCH_SIZE = 10

# --- Training hyper-parameters ---------------------------------------------
EPOCHS = 10
DROPOUT_VALUE = 0.2
VALIDATION_SPLIT = 0.2

# Data Generator

In [2]:
import itertools as it
import numpy as np
import pandas as pd

from keras.utils import Sequence
# sequence_data is a class representing a dataset that implements generator as its getter
import itertools as it
import numpy as np
import pandas as pd

from keras.utils import Sequence
# sequence_data is a Sequence-based dataset class: each batch is generated on demand when the object is indexed
class sequence_data(Sequence):
    """Keras ``Sequence`` serving min-max-normalized sliding windows from a CSV file.

    The rows ``[start_idx, stop_idx)`` of the selected columns are cut into
    samples.  Sample ``k`` uses the ``timestep_size`` consecutive rows starting
    at ``start_idx + k`` as input and the row right after them as target.
    Batch ``b`` holds samples ``b * batch_size`` through
    ``(b + 1) * batch_size - 1``; the last batch may be shorter.

    The per-column minimum and maximum used for normalization are computed
    over the whole file, not only over ``[start_idx, stop_idx)``.
    """

    # Rows parsed per pandas chunk while streaming the file.  Rows are still
    # yielded one at a time; this only avoids building one DataFrame per row.
    _CHUNK_ROWS = 1024

    def __init__(self, filename, start_idx, stop_idx, usecols, timestep_size, batch_size):
        """Remember the configuration and scan the file once for min/max.

        Args:
            filename: path of the CSV file.
            start_idx: index of the first data row that belongs to this dataset.
            stop_idx: index one past the last data row that belongs to it.
            usecols: column selection forwarded to ``pandas.read_csv``.
            timestep_size: number of rows in one input sequence.
            batch_size: number of samples per batch.
        """
        # Explicit two-argument form keeps the class usable on Python 2 as well.
        super(sequence_data, self).__init__()
        self.__filename = filename
        self.__usecols = usecols
        self.__start_idx = start_idx
        self.__stop_idx = stop_idx
        self.__timestep_size = timestep_size
        self.__batch_size = batch_size
        self.__min, self.__max = self.__get_min_max()
        # A constant column has max == min; dividing by 1 instead of 0 maps it
        # to 0.0 rather than filling the batch with NaN.
        span = self.__max - self.__min
        span[span == 0] = 1
        self.__span = span

    # Generate a single datum from the file
    def read_raw_datum(self):
        """Yield every row of the selected columns, in file order, as a 1-D array."""
        reader = pd.read_csv(self.__filename,
                             chunksize=self._CHUNK_ROWS,
                             usecols=self.__usecols)
        for chunk in reader:
            for row in chunk.values:
                yield np.array(row)

    def __get_min_max(self):
        """Return ``(min, max)`` per column, computed in a single pass over the file.

        ``np.fmin`` / ``np.fmax`` build a new, type-promoted array on every
        step, so a first row parsed as integers cannot truncate later float
        values; NaN entries are ignored.  Two empty arrays are returned when
        the file has no data rows.
        """
        lowest = highest = None
        for row in self.read_raw_datum():
            if lowest is None:
                lowest, highest = row.copy(), row.copy()
            else:
                lowest = np.fmin(lowest, row)
                highest = np.fmax(highest, row)
        if lowest is None:
            return np.array([]), np.array([])
        return lowest, highest

    def get_attr_length(self):
        """Return the number of selected columns (features per row)."""
        return len(self.__usecols)

    # Get a normalized data from a csv, within a range
    def read_data(self, start, stop):
        """Yield the min-max-normalized rows ``start`` (inclusive) to ``stop`` (exclusive)."""
        for datum in it.islice(self.read_raw_datum(), start, stop):
            yield (datum - self.__min) / self.__span

    def __sample_count(self):
        """Return how many (window, target) samples fit into ``[start_idx, stop_idx)``."""
        return max(self.__stop_idx - self.__start_idx - self.__timestep_size, 0)

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(self.__sample_count() / float(self.__batch_size)))

    def __getitem__(self, batch_idx):
        """Return batch ``batch_idx`` as ``(x, y)`` arrays.

        ``x`` has shape ``(samples, timestep_size, features)`` and ``y`` has
        shape ``(samples, features)``.

        Raises:
            IndexError: if ``batch_idx`` is outside ``range(len(self))``.
            ValueError: if the file holds fewer rows than the batch needs.
        """
        batch_count = len(self)
        if not 0 <= batch_idx < batch_count:
            raise IndexError("batch index {} out of range for {} batches".format(
                batch_idx, batch_count))

        steps = self.__timestep_size
        # Batches partition the samples: batch b starts at sample b * batch_size.
        first_sample = batch_idx * self.__batch_size
        n_samples = min(self.__batch_size, self.__sample_count() - first_sample)
        first_row = self.__start_idx + first_sample

        # One read covers every window of the batch plus the final target row;
        # the upper bound never exceeds stop_idx.
        rows = list(self.read_data(first_row, first_row + n_samples + steps))
        if len(rows) < n_samples + steps:
            raise ValueError("{} ended after {} of the {} rows needed for batch {}".format(
                self.__filename, len(rows), n_samples + steps, batch_idx))

        # Each slice is a fresh list, so samples never share (and overwrite)
        # one another's window.
        x_batch = [rows[k:k + steps] for k in range(n_samples)]
        y_batch = [rows[k + steps] for k in range(n_samples)]
        return np.array(x_batch), np.array(y_batch)

    def getitem(self, idx):
        """Named alias for ``self[idx]``."""
        return self.__getitem__(idx)

Using TensorFlow backend.


# Data and Model Definition

In [3]:
# Training dataset is built over row indices 0..2600, validation over 2600..2900
# of the same file; both share column selection, window length and batch size.
train_data = sequence_data(
    filename=FILENAME,
    start_idx=0,
    stop_idx=2600,
    usecols=USECOLS,
    timestep_size=TIMESTEP_SIZE,
    batch_size=BATCH_SIZE,
)
vali_data = sequence_data(
    filename=FILENAME,
    start_idx=2600,
    stop_idx=2900,
    usecols=USECOLS,
    timestep_size=TIMESTEP_SIZE,
    batch_size=BATCH_SIZE,
)

In [5]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import LeakyReLU, Flatten

'''
Model initialization
'''

# The whole stack is declared in one list; layers are added in list order.
model = Sequential([
    # Input layer: one LSTM unit per selected column, emitting an output for
    # every timestep so the next LSTM receives a sequence.
    LSTM(len(USECOLS),
         return_sequences=True,
         batch_input_shape=(BATCH_SIZE, TIMESTEP_SIZE, len(USECOLS))),
    LeakyReLU(),
    # Hidden layer
    LSTM(32),
    LeakyReLU(),
    Dropout(DROPOUT_VALUE),
    # Output layer: one linear value per selected column.
    Dense(len(USECOLS), activation='linear'),
])

model.compile(optimizer='adam', loss='mse')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (10, 30, 7)               420       
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (10, 30, 7)               0         
_________________________________________________________________
lstm_2 (LSTM)                (10, 32)                  5120      
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (10, 32)                  0         
_________________________________________________________________
dropout_1 (Dropout)          (10, 32)                  0         
_________________________________________________________________
dense_1 (Dense)              (10, 7)                   231       
Total params: 5,771
Trainable params: 5,771
Non-trainable params: 0
_________________________________________________________________


In [None]:
'''
Training
'''
import os

# Create the output directory before training starts, in case the saver does
# not create missing parent directories: failing here is far cheaper than
# failing on save after every epoch has already run.
if not os.path.isdir("saved_model"):
    os.makedirs("saved_model")

# shuffle=False keeps the batches in index order.
model.fit_generator(train_data,
    validation_data=vali_data,
    shuffle=False,
    epochs=EPOCHS,
    workers=4)

model.save("saved_model/test.h5")

Epoch 1/10

def read_raw_datum(filename): # generator
    """Stream the CSV at *filename* one data row at a time.

    Only the columns at positions 1 through 7 are loaded.  The file is read in
    one-row chunks and the single row of each chunk is yielded as a 1-D array.
    """
    single_row_chunks = pd.read_csv(
        filename,
        usecols=[1, 2, 3, 4, 5, 6, 7],
        chunksize=1,
    )
    for chunk in single_row_chunks:
        yield chunk.values[0]

def get_max(filename): # returns a numpy array
    """Return the per-column maximum over all rows of the CSV at *filename*.

    The running maximum is rebuilt with ``np.fmax`` on every row rather than
    written element-by-element into the first row's array.  That way the
    result is promoted to a common dtype, so a first row parsed as integers
    can no longer truncate a later float maximum.  NaN entries are ignored.

    Returns an empty array when the file has no data rows.
    """
    result = None
    for row in read_raw_datum(filename):
        result = np.array(row) if result is None else np.fmax(result, row)
    if result is None:
        return np.array([])
    return result

def get_min(filename):
    """Return the per-column minimum over all rows of the CSV at *filename*.

    The running minimum is rebuilt with ``np.fmin`` on every row rather than
    written element-by-element into the first row's array.  That way the
    result is promoted to a common dtype, so a first row parsed as integers
    can no longer truncate a later float minimum.  NaN entries are ignored.

    Returns an empty array when the file has no data rows.
    """
    result = None
    for row in read_raw_datum(filename):
        result = np.array(row) if result is None else np.fmin(result, row)
    if result is None:
        return np.array([])
    return result

def read_datum(filename, start, stop):
    """Endlessly yield the normalized rows ``start`` (inclusive) to ``stop`` (exclusive).

    Each row is min-max normalized with the per-column extremes of the whole
    file and reshaped to ``(1, 1, n_columns)``.  After the last row of the
    slice the file is re-read and the slice starts over.  If the slice is
    empty the generator finishes instead of looping forever without yielding.
    """
    ma = get_max(filename)
    mi = get_min(filename)
    # A constant column has max == min; dividing by 1 instead of 0 maps it to
    # 0.0 rather than producing NaN.
    span = ma - mi
    span[span == 0] = 1
    while True:
        produced = False
        for datum in it.islice(read_raw_datum(filename), start, stop):
            produced = True
            yield np.reshape(np.array([(datum - mi) / span]), (1, 1, datum.shape[0]))
        if not produced:
            # Nothing in [start, stop): another pass would not yield either.
            return

def read_data2(filename, start, stop):
    """Yield the normalized rows ``start`` (inclusive) to ``stop`` (exclusive) once.

    Each row is min-max normalized with the per-column extremes of the whole
    file and reshaped to ``(1, 1, n_columns)``.
    """
    ma = get_max(filename)
    mi = get_min(filename)
    # A constant column has max == min; dividing by 1 instead of 0 maps it to
    # 0.0 rather than producing NaN.
    span = ma - mi
    span[span == 0] = 1
    # The for loop already ends when the slice is exhausted, so no
    # StopIteration handling is needed around the yield.
    for datum in it.islice(read_raw_datum(filename), start, stop):
        yield np.reshape(np.array([(datum - mi) / span]), (1, 1, datum.shape[0]))