In [47]:
import os
import numpy as np
import pandas as pd
import pickle
from multiprocessing import Pool, cpu_count # Multiprocessing package, speed up the process to get samples
import numexpr as ne # Fast way to navigate and search in pd.DataFrame objects
import time # Time the program
from sklearn.model_selection import train_test_split

from keras import Sequential
from keras.layers import Conv1D, Conv2D, SeparableConv2D, SeparableConv1D, MaxPooling1D, MaxPooling2D, Flatten,\
                         Dense, Activation, Dropout
from keras.optimizers import RMSprop, Adam, Adagrad, SGD
from IPython.display import clear_output

import tensorflow as tf
from keras.callbacks import Callback
from keras import backend as K

from keras.engine.topology import Layer, InputSpec

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from keras import backend as K
from keras.utils.conv_utils import normalize_data_format

from keras.utils import conv_utils
from keras.legacy import interfaces


from keras import backend as K
from keras.engine.topology import Layer


In [14]:
# Side length (in pixels) of each square TS-Image sample.
# This module-level value is also read once, at function-definition time, to build
# the default `feature_path` argument of getTrainingDataFromPath ('<...>/16-pixel/').
image_size = 16

In [5]:
# Add this class from 'get training and testing samples.ipynb'
# Otherwise, will not be able to load in data (pickles of lists of instances of CNNSamples) previously saved

class CNNSamples:
    """Per-stock container of raw data and the TS-Image samples built from it.

    The class is re-declared in this notebook so that previously pickled lists of
    CNNSamples instances can be un-pickled; the attribute names must therefore
    stay exactly as they were when the pickles were written.

    Parameters
    ----------
    stockData : pandas.DataFrame
        Data of a single stock. Must contain a 'PERMNO' column; the first value of
        that column is stored as the identifier and the column is then dropped.
    data_len : int
        Number of consecutive columns (days) of the transposed data per sample window.
    image_size : int
        Value handed to the GADF / GASF constructors.
    retrain_freq : int
        Step, in days, between the end points of consecutive sample windows.
    """

    def __init__(self, stockData, data_len=64, image_size=16, retrain_freq=5):
        # Stock identifier: first entry of the PERMNO column.
        self.permno = stockData.PERMNO.iloc[0]

        # Keep everything except PERMNO, transposed so that days run along the columns.
        features_by_day = stockData.drop('PERMNO', axis=1)
        self.data = features_by_day.T

        # Sampling configuration.
        self.data_len = data_len
        self.image_size = image_size
        self.retrain_freq = retrain_freq

        # Sample stores, filled by getTimeSeriesCNNSample().
        self.GADFSample = []
        self.GASFSample = []

        self.nDays = self.data.shape[1]
        self.nSamples = 0

    def getTimeSeriesCNNSample(self):
        """Slide a window over the days and append one GADF and one GASF sample per window.

        Windows end at data_len, data_len + retrain_freq, ... (strictly below nDays)
        and each covers the `data_len` columns before its end point.

        NOTE(review): GADF and GASF are neither defined nor imported in the visible
        part of this notebook, so calling this method here would raise NameError
        unless they are provided elsewhere.

        Returns
        -------
        CNNSamples
            `self`, with GADFSample / GASFSample extended and nSamples updated.
        """
        gadf_transformer, gasf_transformer = GADF(self.image_size), GASF(self.image_size)

        for window_end in range(self.data_len, self.nDays, self.retrain_freq):
            window_start = window_end - self.data_len
            window = self.data.iloc[:, window_start:window_end]
            self.GADFSample.append(gadf_transformer.fit_transform(window).T)
            self.GASFSample.append(gasf_transformer.fit_transform(window).T)

        self.nSamples = len(self.GADFSample)
        return self

In [18]:
def getTrainingDataFromPath(feature_path = '/Volumes/Seagate Backup Plus Drive/deep learning data/'+ \
                                            str(image_size)+'-pixel/',
                            target_path = '/Volumes/Seagate Backup Plus Drive/deep learning data/target/',
                            data_type = 'GADF',
                            image_size = 16,
                            train_val_size = 2/3,
                            train_size = 0.75):
    
    """ Load all saved TS-Image samples and organize them into train / validation arrays.

        Parameters explained:
        
        1) feature_path, target_path:
            Directories of the saved data. 'feature_path': X data; 'target_path': Y data.
            Notice: X data has to be named as 'CNNSamples_1', 'CNNSamples_2', ... etc.
                    Y data has to be named as 'CNNSamples_target'
            The default 'feature_path' is built once, when this function is defined,
            from the module-level `image_size`; it does NOT follow the `image_size`
            argument, so pass 'feature_path' explicitly for another image size.
                    
        2) data_type:
            'GADF' selects the GADF samples; any other value selects the GASF samples.
        
        3) image_size:
            Side length of each (square) sample, used to reshape X.
            
        4) train_val_size:
            Portion of each stock's samples (taken from the start) used for
            training+validation.
        
        5) train_size:
            Portion of the training+validation data used for training; the split is
            not shuffled.

        Returns:
            (X_train, X_val, Y_train, Y_val) with X of shape
            (n, image_size, image_size, 4) and Y of shape (n, 3).

        Raises:
            ValueError: if no 'CNNSamples_<number>' file is found, or if the target
            file does not hold one entry per loaded sample object.
            
    """
    
    # Identify the sample files by name rather than assuming the directory holds
    # exactly one unrelated entry, and load them in numeric order.
    prefix = 'CNNSamples_'
    sample_files = sorted((name for name in os.listdir(feature_path)
                           if name.startswith(prefix) and name[len(prefix):].isdigit()),
                          key=lambda name: int(name[len(prefix):]))
    if not sample_files:
        raise ValueError("No '" + prefix + "<number>' files found in " + repr(feature_path))

    # SECURITY NOTE: pickle.load can execute arbitrary code; only load files you created.
    all_data = []
    for name in sample_files:
        with open(os.path.join(feature_path, name), 'rb') as pick:
            all_data += pickle.load(pick)

    # Number of leading samples kept for every object (its own train+validation share).
    keep_counts = [int(obj.nSamples*train_val_size)+1 for obj in all_data]

    sample_attr = 'GADFSample' if data_type == 'GADF' else 'GASFSample'
    X = []
    for obj, n_keep in zip(all_data, keep_counts):
        X += getattr(obj, sample_attr)[:n_keep]

    # Let numpy infer the sample count: it is the sum of the kept slices, not
    # len(all_data) * nSamples. The trailing 4 is the channel count of one sample.
    X = np.array(X).reshape((-1, image_size, image_size, 4))
    
    with open(os.path.join(target_path, 'CNNSamples_target'), 'rb') as pick2:
        Ytmp = pickle.load(pick2)

    # Targets are assumed to be stored in the same order as the sample objects
    # (Ytmp[i] belongs to all_data[i]) -- TODO confirm against the saving notebook.
    if len(Ytmp) != len(all_data):
        raise ValueError('Target file holds %d entries but %d sample objects were loaded'
                         % (len(Ytmp), len(all_data)))

    # Slice each target series with the count of its own object so X and Y rows
    # stay aligned even when objects have different numbers of samples.
    Y = []
    for y, n_keep in zip(Ytmp, keep_counts):
        Y.extend(y[1][:n_keep])
    Y = np.array(Y).reshape((len(X), 3))
    
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=train_size, shuffle=False)
    
    return (X_train, X_val, Y_train, Y_val)

In [9]:
# Load features and targets with every default argument of getTrainingDataFromPath
# (default paths, data_type='GADF', image_size=16, train_val_size=2/3, train_size=0.75).
X_train, X_val, Y_train, Y_val = getTrainingDataFromPath()



In [10]:
# Sanity check of the split: number of rows per set and width of the target array.
print('training set size:', len(X_train))
print('validation set size:', len(X_val))
print('number of classes:', Y_train.shape[1])

training set size: 740137
validation set size: 246713
number of classes: 3


In [11]:
# Column sums divided by the number of rows, for each set.
# Presumably Y is one-hot encoded, in which case these are the class frequencies -- TODO confirm.
print('historical data distribution:', Y_train.sum(axis=0)/len(Y_train))
print('validation data distribution:', Y_val.sum(axis=0)/len(Y_val))

historical data distribution: [0.37615063 0.27406142 0.34978794]
validation data distribution: [0.39027939 0.24492832 0.36479229]


## Condensed AlexNet
A condensed variant of AlexNet is defined below.<br>
The original AlexNet (Krizhevsky et al., 2012) has the following structure:<br>
Conv2D$\rightarrow$MaxPooling2D$\rightarrow$Conv2D$\rightarrow$MaxPooling2D$\rightarrow$Conv2D$\rightarrow$Conv2D$\rightarrow$Conv2D$\rightarrow$MaxPooling2D$\rightarrow$Dropout$\rightarrow$FullyConnected$\rightarrow$Dropout$\rightarrow$FullyConnected$\rightarrow$Dropout$\rightarrow$Output<br><br>
The condensed architecture used here has both fewer parameters and fewer layers:<br>
Conv2D$\rightarrow$MaxPooling2D$\rightarrow$Conv2D$\rightarrow$MaxPooling2D$\rightarrow$Conv2D$\rightarrow$MaxPooling2D$\rightarrow$Dropout$\rightarrow$FullyConnected$\rightarrow$Dropout$\rightarrow$FullyConnected$\rightarrow$Dropout$\rightarrow$Output<br>
In other words, two convolutional layers are dropped from the original AlexNet to account for the fact that the images are much smaller in my case. The pool_size, strides and filters are also reduced to make the model more parametrically parsimonious. The 'tanh' activation is used to map data to [-1,1], which makes more sense for time series. Finally, zero padding is used to preserve more information across layers.<br><br>
Apart from the architecture changes, the max-pooling method is modified to account for the difference between an ordinary image and a TS-Image. Ordinarily, MaxPooling2D outputs the largest element of the block it filters. Here I define a new kind of pooling layer, MaxAbsPooling2D, which outputs the element with the largest absolute value while preserving its sign. This accounts for the fact that strong negative correlation in a time series also plays an important role in forecasting.

In [101]:
class MaxAbsPooling2D(Layer):
    
    """Max absolute-value pooling for 4D inputs.
    
    Derived Layer class. For every pooling window the layer outputs the element
    with the largest absolute value, keeping that element's sign. The pooling
    operation itself is defined in member function '_pooling_function'.

    Arguments:
        pool_size: int or tuple of 2 ints, size of the pooling window.
        strides: int, tuple of 2 ints, or None; None means "same as pool_size".
        padding: padding mode, normalized by conv_utils.normalize_padding.
        data_format: 'channels_last' or 'channels_first'.
    
    """
    
    def __init__(self, pool_size=(2, 2), strides=None, padding='same',
                 data_format='channels_last', **kwargs):
        super(MaxAbsPooling2D, self).__init__(**kwargs)
        if strides is None:
            strides = pool_size
        self.pool_size = conv_utils.normalize_tuple(pool_size, 2, 'pool_size')
        self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
        self.padding = conv_utils.normalize_padding(padding)
        self.data_format = normalize_data_format(data_format)
        self.input_spec = InputSpec(ndim=4)

    def compute_output_shape(self, input_shape):
        """Return the output shape: spatial dims follow the pooling arithmetic,
        batch and channel dims are unchanged."""
        if self.data_format == 'channels_first':
            rows = input_shape[2]
            cols = input_shape[3]
        elif self.data_format == 'channels_last':
            rows = input_shape[1]
            cols = input_shape[2]
        rows = conv_utils.conv_output_length(rows, self.pool_size[0],
                                             self.padding, self.strides[0])
        cols = conv_utils.conv_output_length(cols, self.pool_size[1],
                                             self.padding, self.strides[1])
        if self.data_format == 'channels_first':
            return (input_shape[0], input_shape[1], rows, cols)
        elif self.data_format == 'channels_last':
            return (input_shape[0], rows, cols, input_shape[3])

    def _pooling_function(self, inputs, pool_size, strides,
                              padding, data_format):
        """Return, per window, the value of largest magnitude with its original sign."""
        
        # signed_max: ordinary max pooling of the inputs.
        # abs_max:    max pooling of abs(inputs), i.e. the largest magnitude per window.
        # The two agree exactly when the largest-magnitude element is non-negative;
        # where they differ, a negative element dominates and the sign must be flipped.
        
        signed_max = K.pool2d(inputs, pool_size, strides,
                              padding, data_format,
                              pool_mode='max')
        abs_max = K.pool2d(K.abs(inputs), pool_size, strides,
                           padding, data_format,
                           pool_mode='max')
        # Cast the boolean mask to the tensor's own dtype (not the Python `float`
        # type) so the product below never mixes dtypes. The mask maps True -> +1
        # and False -> -1.
        sign = 2*K.cast(K.equal(signed_max, abs_max), K.dtype(abs_max))-1

        return abs_max*sign

    def call(self, inputs):
        output = self._pooling_function(inputs=inputs,
                                        pool_size=self.pool_size,
                                        strides=self.strides,
                                        padding=self.padding,
                                        data_format=self.data_format)
        return output

    def get_config(self):
        """Return the constructor arguments so a model using this layer can be
        serialized and rebuilt."""
        config = {'pool_size': self.pool_size,
                  'strides': self.strides,
                  'padding': self.padding,
                  'data_format': self.data_format}
        base_config = super(MaxAbsPooling2D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class getPredictionAfterEpoch(Callback):
    
    """Callback that records prediction on validation data over epochs of training
       To call bayesian neural network method, one just average the prediction from last N epochs of neural net

       Attributes:
           outputs: list with one entry per finished epoch, each the result of
                    self.model.predict on the validation inputs.
    """
    
    def __init__(self):
        # Initialize the Callback base class so its own attributes exist
        # before training starts.
        super(getPredictionAfterEpoch, self).__init__()
        self.outputs = []
    
    def on_epoch_end(self, epoch, logs=None):
        """Append the model's prediction on the validation inputs for this epoch.

        Raises:
            RuntimeError: if no validation data has been attached to the callback
            (e.g. training was started without `validation_data`).
        """
        validation_data = getattr(self, 'validation_data', None)
        if not validation_data:
            raise RuntimeError('getPredictionAfterEpoch needs validation data; '
                               'pass validation_data to fit()')
        # validation_data[0] holds the validation inputs.
        self.outputs.append(self.model.predict(validation_data[0]))



class CondensedAlexNet:
    """Reduced AlexNet-style classifier: three Conv2D + pooling blocks, then
    Dropout, Flatten, two Dense + Dropout stages and a softmax output.

    Parameters
    ----------
    image_size, channel_size : int
        Input samples have shape (image_size, image_size, channel_size).
    nClass : int
        Number of units of the softmax output layer.
    filters, kernel_size, pool_size : sequence of 3 ints
        Per-block number of filters, (square) kernel side and (square) pool side.
    strides : sequence of 6 ints
        Strides in layer order: conv1, pool1, conv2, pool2, conv3, pool3.
    MaxAbsPool : bool
        True  -> pooling layers are MaxAbsPooling2D (sign-preserving max-abs pooling).
        False -> pooling layers are the standard MaxPooling2D.
    padding : sequence of str
        Padding modes in the same layer order as `strides` (first 6 entries are used).
    conv_activation, dense_activation : str
        Activations of the convolutional and the hidden dense layers.
    denseNeurons : sequence of 2 ints
        Units of the two hidden dense layers.
    dropout : float
        Rate shared by all three Dropout layers.
    isBayesian : bool
        If True, `fit` records the validation predictions after every epoch in
        `self.myCallBack.outputs`.
    """

    def __init__(self,
                image_size=16, 
                channel_size=4,
                nClass=3,
                filters=[6, 8, 4], 
                kernel_size=[2, 2, 2], 
                pool_size=[2, 2, 2],
                strides=[1, 1, 1, 1, 1, 1],
                MaxAbsPool=True, 
                padding=['same']*8, 
                conv_activation='tanh',
                denseNeurons=[64, 20], 
                dropout=0.2,
                dense_activation='tanh',
                isBayesian=False):
        
        self.nClass = nClass
        self.image_size = image_size
        self.channel_size = channel_size
        # Copy the sequences so instances never share (or mutate) the default lists.
        self.filters = list(filters)
        self.kernel_size = list(kernel_size)
        self.pool_size = list(pool_size)
        self.strides = list(strides)
        self.MaxAbsPool = MaxAbsPool
        self.padding = list(padding)
        self.conv_activation = conv_activation
        self.denseNeurons = list(denseNeurons)
        self.dropout = dropout
        self.dense_activation = dense_activation
        self.isBayesian = isBayesian

        # Honour the MaxAbsPool flag when choosing the pooling layer type.
        pooling_layer = MaxAbsPooling2D if self.MaxAbsPool else MaxPooling2D

        self.model = Sequential()
        for block in range(3):
            # strides / padding are indexed per layer: conv at 2*block, pool at 2*block+1.
            conv_idx, pool_idx = 2*block, 2*block + 1
            conv_kwargs = dict(filters=self.filters[block],
                               kernel_size=(self.kernel_size[block], self.kernel_size[block]),
                               strides=(self.strides[conv_idx], self.strides[conv_idx]),
                               activation=self.conv_activation,
                               padding=self.padding[conv_idx])
            if block == 0:
                # Only the first layer declares the input shape and data format.
                conv_kwargs['input_shape'] = (self.image_size, self.image_size, self.channel_size)
                conv_kwargs['data_format'] = 'channels_last'
            self.model.add(Conv2D(**conv_kwargs))

            self.model.add(pooling_layer(pool_size=(self.pool_size[block], self.pool_size[block]),
                                         strides=(self.strides[pool_idx], self.strides[pool_idx]),
                                         padding=self.padding[pool_idx]))
        
        self.model.add(Dropout(self.dropout))
        
        self.model.add(Flatten())
        
        for units in self.denseNeurons[:2]:
            self.model.add(Dense(units, activation=self.dense_activation))
            self.model.add(Dropout(self.dropout))
        self.model.add(Dense(self.nClass, activation='softmax'))
        
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True),
                           metrics=['accuracy'])
        
        
    def fit(self, X, Y, valX=None, valY=None, verbose=1, epochs=20, batch_size=256):
        """Train the underlying Keras model.

        Validation data is forwarded only when both `valX` and `valY` are given.
        With isBayesian=True a getPredictionAfterEpoch callback is attached as
        `self.myCallBack`; otherwise `self.myCallBack` is None. The Keras History
        object is kept in `self.history`.

        Raises:
            ValueError: if isBayesian is True but no validation data was supplied.
        """
        has_validation = valX is not None and valY is not None
        fit_kwargs = dict(epochs=epochs, batch_size=batch_size, verbose=verbose)
        if has_validation:
            fit_kwargs['validation_data'] = (valX, valY)

        if self.isBayesian:
            if not has_validation:
                raise ValueError('isBayesian=True requires valX and valY')
            self.myCallBack = getPredictionAfterEpoch()
            fit_kwargs['callbacks'] = [self.myCallBack]
        else:
            self.myCallBack = None

        self.history = self.model.fit(X, Y, **fit_kwargs)

    def predict(self, x):
        """Return the model's class probabilities for `x`."""
        return self.model.predict(x)
    
    def predict_classes(self, x):
        """Return the model's predicted class indices for `x`."""
        return self.model.predict_classes(x)
    
            

In [99]:
# Build a CondensedAlexNet with all default hyper-parameters and train it for the
# default 20 epochs with batch size 128, validating on (X_val, Y_val).
an = CondensedAlexNet()
an.fit(X = X_train, Y = Y_train, valX = X_val, valY = Y_val, batch_size=128)

Train on 740137 samples, validate on 246713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [102]:
# Baseline model for comparison with `an`. Pass MaxAbsPool=False explicitly so this
# model is configured as the plain max-pooling variant its name refers to, instead of
# being constructed with exactly the same arguments as `an`.
an_maxpool = CondensedAlexNet(MaxAbsPool=False)
an_maxpool.fit(X = X_train, Y = Y_train, valX = X_val, valY = Y_val, batch_size=128)

Train on 740137 samples, validate on 246713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
