In [1]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Limit this process to a 0.3 fraction of GPU memory and register the resulting
# TF1 session as the one the Keras backend uses.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

Using TensorFlow backend.


In [2]:
import functools
import numpy as np
import pandas as pd

from scipy.special import expit

import sklearn as sk
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from tensorflow import keras
import tensorflow as tf

def split_companies_train_dev_test(companies):
    """Split `companies` into train, dev and test frames, stratified by sector.

    10% of the rows are held out as the test set first; 10% of what remains
    is then held out as the dev set. Both splits stratify on the `sector`
    column.

    Args:
        companies: DataFrame with a `sector` column.

    Returns:
        Tuple `(train, dev, test)`.
    """
    remainder, test = train_test_split(
        companies, test_size=0.1, stratify=companies.sector)
    train, dev = train_test_split(
        remainder, test_size=0.1, stratify=remainder.sector)
    return train, dev, test


def filter_stocks(stocks, tickers):
    """Return the rows of `stocks` whose index labels are in `tickers`.

    Selection is label-based (`.loc`), so rows come back in the order given
    by `tickers`.
    """
    selected = stocks.loc[tickers]
    return selected


def df_to_ts(df):
    """Return a copy of `df` indexed by its parsed `date` column.

    The `date` column is kept in the result. The previous implementation
    called `res.drop('date', axis=1)` without using the return value, so the
    column was never actually removed. Callers in this file rely on `date`
    still being present (as a merge key and for re-indexing), so the dead
    statement is removed rather than "fixed".

    Args:
        df: DataFrame with a `date` column parseable by `pd.to_datetime`.

    Returns:
        A new DataFrame with a `DatetimeIndex`; the input is not modified.
    """
    res = df.copy()
    res.index = pd.DatetimeIndex(pd.to_datetime(res.date))
    return res


def log_softmax(x):
    """Return the log-softmax of `x`, computed over all of its elements.

    Uses the max-shifted log-sum-exp form so that large inputs do not
    overflow in `np.exp` (the naive form yields `-inf` for e.g. `[1000, 1000]`).

    Args:
        x: array-like of numbers.

    Returns:
        Float ndarray of the same shape as `x`. If `x` contains NaN the
        result is all NaN, as with the naive formula.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x

    shift = np.max(x)
    if not np.isfinite(shift):
        # NaN or infinite maximum: shifting would not help, so fall back to
        # the unshifted formula and let NaN/inf propagate as before.
        shift = 0.0

    shifted = x - shift
    return shifted - np.log(np.sum(np.exp(shifted)))


def sigmoid(x):
    """Return the logistic sigmoid of `x` (thin wrapper around `expit`)."""
    activated = expit(x)
    return activated


def sample_correlation(df, window_size=63):
    """Correlate `pct_return` with every other column over one random window.

    A contiguous window of `window_size` rows is drawn uniformly from all
    windows that fit in `df`. The previous bound excluded the last valid
    window and rejected a frame of exactly `window_size` rows, because
    `np.random.randint`'s upper bound is exclusive.

    Args:
        df: DataFrame with a `pct_return` column. Every column after the
            first is correlated against it (the first column is skipped,
            presumably because it is `pct_return` itself; verify against
            the caller).
        window_size: number of consecutive rows in the sampled window.

    Returns:
        1-D ndarray with one correlation per column after the first.

    Raises:
        ValueError: if `df` has fewer than `window_size` rows.
    """
    n_windows = df.shape[0] - window_size + 1
    if n_windows < 1:
        raise ValueError(
            "df has %d rows, fewer than window_size=%d" % (df.shape[0], window_size))

    idx = np.random.randint(0, n_windows)
    ts = df[idx:idx + window_size]
    target = ts['pct_return']
    return np.array([target.corr(ts[name]) for name in ts.columns[1:]])


def create_correlation_score(df, sample_size=1):
    """Average softmax-style correlation scores over random windows of `df`.

    For each of `sample_size` draws, the sampled correlations are divided by
    0.05 and passed through `log_softmax`. The log-scores are averaged across
    draws with `np.nanmean` (NaN entries ignored) and exponentiated.

    Args:
        df: DataFrame accepted by `sample_correlation`.
        sample_size: number of random windows to draw.

    Returns:
        1-D ndarray with one score per correlated column.
    """
    log_scores = []
    for _ in range(sample_size):
        scaled = sample_correlation(df) / 0.05
        log_scores.append(log_softmax(scaled))
    return np.exp(np.nanmean(np.array(log_scores), 0))


def load_data(stock_filename=None, indices_filename=None):
    """Load stock and sector-index returns and assemble the modelling tables.

    Args:
        stock_filename: CSV of per-stock rows; must provide at least the
            `ticker`, `name`, `sector`, `date` and `close` columns used below.
            Defaults to '../../data/processed/wiki_stocks_returns.csv'.
        indices_filename: CSV with a `date` column plus one column per sector
            name. Defaults to '../../data/processed/wiki_indices_returns.csv'.

    Returns:
        Tuple `(stocks_all, companies, label_encoder, ticker_to_sector)`:
        - stocks_all: per-ticker frames produced by
          `groupby('ticker').apply(df_to_ts)`, with `close` renamed to
          `pct_return` and the `sector`, `ticker` and `date` columns dropped.
        - companies: the first row of each ticker, with `ticker` as a column.
        - label_encoder: `LabelEncoder` fitted on the sector names.
        - ticker_to_sector: dict mapping ticker to encoded sector id.
    """

    if stock_filename is None:
        stock_filename = '../../data/processed/wiki_stocks_returns.csv'

    if indices_filename is None:
        indices_filename = '../../data/processed/wiki_indices_returns.csv'

    stocks = pd.read_csv(stock_filename, index_col=False) # long format
    indices = pd.read_csv(indices_filename, index_col=False) # wide format

    # Helper: drop a column by position (the first one by default).
    drop_column = lambda df,i=0: df.drop(df.columns[i], axis=1)

    # Drop the first column by position (presumably an unnamed index column
    # saved with the CSV; TODO confirm), the `name` column, and rows with NaN.
    stocks = drop_column(stocks)
    stocks = stocks.drop('name', axis=1)
    stocks = stocks.dropna()

    # One row per ticker; sectors are ordered by descending company count.
    companies = stocks.groupby('ticker').first().reset_index()
    sectors_counts = companies.sector.value_counts()
    # NOTE(review): sectors_proportions is computed but not used in this function.
    sectors_proportions = sectors_counts/sectors_counts.sum()
    sectors_unique = sectors_counts.index.tolist()

    stocks = stocks.set_index('ticker')

    # Keep only the index columns for sectors that actually occur in `companies`.
    indices_ts = df_to_ts(indices[['date'] + sectors_unique])
    stocks_ts = df_to_ts(stocks.reset_index())

    # Left join with no explicit key, so pandas joins on the columns the two
    # frames share (presumably just `date`; TODO confirm). This relies on
    # df_to_ts keeping the `date` column.
    stocks_all = pd.merge(stocks_ts, indices_ts, 'left')
    stocks_all = stocks_all.dropna() # loss of 200 000 observations
    stocks_all = stocks_all.drop('sector', axis=1)
    # Re-index each ticker's rows by date; `date` must still be a column here.
    stocks_all = stocks_all.groupby('ticker').apply(df_to_ts)
    stocks_all = stocks_all.drop(['ticker', 'date'], axis=1)
    stocks_all = stocks_all.rename(columns={'close': 'pct_return'})

    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(sectors_counts.index.tolist())
    ticker_to_sector = dict(zip(companies.ticker, label_encoder.transform(companies.sector)))

    return stocks_all, companies, label_encoder, ticker_to_sector

def sectors_statistics(companies):
    """Summarise how companies are distributed across sectors.

    Args:
        companies: DataFrame with a `sector` column.

    Returns:
        Tuple `(counts, proportions, names)`: the `value_counts()` Series,
        the same Series divided by its total, and the sector names as a list
        in the Series' order.
    """
    counts = companies.sector.value_counts()
    total = counts.sum()
    return counts, counts / total, counts.index.tolist()


def add_common_layers(y):
    """Apply a fresh BatchNormalization layer followed by LeakyReLU to `y`."""
    normalized = keras.layers.BatchNormalization()(y)
    return keras.layers.LeakyReLU()(normalized)


def grouped_convolution(y, nb_channels, _strides, cardinality=4):
    """Apply a grouped 1-D convolution (ResNeXt style) to `y`.

    The last axis of `y` is sliced into `cardinality` equal groups of
    `nb_channels // cardinality` channels. Each group gets its own `Conv1D`
    (kernel size 10, 'same' padding) and the results are concatenated.

    Args:
        y: input tensor; the slicing below indexes three axes and treats the
            last one as channels.
        nb_channels: total number of output channels; must be divisible by
            `cardinality`.
        _strides: stride passed to every convolution.
        cardinality: number of groups; 1 gives a single standard convolution.

    Returns:
        The output tensor of the (grouped) convolution.
    """
    # when `cardinality` == 1 this is just a standard convolution
    if cardinality == 1:
        return keras.layers.Conv1D(nb_channels, kernel_size=10, strides=_strides, padding='same')(y)

    assert not nb_channels % cardinality
    _d = nb_channels // cardinality

    # in a grouped convolution layer, input and output channels are divided into `cardinality` groups,
    # and convolutions are separately performed within each group
    groups = []
    for j in range(cardinality):
        start = j * _d
        # Bind the slice bounds as default arguments. A lambda that closes
        # over the loop variable reads it at call time, so any invocation
        # after the loop has advanced would slice the wrong group.
        group = keras.layers.Lambda(
            lambda z, start=start, stop=start + _d: z[:, :, start:stop])(y)
        groups.append(keras.layers.Conv1D(_d, kernel_size=10, strides=_strides, padding='same')(group))

    # the grouped convolutional layer concatenates them as the outputs of the layer
    y = keras.layers.concatenate(groups)

    return y


def residual_block(y, nb_channels_in, nb_channels_out, cardinality=4, _strides=1, _project_shortcut=False):
    """
    Build one bottleneck residual block (ResNeXt style) on top of `y`.

    Our network consists of a stack of residual blocks. These blocks have the same topology,
    and are subject to two simple rules:
    - If producing spatial maps of the same size, the blocks share the same hyper-parameters (width and filter sizes).
    - Each time the spatial map is down-sampled by a factor of 2, the width of the blocks is multiplied by a factor of 2.

    # Arguments:
        y (tensor): block input.
        nb_channels_in (int): width of the bottleneck (1x1 and grouped convolutions).
        nb_channels_out (int): number of channels produced by the block.
        cardinality (int): number of groups for the grouped convolution. It is now
            forwarded to `grouped_convolution`; previously it was accepted but ignored,
            so the default grouping of 4 was always used.
        _strides (int): stride of the grouped convolution and of the projection shortcut.
        _project_shortcut (bool): force a 1x1 projection on the shortcut branch.
    # Returns:
        y (tensor): block output after the final LeakyReLU.
    """
    shortcut = y
    kl = keras.layers
    # we modify the residual building block as a bottleneck design to make the network more economical
    y = kl.Conv1D(nb_channels_in, kernel_size=1, strides=1, padding='same')(y)
    y = add_common_layers(y)

    # ResNeXt (identical to ResNet when `cardinality` == 1)
    y = grouped_convolution(y, nb_channels_in, _strides=_strides, cardinality=cardinality)
    y = add_common_layers(y)

    y = kl.Conv1D(nb_channels_out, kernel_size=1, strides=1, padding='same')(y)
    # batch normalization is employed after aggregating the transformations and before adding to the shortcut
    y = kl.BatchNormalization()(y)

    # identity shortcuts used directly when the input and output are of the same dimensions
    if _project_shortcut or _strides != 1:
        # when the dimensions increase projection shortcut is used to match dimensions (done by 1×1 convolutions)
        # when the shortcuts go across feature maps of two sizes, they are performed with a stride of 2
        shortcut = kl.Conv1D(nb_channels_out, kernel_size=1, strides=_strides, padding='same')(shortcut)
        shortcut = kl.BatchNormalization()(shortcut)

    y = kl.add([shortcut, y])

    # the activation (LeakyReLU here) is applied right after each batch normalization,
    # except for the output of the block, where it is applied after adding the shortcut
    y = kl.LeakyReLU()(y)

    return y



# reparameterization trick
# instead of sampling from Q(z|X), sample eps = N(0,I)
# z = z_mean + sqrt(var)*eps
def sampling(args):
    """Reparameterization trick: sample from an isotropic unit Gaussian.

    # Arguments:
        args (tensor): mean and log of variance of Q(z|X)
    # Returns:
        z (tensor): sampled latent vector
    """
    mean, log_var = args
    n_rows = tf.shape(mean)[0]
    n_dims = keras.backend.int_shape(mean)[1]
    # random_normal defaults to mean=0 and std=1.0
    noise = tf.random_normal(shape=(n_rows, n_dims))
    std = tf.exp(0.5 * log_var)
    return mean + std * noise

class CovarianceLayer(keras.layers.Layer):
    """Mean over axis 1 of the product between a series and each environment channel.

    The layer takes two inputs, `[series_input, environment_input]`. The series
    is tiled `num_classes` times along its last axis, multiplied elementwise
    with the environment input, and averaged over axis 1. No mean-centering is
    done, so despite the name the result is a mean cross-product rather than a
    centered covariance. The layer has no trainable weights.
    """

    def __init__(self, num_classes=None, **kwargs):
        """Store `num_classes` (defaults to 16) and forward `kwargs` to `Layer`."""
        if num_classes is None:
            num_classes = 16
        self.num_classes = num_classes
        super(CovarianceLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # No weights to create; only mark the layer as built.
        super(CovarianceLayer, self).build(input_shape)

    def call(self, inputs):
        series_input, environment_input = inputs
        series_input_multiple = tf.tile(series_input, [1, 1, self.num_classes])
        return tf.reduce_mean(series_input_multiple * environment_input, axis=1)

    def compute_output_shape(self, input_shape):
        # The layer is called on a two-element list, so `input_shape` is taken
        # to be a list of two shapes, with the batch size in the first shape.
        # The previous code read `self.output_dim`, which is never set.
        series_shape = input_shape[0]
        return (series_shape[0], self.num_classes)

    def get_config(self):
        """Include `num_classes` so the layer can be rebuilt from its config."""
        config = super(CovarianceLayer, self).get_config()
        config['num_classes'] = self.num_classes
        return config


def random_subset(df, window_size=21):
    """Return a random contiguous window of `window_size` rows from `df`.

    The start position is drawn uniformly from every position where a full
    window fits. The previous bound excluded the last valid window and
    rejected a frame of exactly `window_size` rows, because
    `np.random.randint`'s upper bound is exclusive.

    Args:
        df: DataFrame (or other positionally sliceable object with `.shape`).
        window_size: number of consecutive rows to return.

    Returns:
        The slice `df[idx:idx + window_size]`.

    Raises:
        ValueError: if `df` has fewer than `window_size` rows.
    """
    n_windows = df.shape[0] - window_size + 1
    if n_windows < 1:
        raise ValueError(
            "df has %d rows, fewer than window_size=%d" % (df.shape[0], window_size))

    idx = np.random.randint(0, n_windows)
    return df[idx:idx + window_size]


def make_keras_subset(dataset_type, companies_data, stocks_data, label_encoder, batch_size, window_size=21):
    """Draw one random batch of model inputs and labels for a dataset split.

    Companies are sampled with `np.random.choice` (default arguments, so with
    replacement). For each sampled ticker a random window of `window_size`
    rows is taken from that ticker's stock frame.

    Args:
        dataset_type: key into `companies_data` and `stocks_data`.
        companies_data: dict of company frames with `ticker` and `sector` columns.
        stocks_data: dict of stock frames indexable by ticker via `.loc`.
        label_encoder: fitted encoder used to transform the sector names.
        batch_size: number of companies to sample.
        window_size: rows per sampled window.

    Returns:
        Tuple `(series, environment, labels)`: the `pct_return` values
        reshaped to `(-1, window_size, 1)`, the values of every column after
        the first for each window, and the encoded sectors.
    """
    rows = np.random.choice(companies_data[dataset_type].shape[0], batch_size)
    batch_companies = companies_data[dataset_type].iloc[rows]

    windows = [
        random_subset(stocks_data[dataset_type].loc[ticker], window_size)
        for ticker in batch_companies.ticker
    ]

    series = np.array([window['pct_return'].values for window in windows])
    series = series.reshape(-1, window_size, 1)
    environment = np.array([window.iloc[:, 1:].values for window in windows])
    labels = label_encoder.transform(batch_companies.sector)

    return series, environment, labels


class StocksSequence(keras.utils.Sequence):
    """Keras `Sequence` that serves randomly sampled stock windows.

    Every `__getitem__` call ignores the requested index and draws a fresh
    random batch, so batches are not reproducible by index.

    NOTE(review): sampling uses NumPy's global RNG. When this sequence is
    consumed by several worker processes, confirm that the workers do not
    share the same RNG state.
    """

    def __init__(self, stocks_data,  companies_data, window_size, label_encoder, batch_size):
        self.stocks_data, self.companies_data = stocks_data, companies_data
        self.window_size = window_size
        self.label_encoder = label_encoder
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches: rows of `stocks_data` divided by the batch size, rounded up.
        n_batches = np.ceil(self.stocks_data.shape[0] / float(self.batch_size))
        return int(n_batches)

    def __getitem__(self, idx):
        # `idx` is deliberately unused: companies are sampled at random.
        rows = np.random.choice(self.companies_data.shape[0], self.batch_size)
        batch_companies = self.companies_data.iloc[rows]

        windows = [
            random_subset(self.stocks_data.loc[ticker], self.window_size)
            for ticker in batch_companies.ticker
        ]

        series = np.array([window['pct_return'].values for window in windows])
        series = series.reshape(-1, self.window_size, 1)
        environment = np.array([window.iloc[:, 1:].values for window in windows])
        labels = self.label_encoder.transform(batch_companies.sector)

        return [series, environment], labels





In [3]:

def make_model(num_classes=16, window_size=21, latent_dim=32):
    """Build the two-input sector classifier (not compiled).

    Two branches are summed before the classification head:
    - a convolutional branch over the concatenated series and environment inputs;
    - a `CovarianceLayer` branch over the tanh-squashed inputs, followed by two Dense layers.

    # Arguments:
        num_classes (int): number of environment channels and of output classes.
        window_size (int): number of time steps per sample.
        latent_dim (int): not used in the function body.
    # Returns:
        model (keras.Model): model named 'Classifier' with inputs
            `[series_input, environment_input]` and one softmax output.
    """
    kl = keras.layers

    series_input = keras.layers.Input(shape=(window_size, 1), dtype='float32', name='series_input')
    environment_input = kl.Input(shape=(window_size, num_classes), dtype='float32', name='environment_input')
    # Shared squashing layer, tanh(5 * y); the same Lambda instance is applied three times below.
    tanh_layer = kl.Lambda(lambda y: tf.tanh(tf.scalar_mul(5, y)))
    
    # Convolutional branch. Conv1D positional arguments are (filters, kernel_size, strides).
    x = kl.Concatenate()([series_input, environment_input])
    x = tanh_layer(x)
    x = kl.Conv1D(16, 5, 1, activation='relu')(x)
    # The fourth positional argument of residual_block is `cardinality`.
    x = residual_block(x, 16, 16, 2)
    # x = kl.Dropout(0.001)(x)
    x = kl.Conv1D(32, 5, 2, activation='relu')(x)
    # x = kl.Dropout(0.05)(x)
    x = residual_block(x, 32, 32, 2)
    # x = kl.Dropout(0.05)(x)
    x = kl.Conv1D(64, 5, 2, activation='relu')(x)
    # x = kl.Dropout(0.1)(x)
    # x = kl.MaxPool1D()(x)

    # Covariance branch: series/environment cross-products, widened to 64 units
    # so it can be added to the convolutional branch.
    covariances = CovarianceLayer(num_classes)([tanh_layer(series_input), tanh_layer(environment_input)])
    covariances = kl.Dense(32, activation='relu')(covariances)
    covariances = kl.Dense(64, activation='relu')(covariances)

    x = kl.Flatten()(x)
    x = kl.Dense(64, 'relu')(x)
    x = kl.Dropout(0.1)(x)
    # Elementwise sum of the two 64-unit branches; the layer is named 'Embedding'.
    z = kl.Add(name='Embedding')([x, covariances])
    z = kl.Dense(64, 'relu')(z)
    z = kl.Dense(32, 'relu')(z)
    x_pred = kl.Dense(num_classes, 'softmax')(z)

    model = keras.Model(inputs = [series_input, environment_input], outputs=[x_pred], name='Classifier')

    # Disabled KL-divergence loss term; `x_mu` and `x_log_var` are not defined in this function.
    # kl_batch = - .5 * tf.reduce_sum(1 + x_log_var - tf.square(x_mu) - tf.exp(x_log_var), axis=-1)
    # model.add_loss(kl_batch)

    return model

In [4]:
# Make train dev test set.
# Seed NumPy's global RNG before the data is loaded and split.
np.random.seed(42)

### Feature engineering

stock_filename = '../data/processed/wiki_stocks_returns.csv'
indices_filename = '../data/processed/wiki_indices_returns.csv'

stocks_all, companies, label_encoder, ticker_to_sector = load_data(stock_filename, indices_filename)
sectors_counts, sectors_proportions, sectors_unique = sectors_statistics(companies)

max_proportion_baseline = sectors_proportions.max()
# idxmax() returns the index label (the sector name). Series.argmax() only
# did so on old pandas; newer releases return the integer position instead.
biggest_sector = sectors_proportions.idxmax()

print("Most representated class:", biggest_sector, ', with proportion of ', round(100*max_proportion_baseline, 2), '%.')
# Accuracy of our models should be better than max_proportion_baseline.

# Split the companies, then keep for each split only the stock rows of its tickers.
data_split = split_companies_train_dev_test(companies)
companies_data = dict(zip(['train', 'dev', 'test'], data_split))
stocks_data = {k: filter_stocks(stocks_all, v.ticker) for k, v in companies_data.items()}


Most representated class: Financial Services , with proportion of  13.09 %.


In [5]:
# Build the classifier with a 63-step window (make_model's own default is 21).
window_size = 63
model = make_model(window_size=window_size)

In [6]:
# Disabled by default: change the condition to True to restore weights from
# 'checkpoint/model_weights.json' before training.
if False:
    model.load_weights('checkpoint/model_weights.json')

In [7]:
import math
def schedule_fn(epoch):
    """Return the learning rate for `epoch`: step decay with periodic restarts.

    The schedule restarts every 20 epochs. Within a cycle the rate starts at
    0.001 and is halved once for every 2 epochs elapsed, counting from
    `epoch + 1`.
    """
    base_rate, decay_factor, epochs_per_drop = 0.001, 0.5, 2.0
    position = epoch % 20
    n_drops = math.floor((1 + position) / epochs_per_drop)
    return base_rate * math.pow(decay_factor, n_drops)

In [8]:
batch_size = 64

optimizer = keras.optimizers.Adam(0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# Checkpoint whenever validation accuracy improves, log to TensorBoard, and
# let schedule_fn set the learning rate at the start of every epoch.
callbacks = [
    keras.callbacks.ModelCheckpoint('checkpoint/model_weights.json', monitor='val_acc', verbose=1, save_best_only=True, mode='max'),
    keras.callbacks.TensorBoard('logs/seventh'),
    keras.callbacks.LearningRateScheduler(schedule_fn)
]

stocks_sequence_training = StocksSequence(stocks_data['train'], companies_data['train'], window_size, label_encoder, batch_size)
stocks_sequence_validation = StocksSequence(stocks_data['dev'], companies_data['dev'], window_size, label_encoder, batch_size)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
series_input (InputLayer)       (None, 63, 1)        0                                            
__________________________________________________________________________________________________
environment_input (InputLayer)  (None, 63, 16)       0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 63, 17)       0           series_input[0][0]               
                                                                 environment_input[0][0]          
______________________________________________________________

In [None]:
# Train from the random-batch sequences: 2000 batches per epoch for 400 epochs,
# validating on 100 batches of the dev split after each epoch.
# NOTE(review): StocksSequence draws batches from NumPy's global RNG; with
# use_multiprocessing=True, confirm the worker processes do not produce
# identical batches.
model.fit_generator(
    stocks_sequence_training, steps_per_epoch=2000, epochs=400, callbacks=callbacks,
    validation_data = stocks_sequence_validation, validation_steps=100, workers=4, max_queue_size=20, verbose=1,
    use_multiprocessing=True)

stocks_sequence_test = StocksSequence(stocks_data['test'], companies_data['test'], window_size, label_encoder, batch_size)

# Evaluate on 400 randomly drawn batches from the test split.
model.evaluate_generator(stocks_sequence_test, 400, workers=2, use_multiprocessing=True)

Epoch 1/400

Epoch 2/400

Epoch 3/400

Epoch 4/400

Epoch 5/400

Epoch 6/400

Epoch 7/400

Epoch 8/400

Epoch 9/400

Epoch 10/400

Epoch 11/400

Epoch 12/400

Epoch 13/400

Epoch 14/400

Epoch 15/400

Epoch 16/400

Epoch 17/400

Epoch 18/400

Epoch 19/400

Epoch 20/400

Epoch 21/400

Epoch 22/400

Epoch 23/400



Epoch 24/400

Epoch 25/400

Epoch 26/400

Epoch 27/400

Epoch 28/400

Epoch 29/400

Epoch 30/400

Epoch 31/400

Epoch 32/400

Epoch 33/400

Epoch 34/400

Epoch 35/400

Epoch 36/400

Epoch 37/400

Epoch 38/400

Epoch 39/400

Epoch 40/400

Epoch 41/400

Epoch 42/400

Epoch 43/400

Epoch 44/400

Epoch 45/400



Epoch 46/400

Epoch 47/400

Epoch 48/400

Epoch 49/400

Epoch 50/400

Epoch 51/400

Epoch 52/400

Epoch 53/400

Epoch 54/400

Epoch 55/400

Epoch 56/400

Epoch 57/400

Epoch 58/400

Epoch 59/400

Epoch 60/400

Epoch 61/400

Epoch 62/400

Epoch 63/400

Epoch 64/400

Epoch 65/400

Epoch 66/400

Epoch 67/400

Epoch 68/400

Epoch 69/400



Epoch 70/400

Epoch 71/400

Epoch 72/400

Epoch 73/400

Epoch 74/400

Epoch 75/400

Epoch 76/400

Epoch 77/400

Epoch 78/400

Epoch 79/400

Epoch 80/400

Epoch 81/400

Epoch 82/400

Epoch 83/400

Epoch 84/400

Epoch 85/400

Epoch 86/400

Epoch 87/400

Epoch 88/400

Epoch 89/400

Epoch 90/400

Epoch 91/400

Epoch 92/400



Epoch 93/400

Epoch 94/400

Epoch 95/400

Epoch 96/400

Epoch 97/400

Epoch 98/400

Epoch 99/400

Epoch 100/400

Epoch 101/400

Epoch 102/400

Epoch 103/400

Epoch 104/400

Epoch 105/400

Epoch 106/400

Epoch 107/400

Epoch 108/400

Epoch 109/400

Epoch 110/400

Epoch 111/400

Epoch 112/400

Epoch 113/400

Epoch 114/400

Epoch 115/400



Epoch 116/400

Epoch 117/400

Epoch 118/400

Epoch 119/400

Epoch 120/400

Epoch 121/400

Epoch 122/400

Epoch 123/400

Epoch 124/400

Epoch 125/400

Epoch 126/400

Epoch 127/400

Epoch 128/400

Epoch 129/400

Epoch 130/400

Epoch 131/400

Epoch 132/400

Epoch 133/400

Epoch 134/400

Epoch 135/400

Epoch 136/400

Epoch 137/400



Epoch 138/400

Epoch 139/400

Epoch 140/400

Epoch 141/400

Epoch 142/400

Epoch 143/400

Epoch 144/400

Epoch 145/400

Epoch 146/400

Epoch 147/400

Epoch 148/400

Epoch 149/400

Epoch 150/400

Epoch 151/400

Epoch 152/400

Epoch 153/400

Epoch 154/400

Epoch 155/400

Epoch 156/400

Epoch 157/400

Epoch 158/400

Epoch 159/400

Epoch 160/400

Epoch 161/400



Epoch 162/400

Epoch 163/400

Epoch 164/400

Epoch 165/400

Epoch 166/400

Epoch 167/400

Epoch 168/400

Epoch 169/400

Epoch 170/400

Epoch 171/400

Epoch 172/400

Epoch 173/400

Epoch 174/400

Epoch 175/400

Epoch 176/400

Epoch 177/400

Epoch 178/400

Epoch 179/400

Epoch 180/400

Epoch 181/400

Epoch 182/400

Epoch 183/400

Epoch 184/400



Epoch 185/400

Epoch 186/400

Epoch 187/400

Epoch 188/400

Epoch 189/400

Epoch 190/400

Epoch 191/400

Epoch 192/400

Epoch 193/400

Epoch 194/400

Epoch 195/400

Epoch 196/400

Epoch 197/400

Epoch 198/400

Epoch 199/400

Epoch 200/400

Epoch 201/400

Epoch 202/400

Epoch 203/400

Epoch 204/400

Epoch 205/400

Epoch 206/400

Epoch 207/400

Epoch 208/400



Epoch 209/400

Epoch 210/400

Epoch 211/400

Epoch 212/400

Epoch 213/400

Epoch 214/400

Epoch 215/400

Epoch 216/400

Epoch 217/400

Epoch 218/400

Epoch 219/400

Epoch 220/400

Epoch 221/400

Epoch 222/400

Epoch 223/400

Epoch 224/400

Epoch 225/400

Epoch 226/400

Epoch 227/400

Epoch 228/400

Epoch 229/400

Epoch 230/400

Epoch 231/400

Epoch 232/400



Epoch 233/400

Epoch 234/400

Epoch 235/400

Epoch 236/400

Epoch 237/400

Epoch 238/400

Epoch 239/400

Epoch 240/400

Epoch 241/400

Epoch 242/400

Epoch 243/400

Epoch 244/400

Epoch 245/400

Epoch 246/400

Epoch 247/400

Epoch 248/400

Epoch 249/400

Epoch 250/400

Epoch 251/400

Epoch 252/400

Epoch 253/400

Epoch 254/400

Epoch 255/400

Epoch 256/400



Epoch 257/400

Epoch 258/400

Epoch 259/400

Epoch 260/400

Epoch 261/400

Epoch 262/400

Epoch 263/400

Epoch 264/400

Epoch 265/400

Epoch 266/400

Epoch 267/400

Epoch 268/400

Epoch 269/400

Epoch 270/400

Epoch 271/400

Epoch 272/400

Epoch 273/400

Epoch 274/400

Epoch 275/400

Epoch 276/400

Epoch 277/400

Epoch 278/400

Epoch 279/400



Epoch 280/400

Epoch 281/400

Epoch 282/400

Epoch 283/400

Epoch 284/400

Epoch 285/400

Epoch 286/400

Epoch 287/400

Epoch 288/400

Epoch 289/400

Epoch 290/400

Epoch 291/400