In [1]:
import os
import tensorflow as tf
import sklearn as sk
from tensorflow import keras

from sklearn import manifold
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.15
set_session(tf.Session(config=config))


Using TensorFlow backend.


In [2]:
import functools
import numpy as np
import pandas as pd
from scipy.special import expit

import sklearn as sk
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report

from tensorflow import keras
import tensorflow as tf

def split_companies_train_dev_test(companies):
    "Return train, dev, test set for companies"
    train, test = train_test_split(companies, test_size=0.1, stratify = companies.sector)
    train, dev = train_test_split(train, test_size=0.1, stratify = train.sector)
    return train, dev, test


def filter_stocks(stocks, tickers):
    return stocks.loc[tickers]


def df_to_ts(df):
    res = df.copy()
    res.index = pd.DatetimeIndex(pd.to_datetime(res.date))
    res.drop('date', axis=1)
    return res


def log_softmax(x):
    return x - np.log(np.sum(np.exp(x)))


def sigmoid(x):
    return expit(x)


def load_data(stock_filename=None, indices_filename=None):

    if stock_filename is None:
        stock_filename = '../../data/processed/wiki_stocks_returns.csv'

    if indices_filename is None:
        indices_filename = '../../data/processed/wiki_indices_returns.csv'

    stocks = pd.read_csv(stock_filename, index_col=False) # long format
    indices = pd.read_csv(indices_filename, index_col=False) # wide format

    # Implementation of hierarchical clustering
    drop_column = lambda df,i=0: df.drop(df.columns[i], axis=1)

    stocks = drop_column(stocks)
    stocks = stocks.drop('name', axis=1)
    stocks = stocks.dropna()

    companies = stocks.groupby('ticker').first().reset_index()
    sectors_counts = companies.sector.value_counts()
    sectors_proportions = sectors_counts/sectors_counts.sum()
    sectors_unique = sectors_counts.index.tolist()

    stocks = stocks.set_index('ticker')

    indices_ts = df_to_ts(indices[['date'] + sectors_unique])
    stocks_ts = df_to_ts(stocks.reset_index())

    stocks_all = pd.merge(stocks_ts, indices_ts, 'left')
    stocks_all = stocks_all.dropna() # loss of 200 000 observations
    stocks_all = stocks_all.drop('sector', axis=1)
    stocks_all = stocks_all.groupby('ticker').apply(df_to_ts)
    stocks_all = stocks_all.drop(['ticker', 'date'], axis=1)
    stocks_all = stocks_all.rename(columns={'close': 'pct_return'})

    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(sectors_counts.index.tolist())
    ticker_to_sector = dict(zip(companies.ticker, label_encoder.transform(companies.sector)))

    return stocks_all, companies, label_encoder, ticker_to_sector

def sectors_statistics(companies):
    sectors_counts = companies.sector.value_counts()
    sectors_proportions = sectors_counts/sectors_counts.sum()
    sectors_unique = sectors_counts.index.tolist()
    return sectors_counts, sectors_proportions, sectors_unique

def random_subset(df, window_size=21):
    if df.shape[0]-window_size < 0:
        return df
    idx = np.random.randint(0, df.shape[0]-window_size)
    ts = df[idx:idx+window_size]
    return ts

class StocksSequence(keras.utils.Sequence):

    def __init__(self, stocks_data,  companies_data, window_size, label_encoder,
                 batch_size, mode_key='train'):
        self.stocks_data = stocks_data
        self.batch_size = batch_size
        self.label_encoder = label_encoder
        self.companies_data = companies_data
        self.window_size = window_size
        self.mode_key = mode_key
        self.classes = []

        _, sectors_proportion, _ = sectors_statistics(companies_data)
        self.sectors_proportion = sectors_proportion

    def __len__(self):
        return int(np.ceil(self.stocks_data.shape[0] / float(self.batch_size)))

    def __getitem__(self, idx):

        idx = np.random.choice(self.companies_data.shape[0], self.batch_size)
        df = self.companies_data.iloc[idx]
        model_input_data = [random_subset(self.stocks_data.loc[t], self.window_size)
                            for t in df.ticker]
        model_input = np.array([df.values for df in model_input_data])
        # correlation = np.array([np.corrcoef(x.T + 0.001*np.random.randn(*x.T.shape))[0, 1:] for x in model_input])
        if self.mode_key != 'infer':
            y_true = self.label_encoder.transform(df.sector)

        if self.mode_key == 'infer':
            if 'sector' in df.columns:
                self.classes.extend(self.label_encoder.transform(df.sector))

        if self.mode_key == 'infer':
            return model_input

        return model_input, y_true



In [4]:
def normalize_tensor(x):
    """
    x: a B x T x F tensor
    """
    epsilon = 1e-16
    m, s = tf.nn.moments(x, axes=[1], keep_dims=True)
    z = (x - m)/(tf.sqrt(s)+epsilon)
    return z

def correlation_function(x):
    x_normalized = normalize_tensor(x)
    correlations = tf.keras.backend.batch_dot(
        x_normalized, tf.transpose(x_normalized, [0, 2, 1]), axes=[1, 2])
    correlations = correlations/tf.cast(tf.shape(x_normalized)[1], tf.float32)
    return correlations

def correlation_roll(x):
    x_normalized = normalize_tensor(x)
    cross_product = x_normalized[:, :, 0, tf.newaxis] * x_normalized[:, :, 1:]
    cross_product = cross_product/tf.cast(tf.shape(x_normalized)[1], tf.float32)
    return cross_product

In [5]:
tf.reset_default_graph()
model = keras.models.load_model('checkpoint/model_weights_thirtythird.json')

In [6]:
stocks_all, companies, label_encoder, ticker_to_sector = load_data(
    '../data/processed/wiki_stocks_returns.csv',
    '../data/processed/wiki_indices_returns.csv')

sectors_counts, sectors_proportions, sectors_unique = sectors_statistics(companies)
companies_data = {}
data_split = split_companies_train_dev_test(companies)
for i, k in enumerate(['train', 'dev', 'test']):
    companies_data[k] = data_split[i]
stocks_data = {k: filter_stocks(stocks_all, v.ticker) for k, v in companies_data.items()}

In [7]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
series_input (InputLayer)       (None, 63, 17)       0                                            
__________________________________________________________________________________________________
gaussian_noise_1 (GaussianNoise (None, 63, 17)       0           series_input[0][0]               
__________________________________________________________________________________________________
lambda_6 (Lambda)               (None, 63, 17)       0           gaussian_noise_1[0][0]           
__________________________________________________________________________________________________
lambda_5 (Lambda)               (None, 63, 16)       0           gaussian_noise_1[0][0]           
__________________________________________________________________________________________________
lambda_7 (

In [8]:
from IPython.display import SVG
from tensorflow.keras import utils as keras_utils

keras_utils.plot_model(model, 'model_keras.png')


In [9]:
sequence_generator_test = StocksSequence(
    stocks_data['test'], companies_data['test'], 63, label_encoder, 256, 'eval')
y = model.evaluate_generator(sequence_generator_test, 300)

sequence_generator_infer = StocksSequence(
    stocks_data['test'], companies_data['test'], 63, label_encoder, 256, 'infer')

prediction = model.predict_generator(sequence_generator_infer, 300)

In [10]:

y_pred = np.argmax(prediction, 1)
y_true = np.array(sequence_generator_infer.classes[:y_pred.shape[0]])


conf_mat = confusion_matrix(y_true, y_pred)

conf_df = pd.DataFrame(conf_mat, columns=label_encoder.classes_.tolist(), index=label_encoder.classes_.tolist())
conf_df.to_csv('confusion_matrix.csv')
conf_df_ratio = conf_df.apply(lambda x: 100*round(x/x.sum(), 4), 0)
conf_df_ratio.to_csv('confusion_matrix_ratio.csv')

In [11]:
conf_df

Unnamed: 0,Business Services,Chemicals,Communication Equipment,Communication Services,Consumer Packaged Goods,Drug Manufacturers,Entertainment,Financial Services,Industrial Products,Insurance,Manufacturing - Apparel & Furniture,Medical,Oil and Gas,REITs,Retail - Apparel & Specialty,Technology
Business Services,43,54,13,40,64,11,1547,434,83,74,35,216,49,166,152,247
Chemicals,24,21,3,13,67,8,141,208,264,68,1637,66,258,77,140,178
Communication Equipment,3,3,737,20,23,10,23,165,96,17,5,43,21,12,58,825
Communication Services,30,10,15,980,77,13,78,183,38,14,36,86,64,234,57,169
Consumer Packaged Goods,67,95,22,66,1743,39,205,192,77,32,75,408,256,187,263,338
Drug Manufacturers,62,16,9,32,109,776,59,124,23,13,40,480,72,110,60,177
Entertainment,885,8,21,15,35,1,216,389,121,20,62,187,103,220,134,598
Financial Services,65,81,27,124,120,17,568,6676,172,558,69,452,262,511,294,461
Industrial Products,89,39,18,29,180,23,477,448,1895,60,247,208,362,266,257,772
Insurance,47,33,28,19,86,2,105,1108,45,687,20,216,164,238,106,183


In [12]:
conf_df_ratio

Unnamed: 0,Business Services,Chemicals,Communication Equipment,Communication Services,Consumer Packaged Goods,Drug Manufacturers,Entertainment,Financial Services,Industrial Products,Insurance,Manufacturing - Apparel & Furniture,Medical,Oil and Gas,REITs,Retail - Apparel & Specialty,Technology
Business Services,2.72,3.63,1.21,2.47,2.06,0.98,33.97,3.56,2.27,4.4,1.35,2.73,0.53,2.34,2.03,2.39
Chemicals,1.52,1.41,0.28,0.8,2.15,0.71,3.1,1.71,7.22,4.05,63.08,0.83,2.79,1.08,1.87,1.72
Communication Equipment,0.19,0.2,68.49,1.23,0.74,0.89,0.51,1.35,2.62,1.01,0.19,0.54,0.23,0.17,0.78,7.97
Communication Services,1.9,0.67,1.39,60.49,2.47,1.15,1.71,1.5,1.04,0.83,1.39,1.09,0.69,3.29,0.76,1.63
Consumer Packaged Goods,4.23,6.38,2.04,4.07,55.97,3.46,4.5,1.57,2.1,1.9,2.89,5.15,2.77,2.63,3.52,3.26
Drug Manufacturers,3.92,1.08,0.84,1.98,3.5,68.86,1.3,1.02,0.63,0.77,1.54,6.06,0.78,1.55,0.8,1.71
Entertainment,55.91,0.54,1.95,0.93,1.12,0.09,4.74,3.19,3.31,1.19,2.39,2.36,1.11,3.09,1.79,5.78
Financial Services,4.11,5.44,2.51,7.65,3.85,1.51,12.47,54.74,4.7,33.21,2.66,5.7,2.83,7.19,3.93,4.45
Industrial Products,5.62,2.62,1.67,1.79,5.78,2.04,10.47,3.67,51.79,3.57,9.52,2.62,3.92,3.74,3.44,7.46
Insurance,2.97,2.22,2.6,1.17,2.76,0.18,2.31,9.09,1.23,40.89,0.77,2.73,1.77,3.35,1.42,1.77


In [13]:
import seaborn as sns
import matplotlib.pylab as plt
sns.set()
plt.rcParams['figure.figsize'] = (15, 9)
sns_plot = sns.heatmap(conf_df_ratio)
sns_plot.figure.savefig('confusion_matrix.png', width=20, height=9)
plt.show()

<matplotlib.figure.Figure at 0x7f27016e5048>

In [14]:
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_.tolist()))

                                     precision    recall  f1-score   support

                  Business Services       0.03      0.01      0.02      3228
                          Chemicals       0.01      0.01      0.01      3173
            Communication Equipment       0.68      0.36      0.47      2061
             Communication Services       0.60      0.47      0.53      2084
            Consumer Packaged Goods       0.56      0.43      0.49      4065
                 Drug Manufacturers       0.69      0.36      0.47      2162
                      Entertainment       0.05      0.07      0.06      3015
                 Financial Services       0.55      0.64      0.59     10457
                Industrial Products       0.52      0.35      0.42      5370
                          Insurance       0.41      0.22      0.29      3087
Manufacturing - Apparel & Furniture       0.02      0.01      0.01      3149
                            Medical       0.51      0.64      0.57      627

In [15]:
# model.save_weights('checkpoint/model_seventeenth_first.json')

In [16]:
class ClassPredictor:
    """
    Predicts the class of a stocks
    """
    def __init__(self, model, window_size, prediction_sample=10, mode_key='train'):
        self.model = model
        self.window_size = window_size
        self.mode_key = mode_key
        self.prediction_sample = prediction_sample

    def predict(self, stocks_data):
        model = self.model
        batch_df_data = [random_subset(stocks_data, self.window_size) for i in range(self.prediction_sample)]
        batch_data = np.array([df.values for df in batch_df_data])
        correlation = np.array([np.corrcoef(x.T + 0.001*np.random.randn(*x.T.shape))[0, 1:] for x in batch_data])
        
        return model.predict_on_batch([batch_data, correlation])

    def predict_class(self, keras_output):
        return pd.DataFrame(np.argmax(np.array(keras_output), 2)).apply(
            lambda s: np.argmax(s.value_counts()), 1)

class_predictor = ClassPredictor(model, 63, 25, 'predict')
df = stocks_data['test']
companies = companies_data['test']

res = []
for t in companies.ticker:
    res.append(class_predictor.predict(df.loc[t]))

predictions = class_predictor.predict_class(res)


ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 2 arrays: [array([[[ 0.01323392,  0.00413986,  0.00980785, ...,  0.00497035,
          0.00414119, -0.00735208],
        [ 0.00607064,  0.00198207, -0.00058673, ..., -0.00316818,
          0.00271987, -0.004593...

In [None]:
y_true = label_encoder.transform(companies.sector)
print(classification_report(y_true, predictions.values, target_names=label_encoder.classes_.tolist()))


In [None]:
stocks_data_all = pd.concat( [stocks_data['train'], stocks_data['dev'],
                              stocks_data['test']], 0)

companies_data_all = pd.concat([companies_data['train'], companies_data['dev'],
                           companies_data['test']], 0)

In [None]:

model.summary()
layer = model.get_layer('Embedding')
model_embedding = keras.models.Model(model.input, layer.output)

sequence_generator_infer = StocksSequence(
    stocks_data_all, companies_data_all, 63, label_encoder, 256, 'infer')

model_embedding.predict_generator(sequence_generator_infer, 10)

embedding_predictor = ClassPredictor(model_embedding, 63, 25, 'predict')

res = [embedding_predictor.predict(stocks_data_all.loc[t]) for t in companies_data_all.ticker]

average_embedding = np.mean(np.array(res), axis=1)




In [None]:
metadata_path = 'logs/embedding/metadata.tsv'
companies_data_all[['ticker', 'sector']].to_csv(metadata_path, sep='\t', index=False)

In [None]:
tf.reset_default_graph()
with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
    embedding_variable = tf.get_variable('stock_embedding', initializer=tf.constant(np.array(average_embedding)))

In [None]:
projector = tf.contrib.tensorboard.plugins.projector

## Running TensorFlow Session
with tf.Session() as sess:
    saver = tf.train.Saver([embedding_variable])
    sess.run(embedding_variable.initializer)
    saver.save(sess, os.path.join('logs/embedding', 'embedding_variable.ckpt'))
    config = projector.ProjectorConfig()
    # One can add multiple embeddings.
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_variable.name
    # Link this tensor to its metadata(Labels) file
    embedding.metadata_path = './metadata.tsv'
    # Saves a config file that TensorBoard will read during startup.

    projector.visualize_embeddings(tf.summary.FileWriter('logs/embedding'), config)


In [None]:
tsne_points = manifold.TSNE(perplexity=50, learning_rate=10, n_iter=1000).fit_transform(average_embedding)

In [None]:
tsne_df = pd.DataFrame(tsne_points, columns=['x', 'y'])
tsne_df['sector'] = companies_data_all.sector.values

In [None]:
plt.rcParams['figure.figsize'] = (15, 9)
sns_plot = sns.scatterplot(x='x', y='y', hue='sector', data=tsne_df)
plt.show()

In [None]:
sns_plot.figure.savefig('tsne.png')

# Analysis

Usually the clusert a well separated. Financial services and REITs are well
separated and easily identified. The model seems to have difficulty to
differentiate somes chemical companies as their embedding seems to be close to
some industrial production companies.

In general, the more companies we had in the raw dataset the more precise the
groups are.



In [None]:
# Silouhette in the embedding space
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

X = average_embedding
y = companies_data_all.sector

range_n_clusters = [len(label_encoder.classes_)]

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.2, 0.6])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    cluster_labels = label_encoder.transform(y)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, label_encoder.inverse_transform(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6])


plt.show()