In [1]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Build a TensorFlow session configuration that limits how much GPU memory
# this process may claim (presumably so the GPU can be shared with other
# jobs -- confirm), then register the resulting session as the Keras session.
config = tf.ConfigProto()
# Fraction of GPU memory this process is allowed to use: 0.1 == 10%.
config.gpu_options.per_process_gpu_memory_fraction = 0.1
set_session(tf.Session(config=config))

Using TensorFlow backend.


In [2]:
import functools
import numpy as np
import pandas as pd

from scipy.special import expit

import sklearn as sk
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from tensorflow import keras
import tensorflow as tf

def split_companies_train_dev_test(companies, test_size=0.1, dev_size=0.1, random_state=None):
    """Split ``companies`` into train, dev and test sets, stratified by sector.

    The test set is carved out of the full frame first; the dev set is then
    carved out of what remains, so ``dev_size`` is a fraction of the
    *remaining* rows, not of the original frame.

    Parameters
    ----------
    companies : DataFrame with a ``sector`` column used for stratification.
    test_size : fraction of ``companies`` held out as the test set.
    dev_size : fraction of the non-test rows held out as the dev set.
    random_state : forwarded to ``train_test_split``. ``None`` (the default)
        keeps the original behaviour of drawing from numpy's global RNG.

    Returns
    -------
    (train, dev, test)
    """
    train, test = train_test_split(
        companies,
        test_size=test_size,
        stratify=companies.sector,
        random_state=random_state,
    )
    train, dev = train_test_split(
        train,
        test_size=dev_size,
        stratify=train.sector,
        random_state=random_state,
    )
    return train, dev, test


def filter_stocks(stocks, tickers):
    """Return the rows of ``stocks`` whose index labels are in ``tickers``.

    This is a plain label-based ``.loc`` lookup on the index of ``stocks``.
    """
    selected = stocks.loc[tickers]
    return selected


def df_to_ts(df):
    """Return a copy of ``df`` indexed by its parsed ``date`` column.

    The ``date`` column is parsed with ``pd.to_datetime`` and installed as a
    ``DatetimeIndex``. The input frame is not modified.

    The ``date`` column is deliberately kept in the result: ``load_data``
    merges the returned frames on their shared columns and later drops
    ``date`` explicitly. (The original code called ``res.drop('date', axis=1)``
    but discarded the result, so the column was never actually removed; that
    no-op statement has been deleted.)
    """
    res = df.copy()
    res.index = pd.DatetimeIndex(pd.to_datetime(res['date']))
    return res


def log_softmax(x):
    """Log of the softmax of ``x``, taken over all of its elements.

    Computes ``x - log(sum(exp(x)))`` in a numerically stable way: the largest
    element is subtracted before exponentiating, so large logits no longer
    overflow ``np.exp`` and turn the whole result into ``-inf``.

    NaN inputs still produce an all-NaN result (``create_correlation_score``
    relies on this and skips such samples with ``np.nanmean``). An empty input
    returns an empty array.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x
    shift = np.max(x)
    # With a NaN or infinite maximum the shift would itself create NaNs;
    # fall back to the unshifted formula so those cases behave as before.
    if not np.isfinite(shift):
        shift = 0.0
    shifted = x - shift
    return shifted - np.log(np.sum(np.exp(shifted)))


def sigmoid(x):
    """Logistic sigmoid of ``x``, delegated to ``scipy.special.expit``."""
    activated = expit(x)
    return activated


def sample_correlation(df, window_size=63):
    """Correlate ``pct_return`` with the other columns over a random window.

    A contiguous window of ``window_size`` rows is drawn uniformly at random
    from ``df`` (using numpy's global RNG), and the correlation between the
    ``pct_return`` column and each column after the first is computed on it.

    Note: every column except the first one is used as a comparison series,
    so this assumes ``pct_return`` is the first column -- verify against the
    caller.

    Parameters
    ----------
    df : DataFrame containing a ``pct_return`` column.
    window_size : number of consecutive rows in the sampled window.

    Returns
    -------
    numpy array with one correlation per column after the first.

    Raises
    ------
    ValueError
        If ``df`` has fewer than ``window_size`` rows.
    """
    n_rows = df.shape[0]
    if n_rows < window_size:
        raise ValueError(
            "need at least %d rows to sample a window, got %d" % (window_size, n_rows)
        )
    # randint's upper bound is exclusive: the +1 makes the last full window
    # reachable and lets a frame of exactly `window_size` rows work.
    idx = np.random.randint(0, n_rows - window_size + 1)
    ts = df[idx:idx + window_size]
    target = ts['pct_return']
    indices = ts.columns.tolist()[1:]
    return np.array([target.corr(ts[name]) for name in indices])


def create_correlation_score(df, sample_size=1, window_size=63, temperature=0.05):
    """Average softmax score of the sampled correlations of ``df``.

    ``sample_size`` random windows are drawn with ``sample_correlation``; each
    correlation vector is divided by ``temperature`` and passed through
    ``log_softmax``. The log-scores are averaged element-wise with
    ``np.nanmean`` (so NaN samples are ignored) and exponentiated.

    Parameters
    ----------
    df : frame accepted by ``sample_correlation``.
    sample_size : number of random windows to average over.
    window_size : rows per window, forwarded to ``sample_correlation``
        (default matches that function's own default).
    temperature : softmax temperature; smaller values sharpen the scores.
        The default ``0.05`` is the value that used to be hard-coded.

    Returns
    -------
    numpy array with one score per correlation returned by
    ``sample_correlation``.
    """
    log_scores = np.array([
        log_softmax(sample_correlation(df, window_size=window_size) / temperature)
        for _ in range(sample_size)
    ])
    return np.exp(np.nanmean(log_scores, axis=0))


def load_data(stock_filename=None, indices_filename=None):
    """Load stock and sector-index returns and assemble the modelling data.

    Reads two CSV files, cleans the stock data, joins each stock observation
    with the sector-index values, and builds a sector label encoding.

    Parameters
    ----------
    stock_filename : path to the stocks CSV. Defaults to
        ``'../../data/processed/wiki_stocks_returns.csv'`` when ``None``.
    indices_filename : path to the indices CSV. Defaults to
        ``'../../data/processed/wiki_indices_returns.csv'`` when ``None``.

    Returns
    -------
    stocks_all : frame produced by ``groupby('ticker').apply(df_to_ts)`` with
        the ``ticker``, ``date`` and ``sector`` columns dropped and ``close``
        renamed to ``pct_return`` (presumably indexed by ticker and date, with
        one extra column per sector index -- verify).
    companies : one row per ticker (the first row of each ticker group).
    label_encoder : ``LabelEncoder`` fitted on the sector names.
    ticker_to_sector : dict mapping each ticker to its encoded sector.
    """

    if stock_filename is None:
        stock_filename = '../../data/processed/wiki_stocks_returns.csv'

    if indices_filename is None:
        indices_filename = '../../data/processed/wiki_indices_returns.csv'

    stocks = pd.read_csv(stock_filename, index_col=False) # long format
    indices = pd.read_csv(indices_filename, index_col=False) # wide format

    # Helper that drops the i-th column of a frame by position (default: the first).
    drop_column = lambda df,i=0: df.drop(df.columns[i], axis=1)

    # Remove the first column (presumably a saved row index -- verify), the
    # 'name' column, and every row containing a missing value.
    stocks = drop_column(stocks)
    stocks = stocks.drop('name', axis=1)
    stocks = stocks.dropna()

    # One row per ticker, then sector frequencies over those companies.
    companies = stocks.groupby('ticker').first().reset_index()
    sectors_counts = companies.sector.value_counts()
    # NOTE(review): sectors_proportions is not used anywhere in this function.
    sectors_proportions = sectors_counts/sectors_counts.sum()
    sectors_unique = sectors_counts.index.tolist()

    stocks = stocks.set_index('ticker')

    # Keep only the index columns whose names match a sector present in the stocks data.
    indices_ts = df_to_ts(indices[['date'] + sectors_unique])
    stocks_ts = df_to_ts(stocks.reset_index())

    # No `on=` is given, so pandas joins on the columns the two frames share
    # (presumably just 'date' -- verify).
    stocks_all = pd.merge(stocks_ts, indices_ts, 'left')
    stocks_all = stocks_all.dropna() # loss of 200 000 observations
    stocks_all = stocks_all.drop('sector', axis=1)
    # Re-index every ticker's rows by date, then drop the now redundant columns.
    stocks_all = stocks_all.groupby('ticker').apply(df_to_ts)
    stocks_all = stocks_all.drop(['ticker', 'date'], axis=1)
    stocks_all = stocks_all.rename(columns={'close': 'pct_return'})

    # Encode sector names as integers and map every ticker to its sector code.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(sectors_counts.index.tolist())
    ticker_to_sector = dict(zip(companies.ticker, label_encoder.transform(companies.sector)))

    return stocks_all, companies, label_encoder, ticker_to_sector

def sectors_statistics(companies):
    """Summarise how companies are distributed across sectors.

    Returns a tuple ``(counts, proportions, names)``: the number of companies
    per sector, the same counts divided by their total, and the sector names
    in the order used by ``counts``.
    """
    counts = companies.sector.value_counts()
    total = counts.sum()
    proportions = counts / total
    names = counts.index.tolist()
    return counts, proportions, names


def random_subset(df, window_size=21):
    """Return a random contiguous window of ``window_size`` rows of ``df``.

    The window start is drawn uniformly (numpy's global RNG) from every
    position at which a full window fits, including the last one.

    Raises
    ------
    ValueError
        If ``df`` has fewer than ``window_size`` rows.
    """
    n_rows = df.shape[0]
    if n_rows < window_size:
        raise ValueError(
            "need at least %d rows to sample a window, got %d" % (window_size, n_rows)
        )
    # randint's upper bound is exclusive: the +1 makes the last full window
    # reachable and lets a frame of exactly `window_size` rows work.
    idx = np.random.randint(0, n_rows - window_size + 1)
    return df[idx:idx + window_size]


In [3]:
# Make train dev test set.
# Seed numpy's global RNG so the random splits and sampled windows are repeatable.
np.random.seed(42)

### Feature engineering

stock_filename = '../data/processed/wiki_stocks_returns.csv'
indices_filename = '../data/processed/wiki_indices_returns.csv'

stocks_all, companies, label_encoder, ticker_to_sector = load_data(stock_filename, indices_filename)
sectors_counts, sectors_proportions, sectors_unique = sectors_statistics(companies)

# Majority-class baseline: always predicting the largest sector.
max_proportion_baseline = sectors_proportions.max()
# idxmax() returns the sector *label*; Series.argmax() returns an integer
# position in current pandas, which would print a number instead of a name.
biggest_sector = sectors_proportions.idxmax()

print("Most representated class:", biggest_sector, ', with proportion of ', round(100*max_proportion_baseline, 2), '%.')
# Accuracy of our models should be better than max_proportion_baseline.

# Company-level split, then the matching stock rows for each split.
data_split = split_companies_train_dev_test(companies)
companies_data = dict(zip(['train', 'dev', 'test'], data_split))
stocks_data = {k: filter_stocks(stocks_all, v.ticker) for k, v in companies_data.items()}


Most representated class: Financial Services , with proportion of  13.09 %.


In [4]:
### Correlation scores

def fmap(companie_ticker, dataset):
    """Correlation score for the rows of ``dataset`` labelled ``companie_ticker``."""
    ticker_rows = dataset.loc[companie_ticker]
    return create_correlation_score(ticker_rows)

def base_model(n_sample=10):
    """Baseline classifier: predict the sector whose index correlates best.

    For each of ``n_sample`` rounds and each split ('train', 'dev', 'test'),
    a correlation score is computed per ticker with ``fmap``; the predicted
    sector is the one with the highest score. Tickers whose scores contain
    NaN are dropped before scoring. Accuracies are averaged over the rounds.

    Reads the notebook-level ``companies_data``, ``stocks_data``,
    ``stocks_all``, ``label_encoder`` and ``ticker_to_sector``.

    Parameters
    ----------
    n_sample : number of random rounds to average over (default 10, the value
        that used to be hard-coded).

    Returns
    -------
    pandas Series of mean accuracy per split.

    Raises
    ------
    ValueError
        If ``n_sample`` is smaller than 1.
    """
    if n_sample < 1:
        raise ValueError("n_sample must be at least 1")

    dataset_types = ['train', 'dev', 'test']
    accuracies_df = pd.Series({name: 0 for name in dataset_types})

    for i in range(n_sample):
        if i % 5 == 0:
            print('Sample ', i)
        accuracies = {}
        for dataset_type in dataset_types:
            tickers = companies_data[dataset_type].ticker.tolist()
            fmap_partial = functools.partial(fmap, dataset=stocks_data[dataset_type])
            scores = dict(zip(tickers, map(fmap_partial, tickers)))
            # Rows are the sector-index columns, one column per ticker.
            scores = pd.DataFrame(scores, index=stocks_all.columns[1:])
            scores = scores.dropna(axis=1)
            # idxmax() gives the row *label* (sector name) of the best score per
            # ticker; np.argmax on a Series returns a position in current
            # pandas, which label_encoder.transform cannot handle.
            y_pred = label_encoder.transform(scores.idxmax())
            y_true = np.array([ticker_to_sector[k] for k in scores.columns])
            accuracies[dataset_type] = accuracy_score(y_true, y_pred)
        accuracies_df += pd.Series(accuracies)

    mean_accuracies = accuracies_df/n_sample
    print("Using sample correlation, we have the followin accuracies:\n", mean_accuracies)
    # Accuracies recorded from an earlier run:
    # dev      0.581667
    # test     0.607123
    # train    0.587883
    return mean_accuracies

accuracies_base = base_model()

Sample  0
Sample  5




Using sample correlation, we have the followin accuracies:
 dev      0.589394
test     0.591781
train    0.589370
dtype: float64
