# Algorithmic Market Efficiency - Prediction Analysis Code

In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.ticker as mtick
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

import pickle
import time
import datetime
from datetime import datetime
import warnings
import os
import sys

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from scipy.interpolate import UnivariateSpline
from statsmodels.regression.linear_model import OLS
from statsmodels.regression.rolling import RollingOLS
from scipy.stats.mstats import gmean
     
%config InlineBackend.figure_format = 'retina'


import pylatex
from pylatex import Document, Section, Subsection, Tabular, Math, TikZ, Axis
from pylatex import Plot, Figure, Matrix, Alignat, NoEscape, MultiColumn, MultiRow
from pylatex.utils import italic, NoEscape
pylatex.config.active = pylatex.config.Version2

# Show the working directory: every data/output path below is relative to it.
print("current directory is : ", os.getcwd(), sep="")

In [None]:
# Output locations (relative to the working directory) used by the rest of the notebook.
filepath = 'output/'
graphpath = 'graphs/'
tablespath = 'tables/'

# exist_ok=True creates the directory only when missing, without the
# check-then-create race of `if not isdir: mkdir`.
for _outdir in (filepath, graphpath, tablespath):
    os.makedirs(_outdir, exist_ok=True)

## Data code

### Data importation

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Long-format panel; the columns 'date', 'permno' and 'mktcap' are used below.
df = _load_pickle('data/df_mktvars')

# Sample descriptors (date vectors, identifiers, variable/factor names, returns panel, sizes).
dates, dtdates, permnos, vrs, facs, returns, T, N, V, F = _load_pickle('data/info')

# Risk-free rate and factor return series used in the alpha regressions.
rf, mkt_rf, smb, hml, rmw, cma, mom = _load_pickle('data/misc')

# Model outputs: `predictions` is indexed by method name, `scores` by (date, method, sample, metric).
predictions = _load_pickle('mloutput/predictions')
scores = _load_pickle('mloutput/scores')

In [None]:
# Wide panel of market capitalisations: one row per date, one column per permno.
mktcaps = df[['date', 'permno', 'mktcap']].pivot(index='date', columns='permno', values='mktcap')

# Share of each stock in the total market capitalisation of its date
# (missing caps are ignored when forming the row total).
capvalues = mktcaps.to_numpy()
mktcapshares = pd.DataFrame(
    capvalues / np.nansum(capvalues, axis=1, keepdims=True),
    index=mktcaps.index,
    columns=mktcaps.columns,
)

In [None]:
# Display labels of the prediction models, in the order used for every table and plot.
methodlabels = [
    'OLS', 'Lasso', 'Ridge', 'Enet', 'PCR', 'PLS',
    'Tree', 'Forest', 'GBRT',
    'NN1', 'NN2', 'NN3', 'NN5', 'NN10',
]
# The lookup keys into `predictions` / `scores` are the lower-cased labels.
methods = [label.lower() for label in methodlabels]
M = len(methods)

# Miscellaneous code

In [None]:
#Matplotlib utils:

def basefig(a, b):
    """Create an ``a`` x ``b`` inch figure with a single, minimally decorated axes.

    The top and right spines are hidden, ticks are drawn on the bottom/left
    only and the data margins are set to zero.

    Returns:
        The ``(fig, ax)`` pair.
    """
    fig = plt.figure(figsize=(a, b))
    ax = fig.gca()

    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.margins(x=0, y=0)

    return fig, ax

def labeledline(values, label, offset, ax, width=None, color=None, style=None):
    """Plot ``values`` against the global ``dtdates`` and label the line at its right end.

    Args:
        values: Series to plot; must support ``values[-1]`` for the last point.
        label: Text written next to the last point.
        offset: Vertical shift of the text, in points.
        ax: Axes to draw on.
        width, color, style: Forwarded to ``ax.plot`` as linewidth / color / linestyle.
    """
    ax.plot(dtdates, values, linewidth=width, color=color, linestyle=style)
    # The text is passed positionally: the `s=` keyword was renamed to `text`
    # in Matplotlib 3.3 and removed in 3.5, positional works on every version.
    ax.annotate(label, xy=(dtdates[-1], values[-1]), xytext=(5, offset),
                textcoords='offset points', va='center')

def labeledlines(valueslist, labellist, offsetlist, ax):
    """Draw one labelled line per entry of ``labellist``.

    The i-th label is paired with the series ``valueslist[methods[i]]`` and
    the vertical text offset ``offsetlist[i]``.
    """
    for position, text in enumerate(labellist):
        labeledline(valueslist[methods[position]], text, offsetlist[position], ax)
        
# Global Matplotlib styling shared by every figure in the notebook.
plt.rcParams.update({
    # Typography and spacing
    'font.sans-serif': ['Fira Sans'],
    'font.size': 12,
    'legend.fontsize': 'medium',
    'axes.titlepad': 25,
    'axes.labelpad': 10,
    # Ticks
    'xtick.major.pad': 4,
    'ytick.major.pad': 5,
    'xtick.major.width': 1,
    'ytick.major.width': 1,
    # Legend: opaque, square-cornered box with a black frame
    'legend.frameon': True,
    'legend.fancybox': False,
    'legend.framealpha': 1,
    'legend.edgecolor': 'black',
})

import seaborn as sns
# Make the 20-colour "tab20" palette the default colour cycle; the per-method
# colours defined further down are read from this cycle.
sns.set_palette("tab20")

import matplotlib.ticker as ticker

# `from datetime import datetime` at the top of the notebook binds the name
# `datetime` to the class, so `datetime.date(1980, 1, 1)` raises a TypeError when
# the cells run top to bottom. Use the module under an unambiguous alias instead.
import datetime as _dt

# Date windows (start, end).
after1980 = [_dt.date(1980, 1, 1), _dt.date(2016, 12, 1)]
from80to15 = [_dt.date(1980, 1, 1), _dt.date(2015, 12, 1)]

In [None]:
def labeledline(values, label, offset, ax, width=None, color=None, style=None, alpha=None):
    """Plot ``values`` against the global ``dtdates`` and label the line at its right end.

    Args:
        values: Series to plot; must support ``values[-1]`` for the last point.
        label: Text written next to the last point.
        offset: Vertical shift of the text, in points.
        ax: Axes to draw on.
        width, color, style, alpha: Forwarded to ``ax.plot`` as
            linewidth / color / linestyle / alpha.
    """
    ax.plot(dtdates, values, linewidth=width, color=color, linestyle=style, alpha=alpha)
    # The text is passed positionally: the `s=` keyword was renamed to `text`
    # in Matplotlib 3.3 and removed in 3.5, positional works on every version.
    ax.annotate(label, xy=(dtdates[-1], values[-1]), xytext=(5, offset),
                textcoords='offset points', va='center')

def labeledlines(valueslist, labellist, offsetlist, ax, alpha=None):
    """Draw one labelled, colour-coded line per entry of ``labellist``.

    The length of ``labellist`` decides which family of series is plotted:
    as many labels as ``methodlabels`` selects ``methods`` / ``methodcolors``,
    as many as ``fewmethodlabels`` selects ``fewmethods`` / ``fewmethodcolors``.
    Any other length only prints an error message and draws nothing.
    """
    if len(labellist) == len(methodlabels):
        keys, palette = methods, methodcolors
    elif len(labellist) == len(fewmethodlabels):
        keys, palette = fewmethods, fewmethodcolors
    else:
        print("Error: labellist length not reconised!")
        return

    for position, text in enumerate(labellist):
        labeledline(valueslist[keys[position]], text, offsetlist[position], ax,
                    width=None, color=palette[position], alpha=alpha)

In [None]:
import datetime
import itertools

# Mask over `methods` picking the reduced set of models used in less crowded plots.
fewmethods_idx = [True, False, False, False, False, False, True, True, False, True, True, True, False, False]

fewmethods = [name for name, keep in zip(methods, fewmethods_idx) if keep]
fewmethodlabels = [text for text, keep in zip(methodlabels, fewmethods_idx) if keep]
Mfew = len(fewmethods)

# One colour per method, taken from the active colour cycle.
colorcycle = plt.rcParams["axes.prop_cycle"].by_key()["color"]
methodcolors = colorcycle[:len(methodlabels)]
# The reduced set uses hand-picked colours rather than the matching entries of `methodcolors`.
fewmethodcolors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:pink']

In [None]:
def ma(sample, window_size=12*3, centered=False):
    """Return the rolling mean of ``sample`` as a NumPy array.

    Args:
        sample: One-dimensional sequence of numbers.
        window_size: Number of observations per window (default 36).
        centered: Centre the window on each point instead of ending at it.

    Positions whose window is incomplete are NaN.
    """
    series = pd.Series(sample)
    return series.rolling(window=window_size, center=centered).mean().to_numpy()

def movingaverage(sample, window_size=12*5, centered=True):
    """Return the rolling mean of ``sample`` as a NumPy array.

    Same computation as ``ma`` but with different defaults: a 60-observation
    window that is centred on each point. Positions whose window is
    incomplete are NaN.
    """
    windows = pd.Series(sample).rolling(window=window_size, center=centered)
    return windows.mean().to_numpy()
    
def dictma(sampledict, window_size=12*3, centered=False):
    """Apply ``ma`` to every entry of ``sampledict``.

    Returns a shallow copy of the container in which each value is replaced
    by its rolling mean; the input itself is not modified.
    """
    smoothed = sampledict.copy()
    for name in sampledict:
        smoothed[name] = ma(sampledict[name], window_size, centered)
    return smoothed

In [None]:
# Cross-sectional percentile rank (0-100] of every stock within its date row.
returns_pctile = 100 * returns.rank(pct=True, axis=1)
mktcaps_pctile = 100 * mktcaps.rank(pct=True, axis=1)

# Same transformation for the predictions of each model.
predictions_pctile = {
    name: 100 * predictions[name].rank(pct=True, axis=1)
    for name in methods
}

# Analyzing predictive performance

## Cross Section Analysis

In [None]:
# Calendar year of each observation. `dates` appears to hold YYYYMMDD integers
# (e.g. 20160129 is used as a key elsewhere), so dividing by 10000 and
# truncating to int leaves the year.
dates_years = (dates / 10000).astype('int')
# Sorted distinct years present in the sample.
years = np.unique(dates_years)

def intopercentile(x):
    """Convert the values of ``x`` to percentile ranks.

    Each value is replaced by ``100 * rank / top_rank`` where ranks come from
    ``scipy.stats.rankdata`` (ties share their average rank) and ``top_rank``
    is the largest rank among the non-NaN entries.

    Args:
        x: NumPy array of floats (boolean-mask indexing is applied to it).
    """
    ranks = sp.stats.rankdata(x)
    top_rank = np.max(sp.stats.rankdata(x[~np.isnan(x)]))
    return 100 * ranks / top_rank

In [None]:
# LaTeX table of R2 and Spearman rho per model for the 20160129 entry of `scores`.
table = Tabular('lccccc')

# Two-level header: sample names on top, metric names underneath.
table.add_row((
    '',
    MultiColumn(2, align='c', data='In-sample (2006-2015)'),
    '',
    MultiColumn(2, align='c', data='Out-of-sample (2016)'),
))
table.add_hline(2, 3)
table.add_hline(5, 6)
table.add_row(('', '   R2   ', '   rho   ', '', 'R2', 'rho'))

for method, label in zip(methods, methodlabels):

    # Order: test R2, test rho, out-of-sample R2, out-of-sample rho.
    r2_test, spearman_test, r2_oos, spearman_oos = [
        scores.loc[(20160129, method, sample, metric)]
        for sample in ('test', 'oos')
        for metric in ('R2', 'spearman')
    ]

    table.add_row((
        label,
        format(r2_test, '.2f'), format(spearman_test, '.2f'),
        '',
        format(r2_oos, '.2f'), format(spearman_oos, '.2f'),
    ))

table.generate_tex('tables/scores_2016')

In [None]:
# Three views of the same cross-section (last sample date, 3-layer network):
# raw vs raw, raw prediction vs return percentile, percentile vs percentile.
method='nn3'

fig, ax = plt.subplots(1, 3, figsize=(18,6))
ax = ax.flatten()

for axis in ax:
    axis.spines['right'].set_visible(False)
    axis.spines['top'].set_visible(False)

date = dates[-1]

prediction = predictions[method].loc[date].to_numpy().flatten()
currets = returns.loc[date].to_numpy().flatten()

# Drop a stock when EITHER value is missing. Filtering each array by its own
# NaN mask can leave arrays of different length, or pair a prediction with
# the return of another stock.
paired = ~(np.isnan(prediction) | np.isnan(currets))
prediction = prediction[paired]
currets = currets[paired]

alphaparam = 0.05

print(prediction.shape)
print(currets.shape)

# Percentile ranks of the realised returns, shared by the last two panels.
currets_pct = intopercentile(currets)

ax[0].plot(currets, prediction, 'ko', alpha=alphaparam)
ax[0].plot(np.sort(currets), np.zeros_like(prediction), 'k--')
# Vertical reference at zero return. axvline's ymin/ymax are axes fractions in
# [0, 1], not data values, so the full-height defaults are used.
ax[0].axvline(x=0, color='k', linestyle='--')
ax[0].set_xlabel('Actual Monthly Return')
ax[0].set_ylabel('Predicted Monthly Return')
ax[0].set_ylim([-0.25, 0.5])
ax[0].set_xlim([-0.25, 0.5])


ax[1].plot(currets_pct, prediction, 'ko', alpha=alphaparam)
ax[1].plot(np.sort(currets_pct), np.zeros_like(prediction), 'k--')
ax[1].set_xlabel('Actual Monthly Return, Percentile')
ax[1].set_ylabel('Predicted Monthly Return')
ax[1].set_ylim([-0.25, 0.5])

ax[2].plot(currets_pct, intopercentile(prediction), 'ko', alpha=alphaparam)
ax[2].set_xlabel('Actual Monthly Return, Percentile')
ax[2].set_ylabel('Predicted Monthly Return, Percentile')

fig.tight_layout(pad=2.0)
# NOTE(review): the month in the title is hard-coded while `date` is dates[-1].
fig.suptitle("Predicted and Actual Monthly Returns - 3-Layer Neural Network - December 2016", size=18, y=1.05)

plt.savefig(graphpath+'predictionsvsreturns_explanationplots.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
graphdates = [19800530, 19830729, 20001130, 20040528, 20151030, 20161230]


def predictionsvsreturnsplot(method, date, ax, alphaparam=0.01, title=""):
    """Scatter one model's predictions against the percentile of realised returns.

    Draws, for the cross-section at ``date``: the sorted realised returns,
    the individual predictions, their overall mean, and the mean prediction
    within each of 20 equal-width bins of the return percentile. R2 and
    Spearman rho between returns and predictions are shown in the title.

    Args:
        method: Key into ``predictions``.
        date: Row label of the cross-section in ``predictions`` / ``returns``.
        ax: Axes to draw on.
        alphaparam: Opacity of the scatter points.
        title: Model name placed in front of the statistics.
    """
    prediction = predictions[method].loc[date].to_numpy().flatten()
    currets = returns.loc[date].to_numpy().flatten()

    # Keep only stocks observed in BOTH series so the arrays stay paired;
    # masking each array by its own NaNs can misalign or mis-size them.
    paired = ~(np.isnan(prediction) | np.isnan(currets))
    prediction = prediction[paired]
    currets = currets[paired]

    currets_pct = intopercentile(currets)

    # Bin edges span the variable that is actually binned (the return percentile).
    binedges = np.linspace(np.min(currets_pct), np.max(currets_pct), 21)
    binpts = (binedges[1:] + binedges[:-1]) / 2

    means, _, _ = sp.stats.binned_statistic(currets_pct, prediction, 'mean', bins=binedges)

    R2 = r2_score(currets, prediction)
    spearman = sp.stats.spearmanr(currets, prediction)[0]

    ax.plot(np.sort(currets_pct), np.sort(currets), 'C2--', label="Actual monthly returns")
    ax.plot(currets_pct, prediction, 'ko', alpha=alphaparam)
    ax.plot(currets_pct, np.full_like(currets_pct, np.nanmean(prediction)), 'k-', label='Predicted returns: Overall average')
    ax.plot(binpts, means, 'C0o', label='Predicted returns: Vigintile avg.')
    ax.set_xlabel('Actual Monthly Return, Percentile')
    ax.set_ylabel('Predicted Monthly Return')
    ax.set_ylim([-0.5, 0.5])
    ax.set_title(title + ' - R2: ' + '{:.2f}'.format(R2) + ' - Spearman Rho: ' + '{:.2f}'.format(spearman))
    ax.legend(loc='upper left', frameon=False)


# One 2x2 figure per date, one panel per model.
for date in graphdates:

    fig, ax = plt.subplots(2, 2, figsize=(18,12))
    ax = ax.flatten()

    for axis in ax:
        axis.spines['right'].set_visible(False)
        axis.spines['top'].set_visible(False)

    predictionsvsreturnsplot("ols", date, ax[0], alphaparam=0.02, title="Ordinary Least Squares")

    predictionsvsreturnsplot("pcr", date, ax[1], alphaparam=0.02, title="Principal Components Regression")

    predictionsvsreturnsplot("forest", date, ax[2], alphaparam=0.02, title="Random Forest")

    predictionsvsreturnsplot("nn3", date, ax[3], alphaparam=0.02, title="3-Layer Neural Network")

    fig.tight_layout(pad=2.0)
    fig.suptitle("Predicted and Actual Monthly Returns - Cross-Section Analysis - " + str(date), size=18, y=1.02)

    plt.savefig(graphpath+'predictionsvsreturnsplot_'+str(date)+'.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
graphdates = [19800530, 19830729, 20001130, 20040528, 20151030, 20161230]


def returnsvspredictionsplot(method, date, ax, alphaparam=0.01, title=""):
    """Scatter realised returns against the percentile of one model's predictions.

    Draws, for the cross-section at ``date``: the sorted predictions, the
    individual realised returns, their overall mean, and the mean return
    within each of 20 equal-width bins of the prediction percentile. R2 and
    Spearman rho between returns and predictions are shown in the title.

    Args:
        method: Key into ``predictions``.
        date: Row label of the cross-section in ``predictions`` / ``returns``.
        ax: Axes to draw on.
        alphaparam: Opacity of the scatter points.
        title: Model name placed in front of the statistics.
    """
    prediction = predictions[method].loc[date].to_numpy().flatten()
    currets = returns.loc[date].to_numpy().flatten()

    # Keep only stocks observed in BOTH series so the arrays stay paired;
    # masking each array by its own NaNs can misalign or mis-size them.
    paired = ~(np.isnan(prediction) | np.isnan(currets))
    prediction = prediction[paired]
    currets = currets[paired]

    prediction_pct = intopercentile(prediction)

    # Bin edges span the variable that is actually binned (the prediction percentile).
    binedges = np.linspace(np.min(prediction_pct), np.max(prediction_pct), 21)
    binpts = (binedges[1:] + binedges[:-1]) / 2

    means, _, _ = sp.stats.binned_statistic(prediction_pct, currets, 'mean', bins=binedges)

    R2 = r2_score(currets, prediction)
    spearman = sp.stats.spearmanr(currets, prediction)[0]

    ax.plot(np.sort(prediction_pct), np.sort(prediction), 'C0--', label="Predicted monthly returns")
    ax.plot(prediction_pct, currets, 'ko', alpha=alphaparam)
    ax.plot(prediction_pct, np.full_like(prediction_pct, np.nanmean(currets)), 'k-', label='Actual returns: Overall average')
    # NOTE(review): the label mentions a standard deviation but only the bin means are drawn.
    ax.plot(binpts, means, 'C2o', label='Actual returns: Vigintile avg. (std. dev.)')
    ax.set_xlabel('Predicted Monthly Return, Percentile')
    ax.set_ylabel('Actual Monthly Return')
    ax.set_ylim([-0.3, 0.3])
    ax.set_title(title + ' - R2: ' + '{:.2f}'.format(R2) + ' - Spearman Rho: ' + '{:.2f}'.format(spearman))
    ax.legend(loc='upper left', frameon=False)


# One 2x2 figure per date, one panel per model.
for date in graphdates:

    fig, ax = plt.subplots(2, 2, figsize=(18,12))
    ax = ax.flatten()

    for axis in ax:
        axis.spines['right'].set_visible(False)
        axis.spines['top'].set_visible(False)

    returnsvspredictionsplot("ols", date, ax[0], alphaparam=0.02, title="Ordinary Least Squares")

    returnsvspredictionsplot("pcr", date, ax[1], alphaparam=0.02, title="Principal Components Regression")

    returnsvspredictionsplot("forest", date, ax[2], alphaparam=0.02, title="Random Forest")

    returnsvspredictionsplot("nn3", date, ax[3], alphaparam=0.02, title="3-Layer Neural Network")

    fig.tight_layout(pad=2.0)
    fig.suptitle("Predicted and Actual Monthly Returns - Cross-Section Analysis - " + str(date), size=18, y=1.02)

    plt.savefig(graphpath+'returnsvspredictionsplot_'+str(date)+'.png', dpi=300, bbox_inches='tight')
    plt.show()

## Time Series Analysis

In [None]:
def _filledscores(method, sample, metric):
    """Return one score series over all ``dates`` as a flat array, gaps forward-filled."""
    raw = scores.loc[(dates, method, sample, metric)].values
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    return pd.DataFrame(raw).ffill().to_numpy().flatten()


def scorestimeseries(method, titlelabel):
    """Plot the history of test and out-of-sample R2 / Spearman rho for one model.

    Each series is drawn faintly in raw form and again as a centred 3-year
    moving average. The figure is saved to
    ``graphpath + 'scorestimeseries_<method>.pdf'`` and then shown.

    Args:
        method: Key of the model in ``scores``.
        titlelabel: Title of the figure.
    """
    testR2_list = _filledscores(method, 'test', 'R2')
    testSpearman_list = _filledscores(method, 'test', 'spearman')
    oosR2_list = _filledscores(method, 'oos', 'R2')
    oosSpearman_list = _filledscores(method, 'oos', 'spearman')

    fig, ax = plt.subplots(1, figsize=(15,4))

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    # (series, line format, legend label): dashed = test sample, solid = out-of-sample.
    curves = [
        (testR2_list, 'C0--', 'Test R2 (3-year mov. avg.)'),
        (oosR2_list, 'C0', 'OOS R2 (3-year mov. avg.)'),
        (testSpearman_list, 'C2--', 'Test Spearman Rho (3-year mov. avg.)'),
        (oosSpearman_list, 'C2', 'OOS Spearman Rho (3-year mov. avg.)'),
    ]
    for series, fmt, label in curves:
        ax.plot(dtdates, series, fmt, alpha=0.2)
        ax.plot(dtdates, ma(series, 12*3, centered=True), fmt, label=label)

    # Horizontal guides: bold at 0 and 1, faint at every other quarter step.
    ax.plot(dtdates, np.zeros(len(dates)), 'k--', linewidth=2)
    ax.plot(dtdates, np.ones(len(dates)), 'k--', linewidth=2)
    for level in (-0.25, -0.5, -0.75, 0.25, 0.5, 0.75):
        ax.plot(dtdates, level*np.ones(len(dates)), 'k--', alpha=0.25, linewidth=1)

    ax.legend(frameon=False)
    ax.set_ylim([-1.1, 1.1])
    ax.set_title(titlelabel, size=16, y=0.95)
    fig.savefig(graphpath+'scorestimeseries_'+method+'.pdf')
    plt.show()
    
# Score histories for the four models highlighted throughout the analysis.
for _key, _title in (
    ('ols', "Ordinary Least Squares"),
    ('pcr', "Principal Components Regression"),
    ('forest', "Random Forest"),
    ('nn3', "3-Layer Neural Network"),
):
    scorestimeseries(_key, _title)

# ML-Portfolio Returns

## Strategy computation functions

In [None]:
def longstrategy(method, percentile, valueweighted=True):
    """Build the long-only portfolio holding the top ``percentile`` % of predictions.

    For every date, stocks whose prediction is at or above the
    ``100 - percentile`` cross-sectional percentile receive weight 1, other
    stocks with an observed return receive 0. Weights are optionally
    multiplied by market capitalisation and then scaled so the absolute
    weights of each date sum to one.

    Args:
        method: Key into ``predictions``.
        percentile: Size of the selected tail, in percent.
        valueweighted: Weight by ``mktcaps`` instead of equally.

    Returns:
        DataFrame of weights labelled like ``returns``.

    Note: silences all warnings globally via ``warnings.filterwarnings``.
    """
    warnings.filterwarnings('ignore')

    forecast = predictions[method].copy()
    cutoff = np.nanpercentile(forecast, 100 - percentile, axis=1, keepdims=True)

    # Start from 0 wherever a return exists (NaN elsewhere), then flag the selected stocks.
    weights = returns.copy()
    weights[~np.isnan(weights)] = 0
    weights[forecast >= cutoff] = 1

    if valueweighted:
        weights = weights * mktcaps.values

    weights = weights / np.nansum(abs(weights), axis=1, keepdims=True)

    weights = pd.DataFrame(weights)
    weights.index = returns.index
    weights.columns = returns.columns

    return weights

In [None]:
def longshortstrategy(method, percentile, valueweighted=True):
    """Build the long-short portfolio on the two ``percentile`` % prediction tails.

    For every date, stocks strictly above the ``100 - percentile``
    cross-sectional percentile of predictions get +1, stocks strictly below
    the ``percentile`` one get -1, other stocks with an observed return get 0.
    Weights are optionally multiplied by market capitalisation and scaled so
    the absolute weights of each date sum to two.

    Args:
        method: Key into ``predictions``.
        percentile: Size of each tail, in percent.
        valueweighted: Weight by ``mktcaps`` instead of equally.

    Returns:
        DataFrame of weights labelled like ``returns``.

    Note: silences all warnings globally via ``warnings.filterwarnings``.
    """
    warnings.filterwarnings('ignore')

    forecast = predictions[method].copy().to_numpy()
    upper = np.nanpercentile(forecast, 100 - percentile, axis=1, keepdims=True)
    lower = np.nanpercentile(forecast, percentile, axis=1, keepdims=True)

    # 0 wherever a return exists (NaN elsewhere), then mark the long and short legs.
    weights = returns.copy().to_numpy()
    weights[~np.isnan(weights)] = 0
    weights[forecast > upper] = 1
    weights[forecast < lower] = -1

    if valueweighted:
        weights = weights * mktcaps.values

    # Half of the gross exposure, so that the scaled absolute weights sum to 2.
    halfgross = np.nansum(abs(weights), axis=1, keepdims=True) / 2

    weights = pd.DataFrame(weights / halfgross)
    weights.index = returns.index
    weights.columns = returns.columns

    return weights

In [None]:
def rankstrategy(method, valueweighted=True):
    """Build a long-short portfolio with weights proportional to centred prediction ranks.

    Per date, predictions are ranked in ascending order (missing predictions
    are ranked last), the rank is centred by subtracting ``(n + 1) / 2`` with
    ``n`` the number of observed returns, and stocks without a return are
    blanked out. Weights are optionally multiplied by market capitalisation
    and scaled so the absolute weights of each date sum to two.

    Args:
        method: Key into ``predictions``.
        valueweighted: Weight by ``mktcaps`` instead of equally.

    Returns:
        DataFrame of weights labelled like ``returns``.

    Note: silences all warnings globally via ``warnings.filterwarnings``.
    """
    warnings.filterwarnings('ignore')

    # Number of stocks with an observed return on each date.
    numberofstocks = np.count_nonzero(~np.isnan(returns.to_numpy()), axis=1)

    forecast = predictions[method].copy().to_numpy()
    forecast[np.isnan(forecast)] = np.inf  # push missing predictions to the end of the ordering

    # argsort of argsort turns the sort order into 0-based ranks; +1 makes them 1-based.
    ordering = np.argsort(forecast, axis=1)
    ranks = np.argsort(ordering, axis=1) + 1

    # Transposing lets the per-date midpoint broadcast along the stock axis.
    weights = (ranks.T - ((numberofstocks + 1) / 2)).T
    weights[np.isnan(returns)] = np.nan

    if valueweighted:
        weights = weights * mktcaps.values

    halfgross = np.nansum(abs(weights), axis=1, keepdims=True) / 2

    weights = pd.DataFrame(weights / halfgross)
    weights.index = returns.index
    weights.columns = returns.columns

    return weights

## Return, Sharpe ratio and alpha computation functions

In [None]:
def strategyreturns(strategy):
    """Compute the return series of a portfolio given its weights.

    Args:
        strategy: Weights with the same shape as the global ``returns``
            (array or DataFrame; converted to a NumPy array).

    Returns:
        Tuple ``(yearrets, compoundrets, excessrets, totalret)``:
        per-period portfolio returns (NaN products are ignored), their
        cumulative gross product, the per-period return in excess of the
        equal-weighted cross-sectional mean, and the total return over the
        whole sample. Despite the names, one entry corresponds to one row of
        ``returns``.
    """
    strategy = np.asarray(strategy)

    indrets = np.multiply(strategy, returns)

    yearrets = np.nansum(indrets, axis=1)

    excessrets = yearrets - np.nanmean(returns, axis=1)

    # np.cumproduct was deprecated in NumPy 1.25 and removed in 2.0; cumprod is the same function.
    compoundrets = np.cumprod(yearrets + 1)

    totalret = compoundrets[-1] - 1

    return yearrets, compoundrets, excessrets, totalret

In [None]:
def sharperatios(yearrets, n_periods):
    """Rolling Sharpe ratio of a return series, scaled by sqrt(12).

    Args:
        yearrets: Per-period returns, aligned with ``rf``.
        n_periods: Length of the rolling window, in observations.

    Returns:
        Array of rolling mean over rolling standard deviation of the returns
        in excess of ``rf``, multiplied by sqrt(12); NaN until the first
        window is complete.
    """
    excess = pd.DataFrame(yearrets - rf.values)
    windows = excess.rolling(n_periods)

    ratio = np.array(windows.mean()[0] / windows.std()[0])

    return np.sqrt(12) * ratio

In [None]:
def capmalphas(yearrets, n_periods):
    """Rolling CAPM alpha: intercept of excess returns regressed on ``mkt_rf``.

    Args:
        yearrets: Per-period returns, aligned with ``rf`` and ``mkt_rf``.
        n_periods: Length of the rolling regression window, in observations.

    Returns:
        Array with the estimated constant of every window.
    """
    regressors = sm.add_constant(mkt_rf)
    fitted = RollingOLS(yearrets - rf, regressors, window=n_periods).fit()

    return np.array(fitted.params['const'])

In [None]:
def ff3alphas(yearrets, n_periods):
    """Rolling three-factor alpha: intercept of excess returns on ``mkt_rf``, ``smb``, ``hml``.

    Args:
        yearrets: Per-period returns, aligned with ``rf`` and the factor series.
        n_periods: Length of the rolling regression window, in observations.

    Returns:
        Array with the estimated constant of every window.
    """
    factors = sm.add_constant(pd.concat([mkt_rf, smb, hml], axis=1))
    fitted = RollingOLS(yearrets - rf, factors, window=n_periods).fit()

    return np.array(fitted.params['const'])

In [None]:
def avgret(yearrets, startdate=19580131, enddate=20161230):
    """Geometric mean per-period return between two dates.

    Args:
        yearrets: Per-period returns aligned with the global ``dates``.
        startdate: First date of the window (must be present in ``dates``).
        enddate: End of the window (must be present in ``dates``).

    NOTE(review): the window is a half-open slice, so the observation at
    ``enddate`` itself is not included.
    """
    calendar = dates.tolist()
    window = yearrets[calendar.index(startdate):calendar.index(enddate)]

    return np.prod(1 + window) ** (1 / len(window)) - 1

In [None]:
def sharperatio(yearrets, startdate=19580131, enddate=20161230):
    """Sharpe ratio of a return series between two dates, scaled by sqrt(12).

    The ratio is the mean over the (population) standard deviation of the
    returns in excess of ``rf``.

    NOTE(review): the window is a half-open slice, so the observation at
    ``enddate`` itself is not included.
    """
    calendar = dates.tolist()
    first, last = calendar.index(startdate), calendar.index(enddate)

    excess = (yearrets - rf.values)[first:last]

    return np.sqrt(12) * (np.mean(excess) / np.std(excess))

In [None]:
def capmalpha(yearrets, startdate=19580131, enddate=20161230):
    """CAPM alpha and its t-statistic between two dates.

    Regresses returns in excess of ``rf`` on a constant and ``mkt_rf`` by OLS
    with HAC standard errors (120 lags).

    Returns:
        ``(alpha, t_alpha)``: the estimated constant and its t-value.

    NOTE(review): the window is a half-open slice, so the observation at
    ``enddate`` itself is not included.
    """
    calendar = dates.tolist()
    window = slice(calendar.index(startdate), calendar.index(enddate))

    excess = (yearrets - rf.values)[window]
    regressors = sm.add_constant(mkt_rf[window])

    fit = OLS(excess, regressors).fit(cov_type='HAC', cov_kwds={'maxlags': 12*10})

    return fit.params['const'], fit.tvalues['const']

In [None]:
def ff3alpha(yearrets, startdate=19580131, enddate=20161230):
    """Three-factor alpha and its t-statistic between two dates.

    Regresses returns in excess of ``rf`` on a constant, ``mkt_rf``, ``smb``
    and ``hml`` by OLS with HAC standard errors (120 lags).

    Returns:
        ``(alpha, t_alpha)``: the estimated constant and its t-value.

    NOTE(review): the window is a half-open slice, so the observation at
    ``enddate`` itself is not included.
    """
    calendar = dates.tolist()
    window = slice(calendar.index(startdate), calendar.index(enddate))

    excess = (yearrets - rf.values)[window]
    regressors = pd.concat([factor[window] for factor in (mkt_rf, smb, hml)], axis=1)
    regressors = sm.add_constant(regressors)

    fit = OLS(excess, regressors).fit(cov_type='HAC', cov_kwds={'maxlags': 12*10})

    return fit.params['const'], fit.tvalues['const']

In [None]:
def ch4alpha(yearrets, startdate=19580131, enddate=20161230):
    """Four-factor alpha and its t-statistic between two dates.

    Regresses returns in excess of ``rf`` on a constant, ``mkt_rf``, ``smb``,
    ``hml`` and ``mom`` by OLS with HAC standard errors (120 lags).

    Returns:
        ``(alpha, t_alpha)``: the estimated constant and its t-value.

    NOTE(review): the window is a half-open slice, so the observation at
    ``enddate`` itself is not included.
    """
    calendar = dates.tolist()
    window = slice(calendar.index(startdate), calendar.index(enddate))

    excess = (yearrets - rf.values)[window]
    regressors = pd.concat([factor[window] for factor in (mkt_rf, smb, hml, mom)], axis=1)
    regressors = sm.add_constant(regressors)

    fit = OLS(excess, regressors).fit(cov_type='HAC', cov_kwds={'maxlags': 12*10})

    return fit.params['const'], fit.tvalues['const']

In [None]:
def ff5alpha(yearrets, startdate=19580131, enddate=20161230):
    """Five-factor alpha and its t-statistic between two dates.

    Regresses returns in excess of ``rf`` on a constant, ``mkt_rf``, ``smb``,
    ``hml``, ``rmw`` and ``cma`` by OLS with HAC standard errors (120 lags).

    Returns:
        ``(alpha, t_alpha)``: the estimated constant and its t-value.

    NOTE(review): the window is a half-open slice, so the observation at
    ``enddate`` itself is not included.
    """
    calendar = dates.tolist()
    window = slice(calendar.index(startdate), calendar.index(enddate))

    excess = (yearrets - rf.values)[window]
    regressors = pd.concat(
        [factor[window] for factor in (mkt_rf, smb, hml, rmw, cma)],
        axis=1,
    )
    regressors = sm.add_constant(regressors)

    fit = OLS(excess, regressors).fit(cov_type='HAC', cov_kwds={'maxlags': 12*10})

    return fit.params['const'], fit.tvalues['const']

In [None]:
def maxLoss(yearrets, startdate=19580131, enddate=20161230):
    """Smallest (most negative) single-period return between two dates.

    NOTE(review): the window is a half-open slice, so the observation at
    ``enddate`` itself is not included.
    """
    calendar = dates.tolist()
    first, last = calendar.index(startdate), calendar.index(enddate)

    return np.min(yearrets[first:last])

In [None]:
def maxDD(compoundrets, startdate=19580131, enddate=20161230):
    """Maximum drawdown of a cumulative-return path between two dates.

    The drawdown at each point is the path divided by its running maximum,
    minus one; the most negative value is returned.

    NOTE(review): the window is a half-open slice, so the observation at
    ``enddate`` itself is not included.
    """
    calendar = dates.tolist()
    first, last = calendar.index(startdate), calendar.index(enddate)

    wealth = pd.DataFrame(compoundrets[first:last])
    drawdown = wealth / wealth.cummax() - 1.0

    return np.min(drawdown.values)

In [None]:
def turnover(strategy, startdate=19580131, enddate=20161230):
    """Average per-period turnover of a portfolio between two dates (inclusive).

    Turnover of period t is ``sum_i |w_{i,t+1} - w_{i,t} * (1 + r_{i,t})|``
    divided by ``sum_i |w_{i,t}|``: the trades needed to move from the
    weights held during t, after they drifted with the period's returns, to
    the weights of the next period.

    Args:
        strategy: DataFrame of weights labelled like the global ``returns``.
        startdate: First row label of the window.
        enddate: Last row label of the window.

    Returns:
        Mean of the per-period turnover ratios, ignoring NaN periods
        (periods without any position). Infinite ratios are set to 2.
    """
    strategy_t = strategy.loc[startdate:enddate]

    # shift(-1) places the NEXT period's weights on row t. The original
    # shift(1) put the PREVIOUS weights there, so the drifted portfolio was
    # compared with the one held before it rather than the one replacing it.
    # The last row has no successor and is treated as a full liquidation.
    strategy_tp1 = strategy_t.shift(-1)

    # Missing weights mean "no position".
    strategy_t = strategy_t.fillna(0)
    strategy_tp1 = strategy_tp1.fillna(0)

    drifted = strategy_t.values * (1 + returns.loc[startdate:enddate].values)
    indturnover = abs(strategy_tp1.values - drifted)
    monthlyturnover = np.nansum(indturnover, axis=1)

    monthlyturnover_pct = monthlyturnover / np.nansum(abs(strategy_t), axis=1)
    monthlyturnover_pct[np.isinf(monthlyturnover_pct)] = 2

    avgturnover_pct = np.nanmean(monthlyturnover_pct)

    return avgturnover_pct

## Market Performance

In [None]:
# Equal-weighted market: cross-sectional mean return of each period.
avgyearrets_ew = np.nanmean(returns.to_numpy(), axis=1)
# Excess over the equal-weighted mean itself, i.e. zero by construction.
avgexcessrets_ew = avgyearrets_ew - np.nanmean(returns.to_numpy(), axis=1)
# np.cumproduct was deprecated in NumPy 1.25 and removed in 2.0; cumprod is the same function.
avgcompoundrets_ew = np.cumprod(avgyearrets_ew + 1)
avgtotalret_ew = avgcompoundrets_ew[-1] - 1

# Value-weighted market: capitalisation-weighted mean return of each period.
valueweightedindrets = np.multiply(returns.to_numpy(), mktcaps.to_numpy())
avgyearrets = np.nansum(valueweightedindrets, axis=1) / np.nansum(mktcaps.to_numpy(), axis=1)
avgcompoundrets = np.cumprod(avgyearrets + 1)
avgtotalret = avgcompoundrets[-1] - 1


def marketaveragesfrom(startdate):
    """Summary statistics of the value-weighted market from ``startdate`` to 20161230.

    Returns:
        A 12-tuple: average return and Sharpe ratio, then (alpha, t-stat)
        pairs for the CAPM, FF3, CH4 and FF5 models, all formatted as
        two-decimal strings (returns and alphas in percent), followed by the
        unformatted maximum single-period loss and maximum drawdown.
    """
    end = 20161230

    avgar = format(100 * avgret(avgyearrets, startdate, enddate=end), '.2f')
    avgsr = format(sharperatio(avgyearrets, startdate, enddate=end), '.2f')

    # (alpha, t-stat) pairs, flattened in model order.
    alphastats = []
    for model in (capmalpha, ff3alpha, ch4alpha, ff5alpha):
        alpha, t_alpha = model(avgyearrets, startdate, enddate=end)
        alphastats.append(format(100 * alpha, '.2f'))
        alphastats.append(format(t_alpha, '.2f'))

    avgmaxloss = maxLoss(avgyearrets, startdate, enddate=end)
    avgmaxDD = maxDD(avgcompoundrets, startdate, enddate=end)

    return (avgar, avgsr, *alphastats, avgmaxloss, avgmaxDD)


# Market summary statistics for the three sample windows. Each window writes to
# its own `avgar..` name: previously the 1980 and 2000 results overwrote
# `avgar58`, leaving `avgar80` and `avgar00` undefined.
(avgar58, avgsr58, avgcapma58, t_avgcapma58, avgff3a58, t_avgff3a58,
 avgch4a58, t_avgch4a58, avgff5a58, t_avgff5a58,
 avgmaxloss58, avgmaxDD58) = marketaveragesfrom(19580131)

(avgar80, avgsr80, avgcapma80, t_avgcapma80, avgff3a80, t_avgff3a80,
 avgch4a80, t_avgch4a80, avgff5a80, t_avgff5a80,
 avgmaxloss80, avgmaxDD80) = marketaveragesfrom(19800131)

(avgar00, avgsr00, avgcapma00, t_avgcapma00, avgff3a00, t_avgff3a00,
 avgch4a00, t_avgch4a00, avgff5a00, t_avgff5a00,
 avgmaxloss00, avgmaxDD00) = marketaveragesfrom(20000131)


# Rolling benchmarks for the value-weighted market over 36- and 120-observation windows.
avgsharperatios_3y, avgsharperatios_10y = [sharperatios(avgyearrets, 12 * span) for span in (3, 10)]

avgcapmalphas_3y, avgcapmalphas_10y = [capmalphas(avgyearrets, 12 * span) for span in (3, 10)]

avgff3alphas_3y, avgff3alphas_10y = [ff3alphas(avgyearrets, 12 * span) for span in (3, 10)]



## Top Decile Long Strategy Performance

In [None]:
# Result containers keyed by method. Series-valued entries start as the (shared)
# all-NaN vector, the others as a NaN scalar; each is overwritten in the loop below.
nanseries = np.full(T, np.nan)

yearrets_l10 = dict.fromkeys(methods, nanseries)
compoundrets_l10 = dict.fromkeys(methods, np.nan)
excessrets_l10 = dict.fromkeys(methods, nanseries)
totalret_l10 = dict.fromkeys(methods, np.nan)

yearrets_ew_l10 = dict.fromkeys(methods, nanseries)
compoundrets_ew_l10 = dict.fromkeys(methods, nanseries)
excessrets_ew_l10 = dict.fromkeys(methods, nanseries)
totalret_ew_l10 = dict.fromkeys(methods, np.nan)

sharperatios_3y_l10 = dict.fromkeys(methods, np.nan)
sharperatios_10y_l10 = dict.fromkeys(methods, np.nan)

capmalphas_3y_l10 = dict.fromkeys(methods, np.nan)
capmalphas_10y_l10 = dict.fromkeys(methods, np.nan)

ff3alphas_3y_l10 = dict.fromkeys(methods, np.nan)
ff3alphas_10y_l10 = dict.fromkeys(methods, np.nan)


for method in methods:

    print(method)

    # Value-weighted top-decile portfolio.
    strategy = longstrategy(method, 10)
    (yearrets_l10[method], compoundrets_l10[method],
     excessrets_l10[method], totalret_l10[method]) = strategyreturns(strategy)

    # Equal-weighted counterpart.
    ewstrategy = longstrategy(method, 10, valueweighted=False)
    (yearrets_ew_l10[method], compoundrets_ew_l10[method],
     excessrets_ew_l10[method], totalret_ew_l10[method]) = strategyreturns(ewstrategy)

    # Rolling statistics of the value-weighted returns, 3- and 10-year windows.
    for compute, store_3y, store_10y in (
        (sharperatios, sharperatios_3y_l10, sharperatios_10y_l10),
        (capmalphas, capmalphas_3y_l10, capmalphas_10y_l10),
        (ff3alphas, ff3alphas_3y_l10, ff3alphas_10y_l10),
    ):
        store_3y[method] = compute(yearrets_l10[method], 12 * 3)
        store_10y[method] = compute(yearrets_l10[method], 12 * 10)

print("All done!")

In [None]:
def tablefrom(startdate):
    """Build the LaTeX performance table of the top-decile long strategies.

    One row per method with: average return, Sharpe ratio, average turnover,
    maximum loss, maximum drawdown, and alpha with t-statistic for the CAPM,
    FF3, CH4 and FF5 models, all computed from ``startdate`` to 20161230.

    Args:
        startdate: First date of the window, as found in ``dates``.

    Returns:
        The populated ``Tabular`` object.
    """
    end = 20161230

    table = Tabular('lccccccccccccc')

    table.add_row(('', MultiColumn(13, align='c', data=str(int(startdate/10000))+'-2016')))
    table.add_hline(2, 14)

    # Two-line column header; every '(t-stat)' cell spans both lines.
    header = ['', 'Avg.', 'Sharpe', 'Avg.', 'Max.', 'Max.']
    for model in ('CAPM-', 'FF3-', 'CH4-', 'FF5-'):
        header.append(model)
        header.append(MultiRow(2, data='(t-stat)'))
    table.add_row(tuple(header))
    table.add_row(('', 'Ret.', 'Ratio', 'Turn.', 'Loss', 'DD', 'alpha', '', 'alpha', '', 'alpha', '', 'alpha', ''))

    for method, label in zip(methods, methodlabels):

        rets = yearrets_l10[method]

        ar = format(100 * avgret(rets, startdate=startdate, enddate=end), '.2f')
        sr = format(sharperatio(rets, startdate=startdate, enddate=end), '.2f')

        Max_Loss = format(100 * maxLoss(rets, startdate=startdate, enddate=end), '.1f')
        Max_Drawdown = format(100 * maxDD(compoundrets_l10[method], startdate=startdate, enddate=end), '.1f')

        # The weights are rebuilt here because only the returns were stored.
        Avg_Turnover = format(100 * turnover(longstrategy(method, 10), startdate=startdate, enddate=end), '.0f')

        # alpha (in percent) followed by its t-statistic in parentheses, per model.
        alphacells = []
        for model in (capmalpha, ff3alpha, ch4alpha, ff5alpha):
            alpha, t_alpha = model(rets, startdate=startdate, enddate=end)
            alphacells.append(format(100 * alpha, '.2f'))
            alphacells.append('(' + format(t_alpha, '.2f') + ')')

        table.add_row((label, ar, sr, Avg_Turnover, Max_Loss, Max_Drawdown, *alphacells))

    return table

# Write one performance table per sample window.
for _start, _texpath in (
    (19580131, 'tables/l10_from58'),
    (19800131, 'tables/l10_from80'),
    (20000131, 'tables/l10_from00'),
):
    table = tablefrom(_start)
    table.generate_tex(_texpath)

print("Very nice!")


In [None]:
# Top-decile long strategies: four figures produced by one shared recipe.
# Each entry holds
#   (series per method, line labels, label offsets,
#    (market series, market label offset) or None,
#    draw a zero line?, y-scale or None, title, output file name).
# Empty labels mean the line shares the label of a neighbouring line.
_l10_figures = [
    # Equally-weighted compound returns
    (compoundrets_ew_l10,
     ['OLS, Lasso, Ridge, Enet', '', '', '', 'PCR, PLS', '','Tree', '', 'GBRT', 'NN1, Forest', 'NN2, NN3', '', 'NN5, NN10', ''],
     [0, 0, 0, 0, 5, 0, 0, 7, -12, 0, 0, 0, 3, 0],
     (avgcompoundrets, 0), False, 'log',
     "Equally-weighted Compound Returns - Top Decile Long Strategies",
     'compoundrets_ew_l10.pdf'),
    # Value-weighted compound returns
    (compoundrets_l10,
     ['OLS', 'Lasso, Ridge, Enet', '', '', 'PCR', 'PLS', '', 'Forest', 'GBRT, NN10', 'NN1', 'NN2', 'NN3', 'NN5, Tree', ''],
     [2, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, 0, 0, 0],
     (avgcompoundrets, -3), False, 'log',
     "Value-weighted Compound Returns - Top Decile Long Strategies",
     'compoundrets_l10.pdf'),
    # Ten-year Sharpe ratios
    (sharperatios_10y_l10,
     ['OLS, Lasso, Ridge, Enet', '', '', '', 'PCR', 'PLS', 'Tree, GBRT', 'Forest', '', 'NN1', 'NN2', 'NN3', 'NN5, NN10', ''],
     [0, 0, 0, 0, 0, 0, 0, 0, 0, -5, 0, 0, 0, 0],
     (avgsharperatios_10y, -3), True, None,
     "10-Year Sharpe Ratios - Top Decile Long Strategies",
     'sharperatios_10y_l10.pdf'),
    # Ten-year FF3 alphas (no market benchmark line)
    (ff3alphas_10y_l10,
     ['OLS, Lasso, Ridge, Enet', '', '', '', 'PCR', 'PLS', 'Tree', 'Forest', 'GBRT', 'NN1', 'NN2', 'NN3', 'NN5, NN10', ''],
     [3, 0, 0, 0, -5, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     None, True, None,
     "10-Year FF3 Alphas - Top Decile Long Strategies",
     'ff3alphas_10y_l10.pdf'),
]

for (_series, methodlabels_, offsets, _market, _zeroline, _yscale, _title, _pdfname) in _l10_figures:
    fig, ax = basefig(8,3)
    labeledlines(_series, methodlabels_, offsets, ax)
    if _market is not None:
        # Dotted black benchmark line.
        labeledline(_market[0], "Market average", _market[1], ax, width=2, color='k', style=':')
    if _zeroline:
        plt.plot(dtdates, np.zeros_like(dates), 'k')
    if _yscale is not None:
        plt.yscale(_yscale)
    plt.title(_title)
    plt.savefig(graphpath+_pdfname, bbox_inches='tight')
    plt.show()

## Top/Bottom Decile Long/Short Strategy Performance

In [None]:
# Result containers for the top/bottom-decile long/short strategies, keyed by
# method. The placeholder values are all overwritten in the loop below.
nanseries = np.full(T, np.nan)

# Value-weighted portfolios
yearrets_ls10 = dict.fromkeys(methods, nanseries)
compoundrets_ls10 = dict.fromkeys(methods, np.nan)
excessrets_ls10 = dict.fromkeys(methods, nanseries)
totalret_ls10 = dict.fromkeys(methods, np.nan)

# Equally-weighted portfolios
yearrets_ew_ls10 = dict.fromkeys(methods, nanseries)
compoundrets_ew_ls10 = dict.fromkeys(methods, nanseries)
excessrets_ew_ls10 = dict.fromkeys(methods, nanseries)
totalret_ew_ls10 = dict.fromkeys(methods, np.nan)

# Rolling statistics over 36- and 120-period windows
sharperatios_3y_ls10 = dict.fromkeys(methods, np.nan)
sharperatios_10y_ls10 = dict.fromkeys(methods, np.nan)

capmalphas_3y_ls10 = dict.fromkeys(methods, np.nan)
capmalphas_10y_ls10 = dict.fromkeys(methods, np.nan)

ff3alphas_3y_ls10 = dict.fromkeys(methods, np.nan)
ff3alphas_10y_ls10 = dict.fromkeys(methods, np.nan)


for method in methods:

    # Value-weighted long/short portfolio on the extreme deciles.
    strategy = longshortstrategy(method, 10)
    (yearrets_ls10[method],
     compoundrets_ls10[method],
     excessrets_ls10[method],
     totalret_ls10[method]) = strategyreturns(strategy)

    # Same portfolio with equal weights.
    ewstrategy = longshortstrategy(method, 10, valueweighted=False)
    (yearrets_ew_ls10[method],
     compoundrets_ew_ls10[method],
     excessrets_ew_ls10[method],
     totalret_ew_ls10[method]) = strategyreturns(ewstrategy)

    # Rolling statistics use the value-weighted return series only.
    _rets = yearrets_ls10[method]

    sharperatios_3y_ls10[method] = sharperatios(_rets, 36)
    sharperatios_10y_ls10[method] = sharperatios(_rets, 120)

    capmalphas_3y_ls10[method] = capmalphas(_rets, 36)
    capmalphas_10y_ls10[method] = capmalphas(_rets, 120)

    ff3alphas_3y_ls10[method] = ff3alphas(_rets, 36)
    ff3alphas_10y_ls10[method] = ff3alphas(_rets, 120)

print("All done!")

In [None]:
def tablefrom(startdate):
    """Build the performance table of the top/bottom-decile long/short strategies.

    One row per entry of `methods`, evaluated from `startdate` (an integer in
    YYYYMMDD form) to 20161230. Columns: average return, Sharpe ratio, average
    turnover, maximum loss, maximum drawdown, then alpha and t-statistic for
    the CAPM, FF3, CH4 and FF5 models. Returns a pylatex `Tabular`.
    """
    period = dict(startdate=startdate, enddate=20161230)

    def pct(value, digits):
        # Fraction -> percent string with a fixed number of decimals.
        return '{:.{}f}'.format(100*value, digits)

    def alpha_cells(estimate):
        # (alpha, t-stat) -> (alpha in percent, t-stat in parentheses)
        alpha, tstat = estimate
        return pct(alpha, 2), '({:.2f})'.format(tstat)

    table = Tabular('lccccccccccccc')

    table.add_row(('', MultiColumn(13, align='c', data='{}-2016'.format(int(startdate/10000)))))
    table.add_hline(2, 14)

    # Two-line column header; each '(t-stat)' cell spans both header rows.
    header_top = ['', 'Avg.', 'Sharpe', 'Avg.', 'Max.', 'Max.']
    header_bottom = ['', 'Ret.', 'Ratio', 'Turn.', 'Loss', 'DD']
    for model in ('CAPM-', 'FF3-', 'CH4-', 'FF5-'):
        header_top += [model, MultiRow(2, data='(t-stat)')]
        header_bottom += ['alpha', '']
    table.add_row(tuple(header_top))
    table.add_row(tuple(header_bottom))

    for i, method in enumerate(methods):
        rets = yearrets_ls10[method]

        ar = pct(avgret(rets, **period), 2)
        sr = '{:.2f}'.format(sharperatio(rets, **period))
        max_loss = pct(maxLoss(rets, **period), 1)
        max_drawdown = pct(maxDD(compoundrets_ls10[method], **period), 1)
        # Turnover needs the portfolio weights, so the strategy is rebuilt here.
        avg_turnover = pct(turnover(longshortstrategy(method, 10), **period), 0)

        row = [methodlabels[i], ar, sr, avg_turnover, max_loss, max_drawdown]
        for alphafn in (capmalpha, ff3alpha, ch4alpha, ff5alpha):
            row.extend(alpha_cells(alphafn(rets, **period)))

        table.add_row(tuple(row))

    return table

# Export one LaTeX table per sample start date (1958, 1980 and 2000).
for _startdate, _texpath in ((19580131, 'tables/ls10_from58'),
                             (19800131, 'tables/ls10_from80'),
                             (20000131, 'tables/ls10_from00')):
    table = tablefrom(_startdate)
    table.generate_tex(_texpath)

print("Very nice!")

In [None]:
# Top/bottom-decile long/short strategies: four figures sharing one recipe.
# Each entry: (series per method, y-scale or None, title, output file name).
_ls10_figures = [
    # Equally-weighted compound returns
    (compoundrets_ew_ls10, 'symlog',
     "Equally-weighted Compound Returns - Top/Bottom Decile Long/Short Strategies",
     'compoundrets_ew_ls10.pdf'),
    # Value-weighted compound returns
    (compoundrets_ls10, 'symlog',
     "Value-weighted Compound Returns - Top/Bottom Decile Long/Short Strategies",
     'compoundrets_ls10.pdf'),
    # Ten-year Sharpe ratios
    (sharperatios_10y_ls10, None,
     "10-Year Sharpe Ratios - Top/Bottom Decile Long/Short Strategies",
     'sharperatios_10y_ls10.pdf'),
    # Ten-year FF3 alphas
    (ff3alphas_10y_ls10, None,
     "10-Year FF3 Alphas - Top/Bottom Decile Long/Short Strategies",
     'ff3alphas_10y_ls10.pdf'),
]

for _series, _yscale, _title, _pdfname in _ls10_figures:
    fig, ax = basefig(8,3)
    # Every line gets its own label, with no label offset.
    methodlabels_ = ['OLS', 'Lasso', 'Ridge', 'Enet', 'PCR', 'PLS', 'Tree', 'Forest', 'GBRT', 'NN1', 'NN2', 'NN3', 'NN5', 'NN10']
    offsets = [0] * 14
    labeledlines(_series, methodlabels_, offsets, ax)
    # Black reference line at zero.
    plt.plot(dtdates, np.zeros_like(dates), 'k')
    if _yscale is not None:
        plt.yscale(_yscale)
    plt.title(_title)
    plt.savefig(graphpath+_pdfname, bbox_inches='tight')
    plt.show()

### Rank-weighted Long/Short Strategy

In [None]:
# Result containers for the rank-weighted long/short strategy, keyed by
# method. The placeholder values are all overwritten in the loop below.
nanseries = np.full(T, np.nan)

# Value-weighted portfolios
yearrets_rank = dict.fromkeys(methods, nanseries)
compoundrets_rank = dict.fromkeys(methods, np.nan)
excessrets_rank = dict.fromkeys(methods, nanseries)
totalret_rank = dict.fromkeys(methods, np.nan)

# Equally-weighted portfolios
yearrets_ew_rank = dict.fromkeys(methods, nanseries)
compoundrets_ew_rank = dict.fromkeys(methods, nanseries)
excessrets_ew_rank = dict.fromkeys(methods, nanseries)
totalret_ew_rank = dict.fromkeys(methods, np.nan)

# Rolling statistics over 36- and 120-period windows
sharperatios_3y_rank = dict.fromkeys(methods, np.nan)
sharperatios_10y_rank = dict.fromkeys(methods, np.nan)

capmalphas_3y_rank = dict.fromkeys(methods, np.nan)
capmalphas_10y_rank = dict.fromkeys(methods, np.nan)

ff3alphas_3y_rank = dict.fromkeys(methods, np.nan)
ff3alphas_10y_rank = dict.fromkeys(methods, np.nan)


for method in methods:

    # Value-weighted rank portfolio.
    strategy = rankstrategy(method)
    (yearrets_rank[method],
     compoundrets_rank[method],
     excessrets_rank[method],
     totalret_rank[method]) = strategyreturns(strategy)

    # Same portfolio with equal weights.
    ewstrategy = rankstrategy(method, valueweighted=False)
    (yearrets_ew_rank[method],
     compoundrets_ew_rank[method],
     excessrets_ew_rank[method],
     totalret_ew_rank[method]) = strategyreturns(ewstrategy)

    # Rolling statistics use the value-weighted return series only.
    _rets = yearrets_rank[method]

    sharperatios_3y_rank[method] = sharperatios(_rets, 36)
    sharperatios_10y_rank[method] = sharperatios(_rets, 120)

    capmalphas_3y_rank[method] = capmalphas(_rets, 36)
    capmalphas_10y_rank[method] = capmalphas(_rets, 120)

    ff3alphas_3y_rank[method] = ff3alphas(_rets, 36)
    ff3alphas_10y_rank[method] = ff3alphas(_rets, 120)

print("All done!")

In [None]:
def tablefrom(startdate):
    """Build the performance table of the rank-weighted long/short strategy.

    One row per entry of `methods`, evaluated from `startdate` (an integer in
    YYYYMMDD form) to 20161230. Columns: average return, Sharpe ratio, average
    turnover, maximum loss, maximum drawdown, then alpha and t-statistic for
    the CAPM, FF3, CH4 and FF5 models. Returns a pylatex `Tabular`.
    """
    period = dict(startdate=startdate, enddate=20161230)

    def pct(value, digits):
        # Fraction -> percent string with a fixed number of decimals.
        return '{:.{}f}'.format(100*value, digits)

    def alpha_cells(estimate):
        # (alpha, t-stat) -> (alpha in percent, t-stat in parentheses)
        alpha, tstat = estimate
        return pct(alpha, 2), '({:.2f})'.format(tstat)

    table = Tabular('lccccccccccccc')

    table.add_row(('', MultiColumn(13, align='c', data='{}-2016'.format(int(startdate/10000)))))
    table.add_hline(2, 14)

    # Two-line column header; each '(t-stat)' cell spans both header rows.
    header_top = ['', 'Avg.', 'Sharpe', 'Avg.', 'Max.', 'Max.']
    header_bottom = ['', 'Ret.', 'Ratio', 'Turn.', 'Loss', 'DD']
    for model in ('CAPM-', 'FF3-', 'CH4-', 'FF5-'):
        header_top += [model, MultiRow(2, data='(t-stat)')]
        header_bottom += ['alpha', '']
    table.add_row(tuple(header_top))
    table.add_row(tuple(header_bottom))

    for i, method in enumerate(methods):
        rets = yearrets_rank[method]

        ar = pct(avgret(rets, **period), 2)
        sr = '{:.2f}'.format(sharperatio(rets, **period))
        max_loss = pct(maxLoss(rets, **period), 1)
        max_drawdown = pct(maxDD(compoundrets_rank[method], **period), 1)
        # Turnover needs the portfolio weights, so the strategy is rebuilt here.
        avg_turnover = pct(turnover(rankstrategy(method), **period), 0)

        row = [methodlabels[i], ar, sr, avg_turnover, max_loss, max_drawdown]
        for alphafn in (capmalpha, ff3alpha, ch4alpha, ff5alpha):
            row.extend(alpha_cells(alphafn(rets, **period)))

        table.add_row(tuple(row))

    return table

# Export one LaTeX table per sample start date, reporting progress after each.
for _startdate, _texpath, _message in ((19580131, 'tables/rank_from58', "Very nice!"),
                                       (19800131, 'tables/rank_from80', "Very nice!!"),
                                       (20000131, 'tables/rank_from00', "Very nice!!!")):
    table = tablefrom(_startdate)
    table.generate_tex(_texpath)

    print(_message)


In [None]:
# Rank-weighted long/short strategy: four figures sharing one recipe.
# Each entry: (series per method, draw a zero line?, y-scale or None,
#              title, output file name).
_rank_figures = [
    # Equally-weighted compound returns
    (compoundrets_ew_rank, False, 'symlog',
     "Equally-weighted Compound Returns - Rank-weighted Long/Short Strategy",
     'compoundrets_ew_rank.pdf'),
    # Value-weighted compound returns
    (compoundrets_rank, False, 'symlog',
     "Value-weighted Compound Returns - Rank-weighted Long/Short Strategy",
     'compoundrets_rank.pdf'),
    # Ten-year Sharpe ratios
    (sharperatios_10y_rank, True, None,
     "10-Year Sharpe Ratios - Rank-weighted Long/Short Strategy",
     'sharperatios_10y_rank.pdf'),
    # Ten-year FF3 alphas
    (ff3alphas_10y_rank, True, None,
     "10-Year FF3 Alphas - Rank-weighted Long/Short Strategy",
     'ff3alphas_10y_rank.pdf'),
]

for _series, _zeroline, _yscale, _title, _pdfname in _rank_figures:
    fig, ax = basefig(8,3)
    # Every line gets its own label, with no label offset.
    methodlabels_ = ['OLS', 'Lasso', 'Ridge', 'Enet', 'PCR', 'PLS', 'Tree', 'Forest', 'GBRT', 'NN1', 'NN2', 'NN3', 'NN5', 'NN10']
    offsets = [0] * 14
    labeledlines(_series, methodlabels_, offsets, ax)
    if _zeroline:
        plt.plot(dtdates, np.zeros_like(dates), 'k')
    if _yscale is not None:
        plt.yscale(_yscale)
    plt.title(_title)
    plt.savefig(graphpath+_pdfname, bbox_inches='tight')
    plt.show()

### Robustness check: restricting the universe to the Top 1000 / Top 100 stocks by market capitalization

In [None]:
def _kth_largest(values, k):
    """Return the k-th largest non-NaN entry of the 1-D array `values`.

    If fewer than `k` valid entries exist, the smallest valid entry is
    returned so that every available stock clears the threshold (the original
    `np.partition` call raised in that case). Returns NaN when there is no
    valid entry at all.
    """
    valid = values[~np.isnan(values)]
    if valid.size == 0:
        return np.nan
    if valid.size < k:
        return valid.min()
    return np.partition(valid, -k)[-k]


# Per-date market-cap thresholds: the cap of the 1000th / 100th largest stock.
# (The unused full copy `a = mktcaps.to_numpy()` was dropped.)
mktcaplims_top1000 = []
mktcaplims_top100 = []

for date in dates:

    mktcaps_date = mktcaps.loc[date].to_numpy()

    # Plain list appends avoid re-allocating an array on every iteration.
    mktcaplims_top1000.append(_kth_largest(mktcaps_date, 1000))
    mktcaplims_top100.append(_kth_largest(mktcaps_date, 100))

mktcaplims_top1000 = np.asarray(mktcaplims_top1000, dtype=float)
mktcaplims_top100 = np.asarray(mktcaplims_top100, dtype=float)

# Boolean frames: True where a stock's cap reaches that row's threshold.
# The thresholds are aligned with the rows of `mktcaps` (axis=0).
mask_top1000 = (mktcaps.ge(mktcaplims_top1000, axis=0))
mask_top100 = (mktcaps.ge(mktcaplims_top100, axis=0))

In [None]:
def longstrategy_subset(method, percentile, valueweighted=True, mask=mask_top1000):
    """Long-only weights on the best-predicted stocks of a restricted universe.

    Within the stocks selected by `mask`, every stock whose prediction from
    `method` is at or above the row's (100 - percentile)-th percentile gets a
    positive weight. Weights are proportional to market cap when
    `valueweighted` is true, equal otherwise, and each row is rescaled so that
    absolute weights sum to one. Returns a DataFrame shaped like `returns`.

    Note: calling this function silences all warnings process-wide.
    """
    warnings.filterwarnings('ignore')

    # 0 where a return is observed, NaN where it is missing.
    weights = returns.copy().to_numpy()
    weights[~np.isnan(weights)] = 0

    # Indexing with the boolean frame presumably blanks out (NaN) predictions
    # outside the universe, so they are ignored by nanpercentile and fail the
    # comparison below -- assumes predictions[method] is a DataFrame.
    masked_prediction = predictions[method].copy()[mask]
    cutoff = np.nanpercentile(masked_prediction, 100 - percentile, axis=1, keepdims=True)

    weights[masked_prediction >= cutoff] = 1

    if valueweighted:
        weights = weights * mktcaps.values

    # Row-wise normalisation.
    weights = weights / np.nansum(abs(weights), axis=1, keepdims=True)

    return pd.DataFrame(weights, index=returns.index, columns=returns.columns)


In [None]:
def longshortstrategy_subset(method, percentile, valueweighted=True, mask=mask_top1000):
    """Long/short weights on the extreme predictions of a restricted universe.

    Within the stocks selected by `mask`, stocks whose prediction from
    `method` lies strictly above the row's (100 - percentile)-th percentile
    are bought and those strictly below the `percentile`-th percentile are
    sold. Weights are proportional to market cap when `valueweighted` is true,
    equal otherwise, and each row is rescaled so that absolute weights sum to
    two. Returns a DataFrame shaped like `returns`.

    Note: calling this function silences all warnings process-wide.
    """
    warnings.filterwarnings('ignore')

    # 0 where a return is observed, NaN where it is missing.
    weights = returns.copy().to_numpy()
    weights[~np.isnan(weights)] = 0

    # Presumably NaN outside the universe -- assumes predictions[method] is a
    # DataFrame, so that boolean-frame indexing masks rather than selects.
    masked_prediction = predictions[method].copy()[mask]

    upper_cutoff = np.nanpercentile(masked_prediction, 100 - percentile, axis=1, keepdims=True)
    lower_cutoff = np.nanpercentile(masked_prediction, percentile, axis=1, keepdims=True)

    weights[masked_prediction > upper_cutoff] = 1
    weights[masked_prediction < lower_cutoff] = -1

    if valueweighted:
        weights = weights * mktcaps.values

    # Half of the gross exposure is used as the scaling unit.
    weights = weights / (np.nansum(abs(weights), axis=1, keepdims=True) / 2)

    return pd.DataFrame(weights, index=returns.index, columns=returns.columns)

In [None]:
def rankstrategy_subset(method, valueweighted=True, mask=mask_top1000):
    """Rank-weighted long/short weights restricted to the stocks in `mask`.

    Each stock's raw weight is its cross-sectional prediction rank minus the
    row's mid-rank, optionally multiplied by market cap. Stocks outside `mask`
    then get zero weight, and every row is rescaled so that absolute weights
    sum to two. Returns a DataFrame shaped like `returns`.

    Fix: the original zeroed the stocks *inside* `mask` (`strategy[mask] = 0`),
    which kept exactly the stocks the robustness check is meant to exclude and
    disagreed with `longstrategy_subset` / `longshortstrategy_subset`.

    Note: calling this function silences all warnings process-wide.
    """
    warnings.filterwarnings('ignore')

    # 0 where a return is observed, NaN where it is missing.
    strategy = returns.copy().to_numpy()
    strategy[~np.isnan(strategy)] = 0

    prediction = predictions[method].copy().to_numpy()
    # Missing predictions sort last and therefore receive the highest ranks.
    prediction[np.isnan(prediction)] = np.inf

    # Double argsort turns values into 1-based ranks within each row.
    prediction_order = np.argsort(prediction, axis=1)
    prediction_rank = np.argsort(prediction_order, axis=1) + 1

    numberofstocks = np.count_nonzero(~np.isnan(strategy), axis=1)

    # Centre the ranks on the mid-rank (n+1)/2 of the stocks with a return.
    strategy = (prediction_rank.T - ((numberofstocks+1)/2))
    strategy = strategy.T

    strategy[np.isnan(returns.values)] = np.nan

    if valueweighted: strategy = strategy * mktcaps.values

    # Ranks are still computed over the full cross-section ("this is cheating
    # a bit"); only afterwards are the stocks outside the universe dropped.
    # Missing entries stay NaN.
    outside = ~np.asarray(mask, dtype=bool)
    strategy[outside & ~np.isnan(strategy)] = 0

    rescaler = np.nansum(abs(strategy), axis=1, keepdims=True) / 2
    strategy = strategy / rescaler

    strategy = pd.DataFrame(strategy)
    strategy.index, strategy.columns = returns.index, returns.columns

    return strategy

In [None]:
def tablefrom(startdate, mask):
    """Build the performance table of the top-decile long strategies on a restricted universe.

    The strategies are built with `longstrategy_subset` on the stocks selected
    by `mask`. One row per entry of `methods`, evaluated from `startdate` (an
    integer in YYYYMMDD form) to 20161230. Columns: average return, Sharpe
    ratio, average turnover, maximum loss, maximum drawdown, then alpha and
    t-statistic for the CAPM, FF3, CH4 and FF5 models. Returns a pylatex
    `Tabular`.
    """
    period = dict(startdate=startdate, enddate=20161230)

    def pct(value, digits):
        # Fraction -> percent string with a fixed number of decimals.
        return '{:.{}f}'.format(100*value, digits)

    def alpha_cells(estimate):
        # (alpha, t-stat) -> (alpha in percent, t-stat in parentheses)
        alpha, tstat = estimate
        return pct(alpha, 2), '({:.2f})'.format(tstat)

    table = Tabular('lccccccccccccc')

    table.add_row(('', MultiColumn(13, align='c', data='{}-2016'.format(int(startdate/10000)))))
    table.add_hline(2, 14)

    # Two-line column header; each '(t-stat)' cell spans both header rows.
    header_top = ['', 'Avg.', 'Sharpe', 'Avg.', 'Max.', 'Max.']
    header_bottom = ['', 'Ret.', 'Ratio', 'Turn.', 'Loss', 'DD']
    for model in ('CAPM-', 'FF3-', 'CH4-', 'FF5-'):
        header_top += [model, MultiRow(2, data='(t-stat)')]
        header_bottom += ['alpha', '']
    table.add_row(tuple(header_top))
    table.add_row(tuple(header_bottom))

    for i, method in enumerate(methods):

        weights = longstrategy_subset(method, 10, mask=mask)
        rets, compound, _excess, _total = strategyreturns(weights)

        ar = pct(avgret(rets, **period), 2)
        sr = '{:.2f}'.format(sharperatio(rets, **period))
        max_loss = pct(maxLoss(rets, **period), 1)
        max_drawdown = pct(maxDD(compound, **period), 1)
        # The weights are rebuilt for the turnover computation.
        avg_turnover = pct(turnover(longstrategy_subset(method, 10, mask=mask), **period), 0)

        row = [methodlabels[i], ar, sr, avg_turnover, max_loss, max_drawdown]
        for alphafn in (capmalpha, ff3alpha, ch4alpha, ff5alpha):
            row.extend(alpha_cells(alphafn(rets, **period)))

        table.add_row(tuple(row))

    return table

# Export the restricted-universe tables. The output names were previously
# swapped (the Top-1000 table was written to 'l10_top100' and vice versa);
# each file name now matches the mask used, as for the ls10_* and rank_* tables.
table = tablefrom(19580131, mask=mask_top1000)
table.generate_tex('tables/l10_top1000')

table = tablefrom(19580131, mask=mask_top100)
table.generate_tex('tables/l10_top100')

print("Very nice!")


In [None]:
def tablefrom(startdate, mask):
    """Build the performance table of the decile long/short strategies on a restricted universe.

    The strategies are built with `longshortstrategy_subset` on the stocks
    selected by `mask`. One row per entry of `methods`, evaluated from
    `startdate` (an integer in YYYYMMDD form) to 20161230. Columns: average
    return, Sharpe ratio, average turnover, maximum loss, maximum drawdown,
    then alpha and t-statistic for the CAPM, FF3, CH4 and FF5 models. Returns
    a pylatex `Tabular`.
    """
    period = dict(startdate=startdate, enddate=20161230)

    def pct(value, digits):
        # Fraction -> percent string with a fixed number of decimals.
        return '{:.{}f}'.format(100*value, digits)

    def alpha_cells(estimate):
        # (alpha, t-stat) -> (alpha in percent, t-stat in parentheses)
        alpha, tstat = estimate
        return pct(alpha, 2), '({:.2f})'.format(tstat)

    table = Tabular('lccccccccccccc')

    table.add_row(('', MultiColumn(13, align='c', data='{}-2016'.format(int(startdate/10000)))))
    table.add_hline(2, 14)

    # Two-line column header; each '(t-stat)' cell spans both header rows.
    header_top = ['', 'Avg.', 'Sharpe', 'Avg.', 'Max.', 'Max.']
    header_bottom = ['', 'Ret.', 'Ratio', 'Turn.', 'Loss', 'DD']
    for model in ('CAPM-', 'FF3-', 'CH4-', 'FF5-'):
        header_top += [model, MultiRow(2, data='(t-stat)')]
        header_bottom += ['alpha', '']
    table.add_row(tuple(header_top))
    table.add_row(tuple(header_bottom))

    for i, method in enumerate(methods):

        weights = longshortstrategy_subset(method, 10, mask=mask)
        rets, compound, _excess, _total = strategyreturns(weights)

        ar = pct(avgret(rets, **period), 2)
        sr = '{:.2f}'.format(sharperatio(rets, **period))
        max_loss = pct(maxLoss(rets, **period), 1)
        max_drawdown = pct(maxDD(compound, **period), 1)
        # The weights are rebuilt for the turnover computation.
        avg_turnover = pct(turnover(longshortstrategy_subset(method, 10, mask=mask), **period), 0)

        row = [methodlabels[i], ar, sr, avg_turnover, max_loss, max_drawdown]
        for alphafn in (capmalpha, ff3alpha, ch4alpha, ff5alpha):
            row.extend(alpha_cells(alphafn(rets, **period)))

        table.add_row(tuple(row))

    return table

# Export the long/short tables for both restricted universes.
for _mask, _texpath in ((mask_top1000, 'tables/ls10_top1000'),
                        (mask_top100, 'tables/ls10_top100')):
    table = tablefrom(19580131, mask=_mask)
    table.generate_tex(_texpath)

print("Very nice!")


In [None]:
def tablefrom(startdate, mask):
    """Build the performance table of the rank-weighted strategy on a restricted universe.

    The strategies are built with `rankstrategy_subset` on the stocks selected
    by `mask`. One row per entry of `methods`, evaluated from `startdate` (an
    integer in YYYYMMDD form) to 20161230. Columns: average return, Sharpe
    ratio, average turnover, maximum loss, maximum drawdown, then alpha and
    t-statistic for the CAPM, FF3, CH4 and FF5 models. Returns a pylatex
    `Tabular`.
    """
    table = Tabular('lccccccccccccc')

    table.add_row(('', MultiColumn(13, align='c', data=str(int(startdate/10000))+'-2016')))
    table.add_hline(2, 14)

    table.add_row(('', 'Avg.', 'Sharpe', 'Avg.', 'Max.', 'Max.',
                    'CAPM-', MultiRow(2, data='(t-stat)'), 'FF3-', MultiRow(2, data='(t-stat)'),
                    'CH4-', MultiRow(2, data='(t-stat)'), 'FF5-', MultiRow(2, data='(t-stat)')))
    table.add_row(('', 'Ret.', 'Ratio', 'Turn.', 'Loss', 'DD', 'alpha', '', 'alpha', '', 'alpha', '', 'alpha', ''))

    for (i,method) in enumerate(methods):

        strategy = rankstrategy_subset(method, mask=mask)

        yearrets, compoundrets, excessrets, totalret = strategyreturns(strategy)

        ar = avgret(yearrets, startdate=startdate, enddate=20161230)
        ar = '{:.2f}'.format(100*ar)

        sr = sharperatio(yearrets, startdate=startdate, enddate=20161230)
        sr = '{:.2f}'.format(sr)

        Max_Loss = maxLoss(yearrets, startdate=startdate, enddate=20161230)
        Max_Loss = '{:.1f}'.format(100*Max_Loss)

        Max_Drawdown = maxDD(compoundrets, startdate=startdate, enddate=20161230)
        Max_Drawdown = '{:.1f}'.format(100*Max_Drawdown)

        # rankstrategy_subset has no percentile argument: the stray positional
        # `10` copied from the decile tables landed in `valueweighted` and only
        # worked because 10 is truthy. Call it like the strategy above.
        Avg_Turnover = turnover(rankstrategy_subset(method, mask=mask), startdate=startdate, enddate=20161230)
        Avg_Turnover = '{:.0f}'.format(100*Avg_Turnover)

        capma, t_capma = capmalpha(yearrets, startdate=startdate, enddate=20161230)
        capma, t_capma = '{:.2f}'.format(100*capma), '({:.2f})'.format(t_capma)

        ff3a, t_ff3a = ff3alpha(yearrets, startdate=startdate, enddate=20161230)
        ff3a, t_ff3a = '{:.2f}'.format(100*ff3a), '({:.2f})'.format(t_ff3a)

        ch4a, t_ch4a = ch4alpha(yearrets, startdate=startdate, enddate=20161230)
        ch4a, t_ch4a = '{:.2f}'.format(100*ch4a), '({:.2f})'.format(t_ch4a)

        ff5a, t_ff5a = ff5alpha(yearrets, startdate=startdate, enddate=20161230)
        ff5a, t_ff5a = '{:.2f}'.format(100*ff5a), '({:.2f})'.format(t_ff5a)

        table.add_row((methodlabels[i], ar, sr, Avg_Turnover, Max_Loss, Max_Drawdown, capma, t_capma, ff3a, t_ff3a, ch4a, t_ch4a, ff5a, t_ff5a))

    return table

# Export the rank-weighted tables for both restricted universes.
for _mask, _texpath in ((mask_top1000, 'tables/rank_top1000'),
                        (mask_top100, 'tables/rank_top100')):
    table = tablefrom(19580131, mask=_mask)
    table.generate_tex(_texpath)

print("Very nice!")


### Robustness check: transaction costs

In [None]:
def computemonthlyturnover(strategy):
    """Compute a per-row turnover series for a frame of portfolio weights.

    For every row, the absolute differences between the neighbouring row's
    weights and the current weights grown by (1 + return) are summed over all
    columns and divided by the row's total absolute weight. Reads the global
    `returns`, which must have the same shape as `strategy`.
    """

    strategy_t = strategy
    # NOTE(review): shift(1) moves rows down, so row t here holds the weights
    # of row t-1, although the variable name suggests t+1 -- confirm whether
    # shift(-1) was intended.
    strategy_tp1 = strategy_t.shift(1)

    # Missing weights count as "not held".
    strategy_t = strategy_t.fillna(0)
    strategy_tp1 = strategy_tp1.fillna(0)

    # Where the return is NaN the product is NaN and is skipped by nansum below.
    indturnover = abs((strategy_tp1.values - strategy_t.values * (1+returns.values)))

    monthlyturnover = np.nansum(indturnover, axis=1) / np.nansum(abs(strategy_t), axis=1)
    # Rows with zero total weight but positive trading divide to inf; cap at 2.
    monthlyturnover[np.isinf(monthlyturnover)] = 2

    return monthlyturnover

def returnsaftercosts(yearrets, monthlyturnover, commission):
    """Deduct proportional trading costs from a return series.

    Each period's return is reduced by `commission` times that period's
    turnover. Returns a 4-tuple: the net returns, their cumulative product
    (1 + r compounded), the final compounded value, and the per-period
    geometric average over the global number of periods `T`.
    """
    net_returns = yearrets - commission * monthlyturnover

    net_compound = np.cumprod(1 + net_returns)
    net_total = net_compound[-1]

    # Geometric mean growth rate per period.
    net_average = net_total**(1/T) - 1

    return net_returns, net_compound, net_total, net_average


# Trading-cost robustness figure: average return (left panel) and FF3 alpha
# (right panel) of the top-decile long strategies as a function of the
# proportional commission rate.
avgrets_tradingscosts = np.array([])
# Ten commission rates between 0 and 0.1 (shown as 0%-10% on the x axis).
commissions = np.linspace(0, 0.1, 10)

costmethods = fewmethods
# Vertical label offsets (in points) for the left panel, one per method.
offsets = [0, 0, 0, 0, 3, -3]


fig, ax = plt.subplots(1, 2, figsize=(16, 6))

for ax_ in ax:
    ax_.spines['right'].set_visible(False)
    ax_.spines['top'].set_visible(False)

for (i,method) in enumerate(costmethods):

    strategy = longstrategy(method, 10)

    yearrets, compoundrets, _, _ = strategyreturns(strategy)

    monthlyturnover = computemonthlyturnover(strategy)

    # One value per commission rate for the current method.
    avgrets_tradingscosts = np.array([])
    alphas_tradingscosts = np.array([])

    for commission in commissions:

        yearrets_tradingcosts, _ , _ , avgret_tradingscosts = returnsaftercosts(yearrets, monthlyturnover, commission)
        alpha_tradingscosts = ff3alpha(yearrets_tradingcosts)[0]

        avgrets_tradingscosts = np.append(avgrets_tradingscosts, avgret_tradingscosts)
        alphas_tradingscosts = np.append(alphas_tradingscosts, alpha_tradingscosts)

    # The label is passed positionally: the `s=` keyword of annotate was
    # renamed to `text` and later removed, so `s=` fails on current Matplotlib
    # while the positional form works on old and new versions alike.
    ax[0].plot(commissions, avgrets_tradingscosts)
    ax[0].annotate(fewmethodlabels[i], xy=(commissions[-1], avgrets_tradingscosts[-1]), xytext=(5,offsets[i]), textcoords='offset points', va='center')

    ax[1].plot(commissions, alphas_tradingscosts)
    ax[1].annotate(fewmethodlabels[i], xy=(commissions[-1], alphas_tradingscosts[-1]), xytext=(5,0), textcoords='offset points', va='center')


# Zero reference line, axis labels and a common y range for both panels.
ax[0].plot(commissions, np.zeros_like(commissions), 'k', linewidth=1)
ax[0].set_ylabel('Average monthly return, adjusted for trading costs')
ax[0].set_xlabel('Trading fees: commission in percent')
ax[0].set_title('Returns')
ax[0].xaxis.set_major_formatter(mtick.PercentFormatter(1))
ax[0].set_ylim([-0.14, 0.075])


ax[1].plot(commissions, np.zeros_like(commissions), 'k', linewidth=1)
ax[1].set_ylabel('Average monthly alpha, adjusted for trading costs')
ax[1].set_xlabel('Trading fees: commission in percent')
ax[1].set_title('Alphas')
ax[1].xaxis.set_major_formatter(mtick.PercentFormatter(1))
ax[1].set_ylim([-0.14, 0.075])

plt.tight_layout(pad=1)
plt.savefig(graphpath+'tradingcosts.pdf', bbox_inches='tight')
plt.show()


## Trading Volume and Short Interest

### Data transformation, exploratory graphs

In [None]:
# Per-row share of columns with an observed (non-NaN) return.
return_nanshares = (~np.isnan(returns)).mean(axis=1)

In [None]:
# Trading ratio = trading volume (cshtrm) / shares outstanding (shrout),
# reshaped into a date x permno panel.
df['trmratio'] = df['cshtrm'] / df['shrout']
trmratios = df[['date','permno','trmratio',]].pivot(index='date', columns='permno', values='trmratio')

# NOTE(review): this copy is immediately overwritten by the next assignment.
trmratios_i = trmratios.copy()

# Interpolated version: linear interpolation along the index, filling at most
# 3 consecutive missing values; entries without a return are blanked again.
trmratios_i = trmratios.interpolate(method='linear', limit=3)
trmratios_i[np.isnan(returns)] = np.nan

# Clip each row to its own 0.1% and 99.9% quantiles.
trmratios = trmratios.clip(trmratios.quantile(0.001, axis=1), trmratios.quantile(0.999, axis=1), axis=0)
trmratios_i = trmratios_i.clip(trmratios_i.quantile(0.001, axis=1), trmratios_i.quantile(0.999, axis=1), axis=0)


# Number of non-missing observations per row.
trmratios_obs = np.count_nonzero(~np.isnan(trmratios), axis=1)
trmratios_i_obs = np.count_nonzero(~np.isnan(trmratios_i), axis=1)

# Coverage relative to returns: the "*_nanshares" series hold the share of
# NON-missing entries per row; the "*_missingshares" series hold the relative
# shortfall against return_nanshares (computed in an earlier cell).
trmratios_nanshares = np.mean(~np.isnan(trmratios), axis=1)
trmratios_missingshares = (return_nanshares - trmratios_nanshares) / return_nanshares

trmratios_i_nanshares = np.mean(~np.isnan(trmratios_i), axis=1)
trmratios_i_missingshares = (return_nanshares - trmratios_i_nanshares) / return_nanshares


# Three panels: observation counts, missing shares, and one example series.
fig, ax = plt.subplots(1, 3, figsize=(16, 4))

for axis in ax:
    axis.spines['right'].set_visible(False)
    axis.spines['top'].set_visible(False)

ax[0].plot(dtdates, trmratios_obs, 'o', alpha=0.1, markersize=6, label='Original data')
ax[0].plot(dtdates, trmratios_i_obs, 'C2', linewidth=1, label='Interpolation')
ax[0].legend(loc='upper left', frameon=False)
ax[0].set_title("Number of Observations", pad=10)

ax[1].plot(dtdates, trmratios_missingshares, 'o', alpha=0.1, markersize=6)
ax[1].plot(dtdates, trmratios_i_missingshares, 'C2', linewidth=1)
ax[1].set_title("Share of Missing Observations", pad=10)

# Column 59408 is presumably Ford's permno (per the panel title) -- verify.
# The third plot call marks only the points that exist solely through interpolation.
ax[2].plot(dtdates, trmratios[59408], 'o', markersize=6, label='Original data')
ax[2].plot(dtdates, trmratios_i[59408], 'C2', linewidth=1)
ax[2].plot(dtdates[np.isnan(trmratios[59408])], trmratios_i[59408][np.isnan(trmratios[59408])], 'C2o', markersize=6, label='Interpolation')
ax[2].legend(loc='lower right', frameon=False)
ax[2].set_yscale('log')
ax[2].set_title("Interpolation Example : Ford", pad=10)

fig.suptitle("Trading Ratio : Monthly Volume / Shares Outstanding", size=18, y=1.1)
plt.savefig(graphpath+'datanalysis_trmratios.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Short ratio = short interest (shortint) / shares outstanding (shrout),
# reshaped into a date x permno panel.
df['shortratio'] = df['shortint'] / df['shrout']
shortratios = df[['date','permno','shortratio',]].pivot(index='date', columns='permno', values='shortratio')


# Interpolated version: linear interpolation along the index, filling at most
# 3 consecutive missing values; entries without a return are blanked again.
shortratios_i = shortratios.interpolate(method='linear', limit=3)
shortratios_i[np.isnan(returns)] = np.nan

# Clip each row to its own 0.1% and 99.9% quantiles.
shortratios = shortratios.clip(shortratios.quantile(0.001, axis=1), shortratios.quantile(0.999, axis=1), axis=0)
shortratios_i = shortratios_i.clip(shortratios_i.quantile(0.001, axis=1), shortratios_i.quantile(0.999, axis=1), axis=0)

# Number of non-missing observations per row.
shortratios_obs = np.count_nonzero(~np.isnan(shortratios), axis=1)
shortratios_i_obs = np.count_nonzero(~np.isnan(shortratios_i), axis=1)

# Coverage relative to returns: the "*_nanshares" series hold the share of
# NON-missing entries per row; the "*_missingshares" series hold the relative
# shortfall against return_nanshares (computed in an earlier cell).
shortratios_nanshares = np.mean(~np.isnan(shortratios), axis=1)
shortratios_missingshares = (return_nanshares - shortratios_nanshares) / return_nanshares

shortratios_i_nanshares = np.mean(~np.isnan(shortratios_i), axis=1)
shortratios_i_missingshares = (return_nanshares - shortratios_i_nanshares) / return_nanshares

# Three panels: observation counts, missing shares, and one example series.
fig, ax = plt.subplots(1, 3, figsize=(16, 4))

for axis in ax:
    axis.spines['right'].set_visible(False)
    axis.spines['top'].set_visible(False)

ax[0].plot(dtdates, shortratios_obs, 'o', alpha=0.1, markersize=6, label='Original data')
ax[0].plot(dtdates, shortratios_i_obs, 'C2', linewidth=1, label='Interpolation')
ax[0].legend(loc='upper left', frameon=False)
ax[0].set_title("Number of Observations", pad=10)

ax[1].plot(dtdates, shortratios_missingshares, 'o', alpha=0.1, markersize=6)
ax[1].plot(dtdates, shortratios_i_missingshares, 'C2', linewidth=1)
ax[1].set_title("Share of Missing Observations", pad=10)

# Column 59408 is presumably Ford's permno (per the panel title) -- verify.
# The third plot call marks only the points that exist solely through interpolation.
ax[2].plot(dtdates, shortratios[59408], 'o', markersize=6, label='Original data')
ax[2].plot(dtdates, shortratios_i[59408], 'C2', linewidth=1)
ax[2].plot(dtdates[np.isnan(shortratios[59408])], shortratios_i[59408][np.isnan(shortratios[59408])], 'C2o', markersize=6, label='Interpolation')
ax[2].legend(loc='lower right', frameon=False)
ax[2].set_yscale('log')
ax[2].set_title("Interpolation Example : Ford", pad=10)

fig.suptitle("Short Ratio : Short Interest / Shares Outstanding", size=18, y=1.1)
plt.savefig(graphpath+'datanalysis_shortratios.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Days to cover = short interest / average daily volume, with the average
# daily volume taken as monthly volume / 21 trading days. The previous
# expression `shortint / cshtrm / 21` divided the ratio by 21 instead of
# multiplying it, understating days to cover by a factor of 21**2.
# NOTE(review): assumes shortint and cshtrm are in the same share units -- confirm.
df['daystocover'] = df['shortint'] / (df['cshtrm'] / 21)
daystocovers = df[['date','permno','daystocover',]].pivot(index='date', columns='permno', values='daystocover')

# Interpolated version: linear interpolation along the index, filling at most
# 3 consecutive missing values; entries without a return are blanked again.
daystocovers_i = daystocovers.interpolate(method='linear', limit=3)
daystocovers_i[np.isnan(returns)] = np.nan

# Clip each row to its own 0.1% and 99.9% quantiles.
daystocovers = daystocovers.clip(daystocovers.quantile(0.001, axis=1), daystocovers.quantile(0.999, axis=1), axis=0)
daystocovers_i = daystocovers_i.clip(daystocovers_i.quantile(0.001, axis=1), daystocovers_i.quantile(0.999, axis=1), axis=0)

# Number of non-missing observations per row.
daystocovers_obs = np.count_nonzero(~np.isnan(daystocovers), axis=1)
daystocovers_i_obs = np.count_nonzero(~np.isnan(daystocovers_i), axis=1)

# Coverage relative to returns: the "*_nanshares" series hold the share of
# NON-missing entries per row; the "*_missingshares" series hold the relative
# shortfall against return_nanshares (computed in an earlier cell).
daystocovers_nanshares = np.mean(~np.isnan(daystocovers), axis=1)
daystocovers_missingshares = (return_nanshares - daystocovers_nanshares) / return_nanshares

daystocovers_i_nanshares = np.mean(~np.isnan(daystocovers_i), axis=1)
daystocovers_i_missingshares = (return_nanshares - daystocovers_i_nanshares) / return_nanshares

# Three panels: observation counts, missing shares, and one example series.
fig, ax = plt.subplots(1, 3, figsize=(16, 4))

for axis in ax:
    axis.spines['right'].set_visible(False)
    axis.spines['top'].set_visible(False)

ax[0].plot(dtdates, daystocovers_obs, 'o', alpha=0.1, markersize=6, label='Original data')
ax[0].plot(dtdates, daystocovers_i_obs, 'C2', linewidth=1, label='Interpolation')
ax[0].legend(loc='upper left', frameon=False)
ax[0].set_title("Number of Observations", pad=10)

ax[1].plot(dtdates, daystocovers_missingshares, 'o', alpha=0.1, markersize=6)
ax[1].plot(dtdates, daystocovers_i_missingshares, 'C2', linewidth=1)
ax[1].set_title("Share of Missing Observations", pad=10)

# Column 59408 is presumably Ford's permno (per the panel title) -- verify.
# The third plot call marks only the points that exist solely through interpolation.
ax[2].plot(dtdates, daystocovers[59408], 'o', markersize=6, label='Original data')
ax[2].plot(dtdates, daystocovers_i[59408], 'C2', linewidth=1)
ax[2].plot(dtdates[np.isnan(daystocovers[59408])], daystocovers_i[59408][np.isnan(daystocovers[59408])], 'C2o', markersize=6, label='Interpolation')
ax[2].legend(loc='lower right', frameon=False)
ax[2].set_yscale('log')
ax[2].set_title("Interpolation Example : Ford", pad=10)

# Title updated to match the corrected definition.
fig.suptitle("Days to Cover : Short Interest / Average Daily Volume", size=18, y=1.1)
plt.savefig(graphpath+'datanalysis_daystocovers.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Row-wise percentile ranks (0-100) of each stock's short ratio and days to cover.
shortratios_pctile = shortratios.rank(axis=1, pct=True) * 100
daystocovers_pctile = daystocovers.rank(axis=1, pct=True) * 100

print("All done!")

## Trading activity and short interest ratios around ML-Portfolios

In [None]:
def strategytrmratios(strategy):
    """Return the strategy-weighted sum of interpolated trading ratios per row.

    ``strategy`` is multiplied element-wise with the module-level
    ``trmratios_i`` and the products are summed along axis 1; NaN products
    are treated as zero by ``np.nansum``.
    """
    weighted = strategy.values * trmratios_i.values
    return np.nansum(weighted, axis=1)

def strategyshortratios(strategy):
    """Return the strategy-weighted sum of interpolated short ratios per row.

    ``strategy`` is multiplied element-wise with the module-level
    ``shortratios_i`` and the products are summed along axis 1; NaN products
    are treated as zero by ``np.nansum``.
    """
    weighted = strategy.values * shortratios_i.values
    return np.nansum(weighted, axis=1)

def strategydaystocovers(strategy):
    """Return the weighted days to cover of the short side per row.

    Only entries with a negative weight in ``strategy`` contribute. Each is
    weighted by minus its strategy weight, doubled, and multiplied with the
    module-level ``daystocovers_i``. NaN products are ignored in the row sum.
    """
    weights = strategy.values
    # Short-position indicator, doubled (original author's note: "so that weights sum to 1").
    short_flag = ((strategy < 0).astype('int') * 2).values
    short_daystocovers = short_flag * daystocovers_i.values
    return np.nansum(-weights * short_daystocovers, axis=1)

def strategydaystocovers_max(strategy):
    """Return, per row, the largest days to cover among shorted positions.

    Entries whose ``strategy`` weight is not negative are zeroed out before
    the NaN-ignoring row maximum of ``daystocovers_i`` is taken.
    """
    is_short = (strategy < 0).astype('int').values
    return np.nanmax(is_short * daystocovers_i.values, axis=1)

In [None]:
# Top/bottom decile strategies
#
# For every method in `fewmethods`, build the decile long/short strategy and
# compute four strategy-level series (trading ratio, short ratio, weighted days
# to cover, maximum days to cover), then plot each series with its moving average.

nanseries = np.full(T, np.nan)
trmratios_ls10 = dict(zip(fewmethods,[nanseries] * Mfew))
shortratios_ls10 = dict(zip(fewmethods,[nanseries] * Mfew))
daystocovers_ls10 = dict(zip(fewmethods,[nanseries] * Mfew))
daystocovers_max_ls10 = dict(zip(fewmethods,[nanseries] * Mfew))

for method in fewmethods:

    strategy = longshortstrategy(method, 10)

    trmratios_ls10[method] = strategytrmratios(strategy)
    shortratios_ls10[method] = strategyshortratios(strategy)
    daystocovers_ls10[method] = strategydaystocovers(strategy)
    daystocovers_max_ls10[method] = strategydaystocovers_max(strategy)

print("All done!")


def _plot_ls10_series(series_by_method, title, filename, raw_alpha=0.1, ylim=None):
    """Plot one series per method (faint) together with its centered moving average.

    Parameters
    ----------
    series_by_method : dict
        Maps each entry of ``fewmethods`` to the series plotted against ``dtdates``.
    title : str
        Figure title.
    filename : str
        File name (relative to ``graphpath``) the figure is saved under.
    raw_alpha : float
        Opacity of the raw (unsmoothed) series.
    ylim : tuple or None
        ``(bottom, top)`` y-limits; ``None`` keeps matplotlib's autoscaling.

    Returns
    -------
    (fig, ax) as returned by ``basefig``.
    """
    fig, ax = basefig(10, 5)

    for i, method in enumerate(fewmethods):
        ax.plot(dtdates, series_by_method[method], color=fewmethodcolors[i], alpha=raw_alpha)
        ax.plot(dtdates, ma(series_by_method[method], centered=True), color=fewmethodcolors[i],
                label=fewmethodlabels[i] + " (mov. avg.)")

    # Zero reference line.
    ax.plot(dtdates, np.zeros(len(dtdates)), 'k-')

    # The notebook header runs `from datetime import datetime` after
    # `import datetime`, so `datetime.date(1980, 1, 1)` would call the instance
    # method datetime.datetime.date on an int and raise TypeError.
    # pd.Timestamp works whichever binding of `datetime` is live.
    plt.xlim(left=pd.Timestamp(1980, 1, 1))
    if ylim is not None:
        plt.ylim(*ylim)
    plt.legend(ncol=3, frameon=False)
    plt.title(title)
    plt.savefig(graphpath + filename, bbox_inches='tight')
    plt.show()

    return fig, ax


# Trading ratios:
fig, ax = _plot_ls10_series(trmratios_ls10,
                            "Weighted Trading Ratios - Top/Bottom Decile Long/Short Strategy",
                            'trmratios_i_ls10.pdf', raw_alpha=0.2)

# Short ratios:
fig, ax = _plot_ls10_series(shortratios_ls10,
                            "Weighted Short Ratios - Top/Bottom Decile Long/Short Strategy",
                            'shortratios_i_ls10.pdf')

# Days to cover, weighted average:
fig, ax = _plot_ls10_series(daystocovers_ls10,
                            "Weighted Days to Cover - Top/Bottom Decile Long/Short Strategy",
                            'daystocovers_i_ls10.pdf', ylim=(0, 3))

# Days to cover, max:
fig, ax = _plot_ls10_series(daystocovers_max_ls10,
                            "Maximum Days to Cover - Top/Bottom Decile Long/Short Strategy",
                            'daystocovers_max_i_ls10.pdf', ylim=(0, 100))

In [None]:
# Rank-weighted strategies
#
# Same four strategy-level series as for the decile strategies, computed for the
# rank-weighted strategy of every method in `fewmethods`, then plotted.

nanseries = np.full(T, np.nan)
trmratios_rank = dict(zip(fewmethods,[nanseries] * Mfew))
shortratios_rank = dict(zip(fewmethods,[nanseries] * Mfew))
daystocovers_rank = dict(zip(fewmethods,[nanseries] * Mfew))
daystocovers_max_rank = dict(zip(fewmethods,[nanseries] * Mfew))

for method in fewmethods:

    strategy = rankstrategy(method)

    trmratios_rank[method] = strategytrmratios(strategy)
    shortratios_rank[method] = strategyshortratios(strategy)
    daystocovers_rank[method] = strategydaystocovers(strategy)
    daystocovers_max_rank[method] = strategydaystocovers_max(strategy)

print("All done!")

# NOTE(review): `offsets` is never read in this cell; kept only in case other
# cells rely on the global -- confirm and delete.
offsets = [-15, +5, +15, 0, 0, 0]


def _plot_rank_series(series_by_method, title, filename, raw_alpha=0.1, ylim=None):
    """Plot one series per method (faint) together with its centered moving average.

    Parameters
    ----------
    series_by_method : dict
        Maps each entry of ``fewmethods`` to the series plotted against ``dtdates``.
    title : str
        Figure title.
    filename : str
        File name (relative to ``graphpath``) the figure is saved under.
    raw_alpha : float
        Opacity of the raw (unsmoothed) series.
    ylim : tuple or None
        ``(bottom, top)`` y-limits; ``None`` keeps matplotlib's autoscaling.

    Returns
    -------
    (fig, ax) as returned by ``basefig``.
    """
    fig, ax = basefig(10, 5)

    for i, method in enumerate(fewmethods):
        ax.plot(dtdates, series_by_method[method], color=fewmethodcolors[i], alpha=raw_alpha)
        ax.plot(dtdates, ma(series_by_method[method], centered=True), color=fewmethodcolors[i],
                label=fewmethodlabels[i] + " (mov. avg.)")

    # Zero reference line.
    ax.plot(dtdates, np.zeros(len(dtdates)), 'k-')

    # The notebook header runs `from datetime import datetime` after
    # `import datetime`, so `datetime.date(1980, 1, 1)` would call the instance
    # method datetime.datetime.date on an int and raise TypeError.
    # pd.Timestamp works whichever binding of `datetime` is live.
    plt.xlim(left=pd.Timestamp(1980, 1, 1))
    if ylim is not None:
        plt.ylim(*ylim)
    plt.legend(ncol=3, frameon=False)
    plt.title(title)
    plt.savefig(graphpath + filename, bbox_inches='tight')
    plt.show()

    return fig, ax


# Trading ratios:
fig, ax = _plot_rank_series(trmratios_rank,
                            "Weighted Trading Ratios - Rank-weighted Long/Short Strategy",
                            'trmratios_i_rank.pdf', raw_alpha=0.2)

# Short ratios:
fig, ax = _plot_rank_series(shortratios_rank,
                            "Weighted Short Ratios - Rank-weighted Long/Short Strategy",
                            'shortratios_i_rank.pdf')

# Days to cover, weighted average:
# (title previously said "Top/Bottom Decile" although this is the rank-weighted strategy)
fig, ax = _plot_rank_series(daystocovers_rank,
                            "Weighted Days to Cover - Rank-weighted Long/Short Strategy",
                            'daystocovers_i_rank.pdf', ylim=(0, 3))

# Days to cover, max:
fig, ax = _plot_rank_series(daystocovers_max_rank,
                            "Maximum Days to Cover - Rank-weighted Long/Short Strategy",
                            'daystocovers_max_i_rank.pdf', ylim=(0, 200))

### Robustness test: using the non-interpolated data

In [None]:
def strategytrmratios(strategy):
    """Return the strategy-weighted sum of (non-interpolated) trading ratios per row.

    Robustness variant: uses the module-level ``trmratios`` rather than
    ``trmratios_i``. NaN products are treated as zero by ``np.nansum``.
    """
    weighted = strategy.values * trmratios.values
    return np.nansum(weighted, axis=1)

def strategyshortratios(strategy):
    """Return the strategy-weighted sum of (non-interpolated) short ratios per row.

    Robustness variant: uses the module-level ``shortratios`` rather than
    ``shortratios_i``. NaN products are treated as zero by ``np.nansum``.
    """
    weighted = strategy.values * shortratios.values
    return np.nansum(weighted, axis=1)

In [None]:
# Top/bottom decile strategies (robustness: non-interpolated short ratios)

nanseries = np.full(T, np.nan)
# NOTE(review): trmratios_ls10 is reset to NaN here but not recomputed in this
# cell, so it overwrites the earlier results -- confirm this is intended.
trmratios_ls10 = dict(zip(fewmethods,[nanseries] * Mfew))
shortratios_ls10 = dict(zip(fewmethods,[nanseries] * Mfew))

for method in fewmethods:

    strategy = longshortstrategy(method, 10)

    shortratios_ls10[method] = strategyshortratios(strategy)

print("All done!")

# Short ratios:

fig, ax = basefig(10,5)
# NOTE(review): `offsets` is never read in this cell.
offsets = [-15, +5, +15, 0, 0, 0]

for (i,method) in enumerate(fewmethods):

    # Raw series (faint) plus its centered moving average.
    ax.plot(dtdates, shortratios_ls10[method], color=fewmethodcolors[i], alpha=0.1)
    ax.plot(dtdates, ma(shortratios_ls10[method], centered=True), color=fewmethodcolors[i], label=fewmethodlabels[i]+" (mov. avg.)")

# Zero reference line.
ax.plot(dtdates, np.zeros(len(dtdates)), 'k-')

# The notebook header runs `from datetime import datetime` after `import datetime`,
# so `datetime.date(1980, 1, 1)` would call the instance method
# datetime.datetime.date on an int and raise TypeError. pd.Timestamp works
# whichever binding of `datetime` is live.
plt.xlim(left=pd.Timestamp(1980, 1, 1))
plt.legend(ncol=3, frameon=False)
plt.title("Value-weighted Short Ratios - Top/Bottom Decile Long/Short Strategy")
plt.savefig(graphpath+'shortratios_ls10.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Rank-weighted strategies (robustness: non-interpolated short ratios)

nanseries = np.full(T, np.nan)
# NOTE(review): trmratios_rank is reset to NaN here but not recomputed in this
# cell, so it overwrites the earlier results -- confirm this is intended.
trmratios_rank = dict(zip(fewmethods,[nanseries] * Mfew))
shortratios_rank = dict(zip(fewmethods,[nanseries] * Mfew))

for method in fewmethods:

    strategy = rankstrategy(method)

    shortratios_rank[method] = strategyshortratios(strategy)

print("All done!")

# Short ratios:

fig, ax = basefig(10,5)
# NOTE(review): `offsets` is never read in this cell.
offsets = [-15, +5, +15, 0, 0, 0]

for (i,method) in enumerate(fewmethods):

    # Raw series (faint) plus its centered moving average.
    ax.plot(dtdates, shortratios_rank[method], color=fewmethodcolors[i], alpha=0.1)
    ax.plot(dtdates, ma(shortratios_rank[method], centered=True), color=fewmethodcolors[i], label=fewmethodlabels[i]+" (mov. avg.)")

# Zero reference line.
ax.plot(dtdates, np.zeros(len(dtdates)), 'k-')

# The notebook header runs `from datetime import datetime` after `import datetime`,
# so `datetime.date(1980, 1, 1)` would call the instance method
# datetime.datetime.date on an int and raise TypeError. pd.Timestamp works
# whichever binding of `datetime` is live.
plt.xlim(left=pd.Timestamp(1980, 1, 1))
plt.legend(ncol=3, frameon=False)
plt.title("Value-weighted Short Ratios - Rank-weighted Long/Short Strategy")
plt.savefig(graphpath+'shortratios_rank.pdf', bbox_inches='tight')
plt.show()

## Trading Activity and Short Interest around Machine Learning Predictions

### Trading Activity

In [None]:
def trmratiovsprediction_tsanalysis(method, methodtitlelabel):
    """Plot year-by-year relations between trading ratios and predicted returns.

    For every distinct year in ``dates_years`` the function pools all
    (date, column) cells of that year where both the trading ratio and the
    prediction are non-NaN, and computes:

    * a linear regression with x = trading ratio, y = predicted return;
    * the same regression on ``intopercentile``-transformed values;
    * the difference in mean trading ratio between the top and bottom of 4
      equal-width bins of the prediction percentile;
    * the same difference for 10 bins (deciles).

    Years without any usable observation are dropped from the plots. The
    2x2 figure is saved to ``graphpath`` and shown.

    Parameters
    ----------
    method : str
        Key into the module-level ``predictions`` dict; also used in the file name.
    methodtitlelabel : str
        Human-readable method name appended to the figure title.
    """

    def extreme_bin_gap(x, y, nbins):
        """Return (mean of y in the top x-bin minus the bottom x-bin, its standard error)."""
        means, _, _ = sp.stats.binned_statistic(x, y, 'mean', bins=nbins)
        stds, _, _ = sp.stats.binned_statistic(x, y, 'std', bins=nbins)
        counts, _, _ = sp.stats.binned_statistic(x, y, 'count', bins=nbins)
        gap = means[-1] - means[0]
        gap_err = np.sqrt(((stds[-1]**2)/counts[-1])+((stds[0]**2)/counts[0]))
        return gap, gap_err

    slopes, std_errs = [], []
    slopes_pct, std_errs_pct = [], []
    diffs_q1q4, std_errs_q1q4 = [], []
    diffs_d1d10, std_errs_d1d10 = [], []

    years = np.unique(dates_years)
    yearsmask = np.ones(len(years), dtype=bool)

    for year in years:

        chosendates = dates[np.isin(dates_years, year)]

        prediction = predictions[method].loc[chosendates].to_numpy().flatten()
        trmratio = trmratios.loc[chosendates].to_numpy().flatten()

        mask = ~np.isnan(trmratio) & ~np.isnan(prediction)

        if not mask.any():
            # Nothing to estimate: exclude the year from the x-axis instead of
            # appending placeholders, so results stay aligned with years[yearsmask].
            yearsmask[years == year] = False
            continue

        # Percentile transforms are computed once per year and reused below.
        prediction_pct = intopercentile(prediction[mask])
        trmratio_pct = intopercentile(trmratio[mask])

        slope, _, _, _, std_err = sp.stats.linregress(trmratio[mask], prediction[mask])
        slopes.append(slope)
        std_errs.append(std_err)

        slope_pct, _, _, _, std_err_pct = sp.stats.linregress(trmratio_pct, prediction_pct)
        slopes_pct.append(slope_pct)
        std_errs_pct.append(std_err_pct)

        diff_q1q4, std_err_q1q4 = extreme_bin_gap(prediction_pct, trmratio[mask], 4)
        diffs_q1q4.append(diff_q1q4)
        std_errs_q1q4.append(std_err_q1q4)

        # Deciles need 10 bins (this previously used bins=3 while being labelled
        # "10th Decile - 1st Decile").
        diff_d1d10, std_err_d1d10 = extreme_bin_gap(prediction_pct, trmratio[mask], 10)
        diffs_d1d10.append(diff_d1d10)
        std_errs_d1d10.append(std_err_d1d10)

    plotyears = years[yearsmask]

    fig, ax = plt.subplots(2, 2, figsize=(10*1.66,10))
    ax = ax.flatten()

    for axis in ax:
        axis.spines['right'].set_visible(False)
        axis.spines['top'].set_visible(False)
        # Zero reference line on every panel.
        axis.plot(plotyears, np.zeros_like(plotyears), 'k')

    # Error bars are 1.96 standard errors (95% confidence interval).
    ax[0].errorbar(plotyears, slopes, yerr=1.96*np.array(std_errs), fmt='C0o')
    ax[0].set_ylabel("Regression Coefficient")
    ax[0].set_title("Linear Regression: Trading Ratio vs Predicted Return")

    ax[1].errorbar(plotyears, slopes_pct, yerr=1.96*np.array(std_errs_pct), fmt='C2o')
    ax[1].set_ylabel("Regression Coefficient")
    ax[1].set_title("Percentile Regression: Trading Ratio vs Predicted Return")

    ax[2].errorbar(plotyears, diffs_q1q4, yerr=1.96*np.array(std_errs_q1q4), fmt='C4o')
    ax[2].set_ylim([-0.045, 0.075])
    ax[2].set_ylabel("4th Quantile Mean - 1st Quantile Mean")
    ax[2].set_title("Difference of Mean Trading Ratio \n between 4th and 1st Predicted Return Quantile", pad=-20)

    ax[3].errorbar(plotyears, diffs_d1d10, yerr=1.96*np.array(std_errs_d1d10), fmt='C6o')
    ax[3].set_ylim([-0.045, 0.075])
    ax[3].set_ylabel("10th Decile Mean - 1st Decile Mean")
    ax[3].set_title("Difference of Mean Trading Ratio \n between 10th and 1st Predicted Return Decile", pad=-10)

    fig.tight_layout(pad=2.0)
    fig.suptitle("Trading Activity and Machine Learning Predictions - Time Series Analysis - " + methodtitlelabel, size=18, y=1.02)
    plt.savefig(graphpath+'trmratiovsprediction_'+method+'_tsanalysis.png', dpi=300, bbox_inches='tight')
    plt.show()

trmratiovsprediction_tsanalysis('forest', "Random Forest")
trmratiovsprediction_tsanalysis('nn3', "3-Layer Neural Network")

### Short Interest

In [None]:
# Example for one intuitive year: 2015
# Scatter of short ratio against the 'nn3' prediction, in levels and in percentiles,
# pooling every (date, column) cell of 2015 where both values are available.

chosendates = dates[np.isin(dates_years, 2015)]
print(chosendates)

method = 'nn3'

# Flatten the selected rows of each panel into one long vector.
prediction_pctile = predictions_pctile['nn3'].loc[chosendates].to_numpy().flatten()
prediction = predictions['nn3'].loc[chosendates].to_numpy().flatten()
print("prediction.shape", prediction.shape)
print("predictions non-nans", np.sum(~np.isnan(prediction)))

shortratio_pctile = shortratios_pctile.loc[chosendates].to_numpy().flatten()
shortratio = shortratios.loc[chosendates].to_numpy().flatten()
print("shortratio.shape", shortratio.shape)
print("shortratio non-nans",  np.sum(~np.isnan(shortratio)))

# Keep only cells where neither value is NaN.
mask = ~(np.isnan(shortratio) | np.isnan(prediction))

print("points:",  np.sum(mask))


fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12*1.33,12/2))

for axis in ax:
    axis.spines['top'].set_visible(False)
    axis.spines['right'].set_visible(False)

# Left panel: levels (log y-axis).
ax[0].plot(prediction[mask], shortratio[mask], 'ko', alpha=0.03)
ax[0].set(xlabel='Monthly Predicted Return', ylabel='Monthly Short Ratio')
ax[0].set_yscale('log')
ax[0].set_xlim([-0.5, 0.5])

# Right panel: percentiles.
ax[1].plot(prediction_pctile[mask], shortratio_pctile[mask], 'ko', alpha=0.01)
ax[1].set(xlabel='Monthly Predicted Return, Percentile', ylabel='Monthly Short Ratio, Percentile')

plt.savefig(graphpath+'shortratiovsprediction_nn3_2015.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
def shortratiovsprediction_1995to2015(method, methodtitlelabel):
    """Draw a 2x2 figure of short ratio against prediction percentile by sub-period.

    Each panel pools the years of one sub-period (1995-1999, 2000-2004,
    2005-2009, 2010-2015). It shows a random sample of at most 50,000 points,
    the overall mean short ratio, and the mean with a 1.96-standard-error bar
    in each of 20 equal-width prediction-percentile bins. The figure is saved
    to ``graphpath`` and shown.

    ``method`` is the key into ``predictions_pctile`` (also used in the file
    name); ``methodtitlelabel`` is the figure's super-title.
    """

    def draw_period(chosenyears, ax, alphaparam=0.01, title=""):
        perioddates = dates[np.isin(dates_years, chosenyears)]

        pred_pct = predictions_pctile[method].loc[perioddates].to_numpy().flatten()
        short = shortratios.loc[perioddates].to_numpy().flatten()

        valid = ~(np.isnan(short) | np.isnan(pred_pct))

        # We only draw 50.000 randomly chosen points on each graph
        pick = np.full(np.count_nonzero(valid), False)
        pick[:5*10**4] = True
        np.random.shuffle(pick)
        drawn = valid.copy()
        drawn[valid] = pick

        x_valid = pred_pct[valid]
        y_valid = short[valid]

        # 20 equal-width bins spanning the observed percentile range.
        binedges = np.linspace(np.min(x_valid), np.max(x_valid), 21)
        bincenters = (binedges[:-1] + binedges[1:]) / 2

        binmeans = sp.stats.binned_statistic(x_valid, y_valid, 'mean', bins=binedges)[0]
        binstds = sp.stats.binned_statistic(x_valid, y_valid, 'std', bins=binedges)[0]
        bincounts = sp.stats.binned_statistic(x_valid, y_valid, 'count', bins=binedges)[0]

        halfwidths = 1.96 * binstds / np.sqrt(bincounts)

        x_drawn = pred_pct[drawn]
        ax.plot(x_drawn, short[drawn], 'ko', alpha=alphaparam)
        ax.plot(x_drawn, np.full_like(x_drawn, np.nanmean(short)), 'k-', label='Overall average')
        ax.errorbar(bincenters, binmeans, yerr=halfwidths, fmt='C0o', label='Vigintile avg. (95% CI)')
        ax.set_xlabel('Monthly Predicted Return, Percentile')
        ax.set_ylabel('Monthly Short Ratio')
        ax.set_yscale('log')
        ax.set_ylim([0.001, 1])
        ax.set_title(title)
        ax.legend(loc='upper left', frameon=False)

    fig, ax = plt.subplots(2, 2, figsize=(10*1.5,10))
    ax = ax.flatten()

    for axis in ax:
        axis.spines['top'].set_visible(False)
        axis.spines['right'].set_visible(False)

    # One panel per sub-period, in chronological order.
    periods = [(1995, 1999), (2000, 2004), (2005, 2009), (2010, 2015)]
    for panel, (firstyear, lastyear) in zip(ax, periods):
        draw_period(list(range(firstyear, lastyear + 1)), panel,
                    alphaparam=0.002, title=str(firstyear) + "-" + str(lastyear))

    fig.tight_layout(pad=2.0)
    fig.suptitle(methodtitlelabel, size=20, y=1.02)

    plt.savefig(graphpath+'shortratiovsprediction_'+method+'_1995to2015.png', dpi=300, bbox_inches='tight')
    plt.show()

shortratiovsprediction_1995to2015('nn3', "3-Layer Neural Network")
shortratiovsprediction_1995to2015('forest', "Random Forest")

shortratiovsprediction_1995to2015('ols', "Ordinary Least Squares")
shortratiovsprediction_1995to2015('tree', "Regression Tree")
shortratiovsprediction_1995to2015('nn1', "1-Layer Neural Network")
shortratiovsprediction_1995to2015('nn2', "2-Layer Neural Network")
    

In [None]:
def shortratiovsprediction_tsanalysis(method, methodtitlelabel):
    """Plot year-by-year relations between short ratios and predicted returns.

    For every distinct year in ``dates_years`` the function pools all
    (date, column) cells of that year where both the short ratio and the
    prediction are non-NaN, and computes:

    * a linear regression with x = short ratio, y = predicted return;
    * the same regression on the percentile versions of both variables;
    * the difference in mean short ratio between the top and bottom of 4
      equal-width bins of the prediction percentile;
    * the same difference for 10 bins (deciles).

    Years without any usable observation are dropped from the plots. The
    2x2 figure is saved to ``graphpath`` and shown.

    Parameters
    ----------
    method : str
        Key into ``predictions`` / ``predictions_pctile``; also used in the file name.
    methodtitlelabel : str
        Figure super-title.
    """

    def extreme_bin_gap(x, y, nbins):
        """Return (mean of y in the top x-bin minus the bottom x-bin, its standard error)."""
        means, _, _ = sp.stats.binned_statistic(x, y, 'mean', bins=nbins)
        stds, _, _ = sp.stats.binned_statistic(x, y, 'std', bins=nbins)
        counts, _, _ = sp.stats.binned_statistic(x, y, 'count', bins=nbins)
        gap = means[-1] - means[0]
        gap_err = np.sqrt(((stds[-1]**2)/counts[-1])+((stds[0]**2)/counts[0]))
        return gap, gap_err

    slopes, std_errs = [], []
    slopes_pct, std_errs_pct = [], []
    diffs_q1q4, std_errs_q1q4 = [], []
    diffs_d1d10, std_errs_d1d10 = [], []

    years = np.unique(dates_years)
    yearsmask = np.ones(len(years), dtype=bool)

    for year in years:

        chosendates = dates[np.isin(dates_years, year)]

        prediction = predictions[method].loc[chosendates].to_numpy().flatten()
        prediction_pctile = predictions_pctile[method].loc[chosendates].to_numpy().flatten()

        shortratio = shortratios.loc[chosendates].to_numpy().flatten()
        shortratio_pctile = shortratios_pctile.loc[chosendates].to_numpy().flatten()

        # NOTE(review): the mask is built from the level series only; this assumes
        # the percentile series are NaN in exactly the same cells -- confirm.
        mask = ~np.isnan(shortratio) & ~np.isnan(prediction)

        if not mask.any():
            # Nothing to estimate: exclude the year so results stay aligned
            # with years[yearsmask].
            yearsmask[years == year] = False
            continue

        slope, _, _, _, std_err = sp.stats.linregress(shortratio[mask], prediction[mask])
        slopes.append(slope)
        std_errs.append(std_err)

        slope_pct, _, _, _, std_err_pct = sp.stats.linregress(shortratio_pctile[mask], prediction_pctile[mask])
        slopes_pct.append(slope_pct)
        std_errs_pct.append(std_err_pct)

        diff_q1q4, std_err_q1q4 = extreme_bin_gap(prediction_pctile[mask], shortratio[mask], 4)
        diffs_q1q4.append(diff_q1q4)
        std_errs_q1q4.append(std_err_q1q4)

        diff_d1d10, std_err_d1d10 = extreme_bin_gap(prediction_pctile[mask], shortratio[mask], 10)
        diffs_d1d10.append(diff_d1d10)
        std_errs_d1d10.append(std_err_d1d10)

    plotyears = years[yearsmask]

    fig, ax = plt.subplots(2, 2, figsize=(10*1.5,10))
    ax = ax.flatten()

    for axis in ax:
        axis.spines['right'].set_visible(False)
        axis.spines['top'].set_visible(False)
        # Zero reference line on every panel.
        axis.plot(plotyears, np.zeros_like(plotyears), 'k')

    # Error bars are 1.96 standard errors (95% confidence interval).
    ax[0].errorbar(plotyears, slopes, yerr=1.96*np.array(std_errs), fmt='C0o')
    ax[0].set_ylabel("Regression Coefficient")
    ax[0].set_title("Linear Regression: Short Ratio vs Predicted Return")

    ax[1].errorbar(plotyears, slopes_pct, yerr=1.96*np.array(std_errs_pct), fmt='C2o')
    ax[1].set_ylabel("Regression Coefficient")
    ax[1].set_title("Percentile Regression: Short Ratio vs Predicted Return")

    ax[2].errorbar(plotyears, diffs_q1q4, yerr=1.96*np.array(std_errs_q1q4), fmt='C4o')
    # The limits were previously applied to ax[3] by mistake, leaving this panel
    # on a different scale from its decile counterpart.
    ax[2].set_ylim([-0.045, 0.075])
    ax[2].set_ylabel("4th Quantile Mean - 1st Quantile Mean")
    ax[2].set_title("Difference of Mean Short Ratio \n between 4th and 1st Predicted Return Quantile", pad=-20)

    ax[3].errorbar(plotyears, diffs_d1d10, yerr=1.96*np.array(std_errs_d1d10), fmt='C6o')
    ax[3].set_ylim([-0.045, 0.075])
    ax[3].set_ylabel("10th Decile Mean - 1st Decile Mean")
    ax[3].set_title("Difference of Mean Short Ratio \n between 10th and 1st Predicted Return Decile", pad=-10)

    fig.tight_layout(pad=2.0)
    fig.suptitle(methodtitlelabel, size=20, y=1.02)
    plt.savefig(graphpath+'shortratiovsprediction_'+method+'_tsanalysis.png', dpi=300, bbox_inches='tight')
    plt.show()

shortratiovsprediction_tsanalysis('forest', "Random Forest")
shortratiovsprediction_tsanalysis('nn3', "3-Layer Neural Network")

shortratiovsprediction_tsanalysis('ols', "Ordinary Least Squares")
shortratiovsprediction_tsanalysis('tree', "Regression Tree")
shortratiovsprediction_tsanalysis('nn1', "1-Layer Neural Network")
shortratiovsprediction_tsanalysis('nn2', "2-Layer Neural Network")

### Days to Cover

In [None]:
def daystocovervsprediction_1995to2015(method, methodtitlelabel):
    """Draw a 2x2 figure of days to cover against prediction percentile by sub-period.

    Each panel pools the years of one sub-period (1995-1999, 2000-2004,
    2005-2009, 2010-2015). It shows a random sample of at most 50,000 points,
    the overall mean days to cover, and the mean with a 1.96-standard-error
    bar in each of 20 equal-width prediction-percentile bins. The figure is
    saved to ``graphpath`` and shown.

    Parameters
    ----------
    method : str
        Key into ``predictions_pctile``; also used in the file name.
    methodtitlelabel : str
        Figure super-title.
    """

    def daystocovervsreturnpercentilewithbinsplot(chosenyears, ax, alphaparam=0.01, title=""):
        """Draw one sub-period panel on ``ax`` for the years in ``chosenyears``."""

        chosendates = dates[np.isin(dates_years, chosenyears)]

        prediction = predictions_pctile[method].loc[chosendates].to_numpy().flatten()
        daystocover = daystocovers.loc[chosendates].to_numpy().flatten()

        nanmask = ~np.isnan(daystocover) & ~np.isnan(prediction)

        # We only draw 50.000 randomly chosen points on each graph
        nanrandommask = np.full(np.count_nonzero(nanmask), False)
        nanrandommask[:5*10**4] = True
        np.random.shuffle(nanrandommask)
        randommask = nanmask.copy()
        randommask[nanmask] = nanrandommask

        # 20 equal-width bins spanning the observed percentile range.
        binedges = np.linspace(np.min(prediction[nanmask]), np.max(prediction[nanmask]), 21)
        binpts = (binedges[1:] + binedges[:-1]) / 2

        means, _, _ = sp.stats.binned_statistic(prediction[nanmask], daystocover[nanmask], 'mean', bins=binedges)
        stds, _, _ = sp.stats.binned_statistic(prediction[nanmask], daystocover[nanmask], 'std', bins=binedges)
        counts, _, _ = sp.stats.binned_statistic(prediction[nanmask], daystocover[nanmask], 'count', bins=binedges)

        errs = 1.96 * stds / np.sqrt(counts)

        ax.plot(prediction[randommask], daystocover[randommask], 'ko', alpha=alphaparam)
        ax.plot(prediction[randommask], np.full_like(prediction[randommask], np.nanmean(daystocover)), 'k-', label='Overall average')
        ax.errorbar(binpts, means, yerr=errs, fmt='C0o', label='Vigintile avg. (95% CI)')
        ax.set_xlabel('Monthly Predicted Return, Percentile')
        # The y-axis shows days to cover; the label was a leftover from the
        # short-ratio version of this plot.
        ax.set_ylabel('Days to Cover')
        ax.set_yscale('log')
        ax.set_ylim([0.05, 5])
        ax.set_title(title)
        ax.legend(loc='upper left', frameon=False)

    fig, ax = plt.subplots(2, 2, figsize=(10*1.5,10))
    ax = ax.flatten()

    for axis in ax:
        axis.spines['right'].set_visible(False)
        axis.spines['top'].set_visible(False)


    daystocovervsreturnpercentilewithbinsplot([1995, 1996, 1997, 1998, 1999], ax[0], alphaparam=0.002, title="1995-1999")

    daystocovervsreturnpercentilewithbinsplot([2000, 2001, 2002, 2003, 2004], ax[1], alphaparam=0.002, title="2000-2004")

    daystocovervsreturnpercentilewithbinsplot([2005, 2006, 2007, 2008, 2009], ax[2], alphaparam=0.002, title="2005-2009")

    daystocovervsreturnpercentilewithbinsplot([2010, 2011, 2012, 2013, 2014, 2015], ax[3], alphaparam=0.002, title="2010-2015")

    fig.tight_layout(pad=2.0)
    fig.suptitle(methodtitlelabel, size=20, y=1.02)

    plt.savefig(graphpath+'daystocovervsprediction_'+method+'_1995to2015.png', dpi=300, bbox_inches='tight')
    plt.show()

daystocovervsprediction_1995to2015('nn3', "3-Layer Neural Network")
daystocovervsprediction_1995to2015('forest', "Random Forest")
daystocovervsprediction_1995to2015('ols', "Ordinary Least Squares")
    

In [None]:
def daystocovervsprediction_tsanalysis(method, methodtitlelabel):
    """Plot year-by-year relations between days to cover and predicted returns.

    For every distinct year in ``dates_years`` the function pools all
    (date, column) cells of that year where both days to cover and the
    prediction are non-NaN, and computes:

    * a linear regression with x = days to cover, y = predicted return;
    * the same regression on the percentile versions of both variables;
    * the difference in mean days to cover between the top and bottom of 4
      equal-width bins of the prediction percentile;
    * the same difference for 10 bins (deciles).

    Years without any usable observation are dropped from the plots. The
    2x2 figure is saved to ``graphpath`` and shown.

    Parameters
    ----------
    method : str
        Key into ``predictions`` / ``predictions_pctile``; also used in the file name.
    methodtitlelabel : str
        Figure super-title.
    """

    def extreme_bin_gap(x, y, nbins):
        """Return (mean of y in the top x-bin minus the bottom x-bin, its standard error)."""
        means, _, _ = sp.stats.binned_statistic(x, y, 'mean', bins=nbins)
        stds, _, _ = sp.stats.binned_statistic(x, y, 'std', bins=nbins)
        counts, _, _ = sp.stats.binned_statistic(x, y, 'count', bins=nbins)
        gap = means[-1] - means[0]
        gap_err = np.sqrt(((stds[-1]**2)/counts[-1])+((stds[0]**2)/counts[0]))
        return gap, gap_err

    slopes, std_errs = [], []
    slopes_pct, std_errs_pct = [], []
    diffs_q1q4, std_errs_q1q4 = [], []
    diffs_d1d10, std_errs_d1d10 = [], []

    years = np.unique(dates_years)
    yearsmask = np.ones(len(years), dtype=bool)

    for year in years:

        chosendates = dates[np.isin(dates_years, year)]

        prediction = predictions[method].loc[chosendates].to_numpy().flatten()
        prediction_pctile = predictions_pctile[method].loc[chosendates].to_numpy().flatten()

        daystocover = daystocovers.loc[chosendates].to_numpy().flatten()
        daystocover_pctile = daystocovers_pctile.loc[chosendates].to_numpy().flatten()

        # NOTE(review): the mask is built from the level series only; this assumes
        # the percentile series are NaN in exactly the same cells -- confirm.
        mask = ~np.isnan(daystocover) & ~np.isnan(prediction)

        if not mask.any():
            # Nothing to estimate: exclude the year so results stay aligned
            # with years[yearsmask].
            yearsmask[years == year] = False
            continue

        slope, _, _, _, std_err = sp.stats.linregress(daystocover[mask], prediction[mask])
        slopes.append(slope)
        std_errs.append(std_err)

        slope_pct, _, _, _, std_err_pct = sp.stats.linregress(daystocover_pctile[mask], prediction_pctile[mask])
        slopes_pct.append(slope_pct)
        std_errs_pct.append(std_err_pct)

        # Bin on the prediction *percentile*, as the short-ratio analysis does.
        # Equal-width bins of the raw prediction are not quantile groups, which
        # is what the panel labels promise.
        diff_q1q4, std_err_q1q4 = extreme_bin_gap(prediction_pctile[mask], daystocover[mask], 4)
        diffs_q1q4.append(diff_q1q4)
        std_errs_q1q4.append(std_err_q1q4)

        diff_d1d10, std_err_d1d10 = extreme_bin_gap(prediction_pctile[mask], daystocover[mask], 10)
        diffs_d1d10.append(diff_d1d10)
        std_errs_d1d10.append(std_err_d1d10)

    plotyears = years[yearsmask]

    fig, ax = plt.subplots(2, 2, figsize=(10*1.5,10))
    ax = ax.flatten()

    for axis in ax:
        axis.spines['right'].set_visible(False)
        axis.spines['top'].set_visible(False)
        # Zero reference line on every panel.
        axis.plot(plotyears, np.zeros_like(plotyears), 'k')

    # Error bars are 1.96 standard errors (95% confidence interval).
    ax[0].errorbar(plotyears, slopes, yerr=1.96*np.array(std_errs), fmt='C0o')
    ax[0].set_ylabel("Regression Coefficient")
    ax[0].set_title("Linear Regression: Days to Cover vs Predicted Return")

    ax[1].errorbar(plotyears, slopes_pct, yerr=1.96*np.array(std_errs_pct), fmt='C2o')
    ax[1].set_ylabel("Regression Coefficient")
    ax[1].set_title("Percentile Regression: Days to Cover vs Predicted Return")

    ax[2].errorbar(plotyears, diffs_q1q4, yerr=1.96*np.array(std_errs_q1q4), fmt='C4o')
    ax[2].set_ylim([-1, 1])
    ax[2].set_ylabel("4th Quantile Mean - 1st Quantile Mean")
    ax[2].set_title("Difference of Mean Days to Cover \n between 4th and 1st Predicted Return Quantile", pad=-20)

    ax[3].errorbar(plotyears, diffs_d1d10, yerr=1.96*np.array(std_errs_d1d10), fmt='C6o')
    ax[3].set_ylim([-1, 1])
    ax[3].set_ylabel("10th Decile Mean - 1st Decile Mean")
    ax[3].set_title("Difference of Mean Days to Cover \n between 10th and 1st Predicted Return Decile", pad=-10)

    fig.tight_layout(pad=2.0)
    fig.suptitle(methodtitlelabel, size=20, y=1.02)
    plt.savefig(graphpath+'daystocovervsprediction_'+method+'_tsanalysis.png', dpi=300, bbox_inches='tight')
    plt.show()

daystocovervsprediction_tsanalysis('forest', "Random Forest")
daystocovervsprediction_tsanalysis('nn3', "3-Layer Neural Network")
daystocovervsprediction_tsanalysis('ols', "Ordinary Least Squares")

### Short Interest by Market Cap

In [None]:

# Per-row 95th and 75th percentiles of market cap, ignoring NaNs.
mktcap_values = mktcaps.values
mktcaplims_p95 = np.nanpercentile(mktcap_values, 95, axis=1)
mktcaplims_p75 = np.nanpercentile(mktcap_values, 75, axis=1)

# Size groups from the market-cap percentile panel:
# big >= 95, mid in [75, 95), small < 75.
mask_smallcap = mktcaps_pctile < 75
mask_midcap = (mktcaps_pctile >= 75) & (mktcaps_pctile < 95)
mask_bigcap = mktcaps_pctile >= 95

# Quick look at both thresholds on a log scale.
plt.plot(mktcaplims_p95)
plt.plot(mktcaplims_p75)
plt.yscale('log')
plt.show()

# Thresholds in the last row.
print(mktcaplims_p95[-1])
print(mktcaplims_p75[-1])

In [None]:
def shortratiovsprediction_1995to2015(method, methodtitlelabel):
    """Plot short ratio against predicted-return percentile, split by size bucket.

    Draws a 2x2 grid of panels (1995-1999, 2000-2004, 2005-2009, 2010-2015).
    Each panel shows a random subsample of the raw observations, the overall
    mean short ratio, and per-bin means with 95% confidence intervals for
    small-, mid- and big-cap observations.  The figure is written to
    ``graphpath`` and displayed.

    Parameters
    ----------
    method : str
        Key into ``predictions_pctile``; also embedded in the output file name.
    methodtitlelabel : str
        Text used as the figure super-title.
    """

    def shortvsreturnpercentilewithbinsplot(chosenyears, ax, alphaparam=0.01, title=""):
        """Draw one panel on ``ax`` from the dates whose year is in ``chosenyears``."""

        perioddates = dates[np.isin(dates_years, chosenyears)]

        pred = predictions_pctile[method].loc[perioddates].to_numpy().flatten()
        cappct = mktcaps_pctile.loc[perioddates].to_numpy().flatten()
        ratio = shortratios.loc[perioddates].to_numpy().flatten()

        # Usable observations: both the short ratio and the prediction are known.
        valid = ~np.isnan(ratio) & ~np.isnan(pred)

        # Size buckets on the market-cap percentile: >=95, [50, 95), <50.
        is_big = cappct >= 95
        is_mid = (cappct < 95) & (cappct >= 50)
        is_small = cappct < 50

        # Scatter at most 50,000 randomly chosen valid points per panel.
        keep = np.full(np.count_nonzero(valid), False)
        keep[:5*10**4] = True
        np.random.shuffle(keep)
        scattered = valid.copy()
        scattered[valid] = keep

        # 20 equal-width bins spanning the observed prediction range.
        edges = np.linspace(np.min(pred[valid]), np.max(pred[valid]), 21)
        centers = (edges[1:] + edges[:-1]) / 2

        def binned_mean_and_ci(bucket):
            """Return per-bin mean short ratio and 95% CI half-width for one bucket."""
            chosen = valid & bucket
            x, y = pred[chosen], ratio[chosen]
            mean, _, _ = sp.stats.binned_statistic(x, y, 'mean', bins=edges)
            std, _, _ = sp.stats.binned_statistic(x, y, 'std', bins=edges)
            count, _, _ = sp.stats.binned_statistic(x, y, 'count', bins=edges)
            return mean, 1.96 * std / np.sqrt(count)

        means_big, errs_big = binned_mean_and_ci(is_big)
        means_mid, errs_mid = binned_mean_and_ci(is_mid)
        means_small, errs_small = binned_mean_and_ci(is_small)

        xs = pred[scattered]
        ax.plot(xs, ratio[scattered], 'ko', alpha=alphaparam)
        ax.plot(xs, np.full_like(xs, np.nanmean(ratio)), 'k-', label='Overall average')
        ax.errorbar(centers, means_small, yerr=errs_small, fmt='C6o', label='Small cap - Vigintile avg. (95% CI)')
        ax.errorbar(centers, means_mid, yerr=errs_mid, fmt='C2o', label='Mid cap - Vigintile avg. (95% CI)')
        ax.errorbar(centers, means_big, yerr=errs_big, fmt='C4o', label='Big cap - Vigintile avg. (95% CI)')
        ax.set_xlabel('Monthly Predicted Return, Percentile')
        ax.set_ylabel('Monthly Short Ratio')
        ax.set_yscale('log')
        ax.set_ylim([0.005, 5])
        ax.set_title(title)
        ax.legend(loc='upper left', frameon=False)

    fig, ax = plt.subplots(2, 2, figsize=(10*1.5,10))
    ax = ax.flatten()

    for axis in ax:
        for side in ('right', 'top'):
            axis.spines[side].set_visible(False)

    # One panel per sub-period, in reading order.
    periods = [
        ("1995-1999", list(range(1995, 2000))),
        ("2000-2004", list(range(2000, 2005))),
        ("2005-2009", list(range(2005, 2010))),
        ("2010-2015", list(range(2010, 2016))),
    ]
    for axis, (periodtitle, periodyears) in zip(ax, periods):
        shortvsreturnpercentilewithbinsplot(periodyears, axis, alphaparam=0.002, title=periodtitle)

    fig.tight_layout(pad=2.0)
    fig.suptitle(methodtitlelabel, size=20, y=1.02)

    plt.savefig(graphpath+'shortratiovsprediction_'+method+'_1995to2015_bycap.png', dpi=300, bbox_inches='tight')
    plt.show()
    
# Render the by-cap cross-section figure for every model, in this order.
for _method, _title in [
    ('nn3', "3-Layer Neural Network"),
    ('forest', "Random Forest"),
    ('ols', "Ordinary Least Squares"),
    ('tree', "Regression Tree"),
    ('nn1', "1-Layer Neural Network"),
    ('nn2', "2-Layer Neural Network"),
]:
    shortratiovsprediction_1995to2015(_method, _title)
    

In [None]:
def shortratiovsprediction_tsanalysis(method, methodtitlelabel):
    """Plot yearly regression slopes of prediction on short ratio, by size bucket.

    For every year in ``dates_years`` the predicted-return percentile is
    regressed on the short-ratio percentile separately for big-, mid- and
    small-cap observations.  Years in which any bucket has no valid
    observation are dropped.  The slopes are drawn with 1.96 standard-error
    bars in three side-by-side panels; the figure is written to ``graphpath``
    and displayed.

    Parameters
    ----------
    method : str
        Key into ``predictions_pctile``; also embedded in the output file name.
    methodtitlelabel : str
        Text used as the figure super-title.
    """
    buckets = ('big', 'mid', 'small')
    slopes = {bucket: [] for bucket in buckets}
    std_errs = {bucket: [] for bucket in buckets}

    years = np.unique(dates_years)
    yearsmask = np.full_like(years, True).astype('bool')

    for pos, year in enumerate(years):

        yeardates = dates[np.isin(dates_years, year)]

        pred = predictions_pctile[method].loc[yeardates].to_numpy().flatten()
        cappct = mktcaps_pctile.loc[yeardates].to_numpy().flatten()
        ratio = shortratios_pctile.loc[yeardates].to_numpy().flatten()

        valid = ~np.isnan(ratio) & ~np.isnan(pred)

        # Valid observations per size bucket (market-cap percentile >=95, [50, 95), <50).
        chosen = {
            'big': valid & (cappct >= 95),
            'mid': valid & (cappct < 95) & (cappct >= 50),
            'small': valid & (cappct < 50),
        }

        # A year is only kept when every bucket has at least one observation.
        if not all(sel.any() for sel in chosen.values()):
            yearsmask[pos] = False
            continue

        for bucket in buckets:
            sel = chosen[bucket]
            slope, _, _, _, std_err = sp.stats.linregress(ratio[sel], pred[sel])
            slopes[bucket].append(slope)
            std_errs[bucket].append(std_err)

    fig, ax = plt.subplots(1, 3, figsize=(6*3.5,6), sharey=True)
    ax = ax.flatten()

    for axis in ax:
        for side in ('right', 'top'):
            axis.spines[side].set_visible(False)

    keptyears = years[yearsmask]
    panels = [('big', 'C0o', "Big Cap"), ('mid', 'C2o', "Mid Cap"), ('small', 'C4o', "Small Cap")]

    for axis, (bucket, fmt, paneltitle) in zip(ax, panels):
        axis.errorbar(keptyears, np.array(slopes[bucket]), yerr=1.96*np.array(std_errs[bucket]), fmt=fmt)
        axis.plot(keptyears, np.zeros_like(keptyears), 'k')
        axis.set_ylabel("Regression Coefficient")
        axis.set_title(paneltitle)

    fig.tight_layout(pad=2.0)
    fig.suptitle(methodtitlelabel, size=20, y=1.02)
    plt.savefig(graphpath+'shortratiovsprediction_'+method+'_tsanalysis_bycap.png', dpi=300, bbox_inches='tight')
    plt.show()
    
# Render the by-cap time-series figure for every model, in this order.
for _method, _title in [
    ('forest', "Random Forest"),
    ('nn3', "3-Layer Neural Network"),
    ('ols', "Ordinary Least Squares"),
    ('tree', "Regression Tree"),
    ('nn1', "1-Layer Neural Network"),
    ('nn2', "2-Layer Neural Network"),
]:
    shortratiovsprediction_tsanalysis(_method, _title)

# Post-publication decline

## Graphs with publication dates

In [None]:
# (author, year) of the reference publication for each entry of `methods`,
# listed in the same order as `methods` (defined earlier in the notebook).
_publications = [
    ('Legendre', '1805'),
    ('Santosa and Symes', '1986'),
    ('Tikhonov', '1943'),
    ('Zou and Hastie', '2005'),
    ('Pearson', '1901'),
    ('Wold', '1966'),
    ('Belson', '1959'),
    ('Ho', '1995'),
    ('Friedman', '2002'),
] + [('McClelland et al.', '1986')] * 5

# Publication years as numpy datetimes, keyed by method name.
pubdates = np.array([year for _, year in _publications], dtype='datetime64')
methodpubdates = dict(zip(methods, pubdates))
print(methodpubdates)

# Human-readable citations ("Author (year)"), keyed by method name.
pubs = [author + ' (' + year + ')' for author, year in _publications]
methodpubs = dict(zip(methods, pubs))
print(methodpubs)



In [None]:
# Notebook-level scratch variables.
# NOTE(review): pubdategraph() below takes its method and label as arguments,
# so these look like leftovers -- confirm nothing else reads them before removing.
method, i, labels = 'enet', 0, ['Enet']

def pubdategraph(method, label, hlaboffsets=(0, 0, 0)):
    """Plot three performance series for ``method`` with its publication date marked.

    Panels, left to right: out-of-sample Spearman rho (with a 5-year centered
    moving average), monthly returns from ``yearrets_l10`` (with a 3-year
    centered moving average) and the 3-year FF3 alpha from
    ``ff3alphas_3y_rank``.  A dashed vertical line and a boxed citation mark
    the publication date in every panel.  For the neural-network methods an
    additional 'Werbos (1974)' marker is drawn.  The figure is saved as a PDF
    under ``graphpath`` and displayed.

    Parameters
    ----------
    method : str
        Key into ``scores``, ``yearrets_l10``, ``ff3alphas_3y_rank``,
        ``methodpubdates`` and ``methodpubs``.
    label : str
        Display name used in the panel titles.
    hlaboffsets : sequence of length 3, optional
        Horizontal offset added to the publication date when placing the
        citation box in each panel (e.g. ``np.timedelta64`` values).  The
        default is an immutable tuple of zeros, i.e. no shift.
    """

    fig, ax = plt.subplots(1, 3, figsize=(18, 4))

    for axis in ax:
        axis.spines['right'].set_visible(False)
        axis.spines['top'].set_visible(False)

    pubdate = methodpubdates[method]
    pubname = methodpubs[method]

    def boxed_label(axis, x, y, text):
        """Write ``text`` centered at (x, y) inside a rounded white box."""
        axis.text(x, y, text, bbox=dict(boxstyle="round", fc="1"), ha='center')

    # Forward-fill gaps; .ffill() replaces the deprecated fillna(method='ffill').
    oos_spearman = scores.loc[(dates,method,'oos','spearman')].ffill().to_numpy()

    ax[0].plot(dtdates, oos_spearman, 'C5', alpha=0.5, label='Spearman Rho')
    ax[0].plot(dtdates, ma(oos_spearman, window_size=12*5, centered=True), 'C4', label='5-year avg.')
    ax[0].axvline(x=pubdate, color='k', linestyle='--')
    boxed_label(ax[0], pubdate + hlaboffsets[0], 0.97, pubname)
    ax[0].set_ylim([-0.05, 1.05])
    ax[0].legend(frameon=False, loc='upper left')
    ax[0].set_title(label + " : OOS Spearman Rho", pad=10)

    ax[1].plot(dtdates, yearrets_l10[method], 'C1', label='Returns')
    ax[1].plot(dtdates, ma(yearrets_l10[method], window_size=12*3, centered=True), 'C0', label='3-year avg.')
    ax[1].axvline(x=pubdate, color='k', linestyle='--')
    boxed_label(ax[1], pubdate + hlaboffsets[1], 0.27, pubname)
    ax[1].set_ylim([-0.1, 0.3])
    ax[1].legend(frameon=False, loc='upper left')
    ax[1].set_title(label + " : Monthly returns", pad=10)

    ax[2].plot(dtdates, ff3alphas_3y_rank[method], 'C2')
    ax[2].axvline(x=pubdate, color='k', linestyle='--')
    boxed_label(ax[2], pubdate + hlaboffsets[2], 0.135, pubname)
    ax[2].set_ylim([-0.05, 0.15])
    ax[2].set_title(label + " : 3-year FF3 alpha", pad=10)

    if method in ('nn1', 'nn2', 'nn3', 'nn4', 'nn5'):

        # Add Werbos (1974).  In the first two panels the line stops at 80% of
        # the axis height; in the third it spans the full height.
        werbos = np.datetime64('1974')
        label_heights = (0.02, -0.08, -0.04)
        line_options = ({'ymax': 0.8}, {'ymax': 0.8}, {})

        for axis, ytext, linekw in zip(ax, label_heights, line_options):
            axis.axvline(x=werbos, color='k', linestyle='--', **linekw)
            boxed_label(axis, werbos, ytext, 'Werbos (1974)')

    plt.savefig(graphpath+'pubdategraph_' + method + '.pdf', bbox_inches='tight')
    plt.show()


# Lasso: shift the citation box 14 years to the right in the first two panels.
hlaboffsets = [np.timedelta64(years, 'Y') for years in (14, 14, 0)]
pubdategraph('lasso', 'Lasso', hlaboffsets)

# These methods use the default (unshifted) citation box.
for _method, _label in (('enet', 'Enet'), ('forest', 'Forest'), ('gbrt', 'GBRT')):
    pubdategraph(_method, _label)

# Neural networks: shift the citation box 12 years to the right in the first two panels.
hlaboffsets = [np.timedelta64(years, 'Y') for years in (12, 12, 0)]
for _method, _label in (('nn1', 'NN1'), ('nn2', 'NN2'), ('nn3', 'NN3')):
    pubdategraph(_method, _label, hlaboffsets)

## Publication-date aligned graphs

In [None]:
# Out-of-sample Spearman rho of each method, aligned on its publication date.
# The last entry must be 'nn3' (it is labelled NN3); it was previously a
# duplicated 'nn2', so the NN3 curve silently re-plotted NN2.
plotmethods = ['lasso', 'enet', 'forest', 'gbrt', 'nn1', 'nn2', 'nn3']
labels = ['Lasso', 'Enet', 'Forest', 'GBRT', 'NN1', 'NN2', 'NN3']

# Half-width of the plotted window around publication, in years.
nyears = 25

fig, ax = plt.subplots(1, figsize=(12, 5))

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# x axis: months relative to publication, expressed in years, so that the
# first post-publication month sits exactly at 0.
ylabels = (np.arange(2*12*nyears) - 12*nyears) / 12

for (i,method) in enumerate(plotmethods):

    # Index of the first sample date on or after the publication date.
    pubidx = np.where(dtdates>=methodpubdates[method])[0][0]

    # Forward-fill gaps; .ffill() replaces the deprecated fillna(method='ffill').
    oos_spearman = scores.loc[(dates,method,'oos','spearman')].ffill().to_numpy()

    # T is used as the exclusive upper bound of the sample index.
    plotspearman = oos_spearman[max(pubidx-12*nyears, 0):min(pubidx+12*nyears, T)]

    # Pad with NaN wherever the window reaches outside the sample, so every
    # series has 2*12*nyears points with publication at the same position.
    # (The leading pad used min(pubidx-12*nyears, 0), which is never positive
    # and therefore never padded.)
    plotspearman = np.append(np.full(max(12*nyears-pubidx, 0), np.nan), plotspearman)
    plotspearman = np.append(plotspearman, np.full(max(pubidx+12*nyears-T, 0), np.nan))

    plotspearman_ma = ma(plotspearman, window_size=12*3, centered=True)

    ax.plot(ylabels, plotspearman_ma, label=labels[i] + ' (' + str(methodpubdates[method]) + ')')

ax.plot(ylabels, np.zeros_like(ylabels), 'k', linewidth=0.75)

ax.axvline(x=0, color='k', linestyle='--')
ax.text(x=0, y=1, s='Publication Date', bbox=dict(boxstyle="round", fc="1"), ha='center')

ax.set_xlabel('Years before and after publication')
ax.set_ylabel('Out-of-sample Spearman rho, \n 3-year centered moving average')
ax.set_ylim([-0.1, 1.1])

ax.legend(frameon=False, loc='upper left')
plt.savefig(graphpath+'pubdategraph_aligned_spearman.pdf', bbox_inches='tight')
plt.show()


In [None]:
# Returns of each method (from yearrets_l10), aligned on its publication date.
# The last entry must be 'nn3' (it is labelled NN3); it was previously a
# duplicated 'nn2', so the NN3 curve silently re-plotted NN2.
plotmethods = ['lasso', 'enet', 'forest', 'gbrt', 'nn1', 'nn2', 'nn3']
labels = ['Lasso', 'Enet', 'Forest', 'GBRT', 'NN1', 'NN2', 'NN3']

# Half-width of the plotted window around publication, in years.
nyears = 25

fig, ax = plt.subplots(1, figsize=(12, 5))

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# x axis: months relative to publication, expressed in years, so that the
# first post-publication month sits exactly at 0.
ylabels = (np.arange(2*12*nyears) - 12*nyears) / 12

for (i,method) in enumerate(plotmethods):

    # Index of the first sample date on or after the publication date.
    pubidx = np.where(dtdates>=methodpubdates[method])[0][0]

    # T is used as the exclusive upper bound of the sample index.
    plotrets = yearrets_l10[method][max(pubidx-12*nyears, 0):min(pubidx+12*nyears, T)]

    # Pad with NaN wherever the window reaches outside the sample, so every
    # series has 2*12*nyears points with publication at the same position.
    # (The leading pad used min(pubidx-12*nyears, 0), which is never positive
    # and therefore never padded.)
    plotrets = np.append(np.full(max(12*nyears-pubidx, 0), np.nan), plotrets)
    plotrets = np.append(plotrets, np.full(max(pubidx+12*nyears-T, 0), np.nan))

    plotrets_ma = ma(plotrets, window_size=12*3, centered=True)

    ax.plot(ylabels, plotrets_ma, label=labels[i] + ' (' + str(methodpubdates[method]) + ')')

ax.plot(ylabels, np.zeros_like(ylabels), 'k', linewidth=0.75)

ax.axvline(x=0, color='k', linestyle='--')
ax.text(x=0, y=0.15, s='Publication Date', bbox=dict(boxstyle="round", fc="1"), ha='center')

ax.set_xlabel('Years before and after publication')
ax.set_ylabel('3-year centered moving average return')
ax.set_ylim([-0.02, 0.16])

ax.legend(frameon=False)
plt.savefig(graphpath+'pubdategraph_aligned_returns.pdf', bbox_inches='tight')
plt.show()


In [None]:
# 3-year FF3 alpha of each method (from ff3alphas_3y_rank), aligned on its
# publication date.
# The last entry must be 'nn3' (it is labelled NN3); it was previously a
# duplicated 'nn2', so the NN3 curve silently re-plotted NN2.
plotmethods = ['lasso', 'enet', 'forest', 'gbrt', 'nn1', 'nn2', 'nn3']
labels = ['Lasso', 'Enet', 'Forest', 'GBRT', 'NN1', 'NN2', 'NN3']

# Half-width of the plotted window around publication, in years.
nyears = 25

fig, ax = plt.subplots(1, figsize=(12, 5))

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# x axis: months relative to publication, expressed in years, so that the
# first post-publication month sits exactly at 0.
ylabels = (np.arange(2*12*nyears) - 12*nyears) / 12

for (i,method) in enumerate(plotmethods):

    # Index of the first sample date on or after the publication date.
    pubidx = np.where(dtdates>=methodpubdates[method])[0][0]

    # T is used as the exclusive upper bound of the sample index.
    plotalphas = ff3alphas_3y_rank[method][max(pubidx-12*nyears, 0):min(pubidx+12*nyears, T)]

    # Pad with NaN wherever the window reaches outside the sample, so every
    # series has 2*12*nyears points with publication at the same position.
    # (The leading pad used min(pubidx-12*nyears, 0), which is never positive
    # and therefore never padded.)
    plotalphas = np.append(np.full(max(12*nyears-pubidx, 0), np.nan), plotalphas)
    plotalphas = np.append(plotalphas, np.full(max(pubidx+12*nyears-T, 0), np.nan))

    ax.plot(ylabels, plotalphas, label=labels[i] + ' (' + str(methodpubdates[method]) + ')')

ax.plot(ylabels, np.zeros_like(ylabels), 'k', linewidth=0.75)

ax.axvline(x=0, color='k', linestyle='--')
ax.text(x=0, y=0.105, s='Publication Date', bbox=dict(boxstyle="round", fc="1"), ha='center')

ax.set_xlabel('Years before and after publication')
ax.set_ylabel('3-year Fama-French 3-factor alpha')

ax.legend(frameon=False)
plt.savefig(graphpath+'pubdategraph_aligned_alphas.pdf', bbox_inches='tight')
plt.show()


## Regression tables

In [None]:
# Returns of the 3-layer neural network around its publication date, with
# linear, quadratic and quintic trend fits that include a post-publication dummy.
# ('nn3' was previously a duplicated 'nn2' in this list.)
plotmethods = ['lasso', 'enet', 'forest', 'gbrt', 'nn1', 'nn2', 'nn3']
labels = ['Lasso', 'Enet', 'Forest', 'GBRT', 'NN1', 'NN2', 'NN3']

# Maximum half-width of the window around publication, in years.
nyears = 25

fig, ax = plt.subplots(1, figsize=(12, 6))

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

for (i,method) in enumerate(['nn3']):

    # Index of the first sample date on or after the publication date.
    pubidx = np.where(dtdates>=methodpubdates[method])[0][0]

    # Window lengths, truncated where the sample is shorter than nyears.
    nmonths_before = min(12*nyears, pubidx)
    nmonths_after = min(12*nyears, T - pubidx)

    rets = yearrets_l10[method][pubidx-nmonths_before:pubidx+nmonths_after]

    # The window holds nmonths_before pre-publication observations, so the
    # dummy switches on at that offset (12*nyears is only correct when the
    # full pre-publication window is available).
    dummy = np.zeros_like(rets)
    dummy[nmonths_before:] = 1

    X0 = np.ones_like(rets)
    # Months relative to publication.  Float dtype keeps the higher powers
    # from overflowing where the default integer is 32-bit; centering on the
    # publication month does not change the fitted values or the dummy
    # coefficient, it only improves conditioning.
    X1 = np.arange(len(rets), dtype=float) - nmonths_before

    exog = np.array([dummy, X0, X1, X1**2, X1**3, X1**4, X1**5]).T
    exog = pd.DataFrame(exog, columns=['dummy', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5'])

    # Regressor sets of the three nested specifications.
    cols_linear = ['dummy', 'X0', 'X1']
    cols_quadratic = cols_linear + ['X2']
    cols_quintic = cols_quadratic + ['X3', 'X4', 'X5']

    # OLS with HAC standard errors (120 lags).
    model1 = sm.OLS(rets,exog[cols_linear]).fit(cov_type='HAC',cov_kwds={'maxlags':12*10})
    model2 = sm.OLS(rets,exog[cols_quadratic]).fit(cov_type='HAC',cov_kwds={'maxlags':12*10})
    model5 = sm.OLS(rets,exog[cols_quintic]).fit(cov_type='HAC',cov_kwds={'maxlags':12*10})

    # x axis in years relative to publication (publication month at exactly 0).
    ylabels = X1 / 12

    ax.plot(ylabels, rets, label='Returns')
    ax.text(x=0, y=0.6, s='McClelland et al. (1986)', bbox=dict(boxstyle="round", fc="1"), ha='center')

    ax.plot(ylabels, model1.predict(exog[cols_linear]), 'C2', label='Linear fit')
    ax.plot(ylabels, model2.predict(exog[cols_quadratic]), 'C4', label='Quadratic fit')
    ax.plot(ylabels, model5.predict(exog[cols_quintic]), 'C6', label='Quintic fit')

ax.plot(ylabels, np.zeros_like(ylabels), 'k', linewidth=0.75)

ax.axvline(x=0, color='k', linestyle='--')

ax.set_xlabel('Years before and after publication')
ax.set_ylabel('Monthly returns')

ax.set_title('3-Layer Neural Network - Returns')

ax.legend(frameon=False)
plt.savefig(graphpath+'fitgraph_aligned_returns.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Methods reported in each of the three regression tables.
methods_table = ['lasso', 'enet']
methods_table2 = ['forest', 'gbrt']
# The third table has NN1 / NN2 / NN3 column groups; the last entry was a
# duplicated 'nn2', which made the NN3 columns repeat the NN2 results.
methods_table3 = ['nn1', 'nn2', 'nn3']

# NOTE(review): returnmodels() in this cell reads the global `nyears`, not
# `maxnyears` -- confirm which window length is intended.
maxnyears = 15

def returnmodels(methodslist):
    """Fit post-publication break regressions on the ``yearrets_l10`` series.

    For every method the series (scaled by 100) is cut to a window of up to
    ``12*nyears`` months on each side of the method's publication date and
    regressed by OLS, with HAC standard errors (120 lags), on a
    post-publication dummy plus a linear, quadratic and quintic time trend.
    ``nyears`` is read from the notebook globals.

    Parameters
    ----------
    methodslist : list of str
        Keys into ``methodpubdates`` and ``yearrets_l10``.

    Returns
    -------
    list
        Three fitted results per method, ordered linear, quadratic, quintic.
    """

    modellist = []

    for method in methodslist:

        # Index of the first sample date on or after the publication date.
        pubidx = np.where(dtdates>=methodpubdates[method])[0][0]

        # Window lengths, truncated where the sample is shorter than nyears.
        nmonths_before = min(12*nyears, pubidx)
        nmonths_after = min(12*nyears, T - pubidx)

        rets = 100 * yearrets_l10[method][pubidx-nmonths_before:pubidx+nmonths_after]

        # The window holds nmonths_before pre-publication observations, so the
        # dummy switches on at that offset (12*nyears is only correct when the
        # full pre-publication window is available).
        dummy = np.zeros_like(rets)
        dummy[nmonths_before:] = 1

        X0 = np.ones_like(rets)
        # Months relative to publication.  Float dtype keeps the higher powers
        # from overflowing where the default integer is 32-bit; the offset
        # used to be pubidx, which is an index into the full sample rather
        # than into this window.
        X1 = np.arange(len(rets), dtype=float) - nmonths_before

        exog = np.array([dummy, X0, X1, X1**2, X1**3, X1**4, X1**5]).T
        exog = pd.DataFrame(exog, columns=['dummy', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5'])

        # Nested specifications: linear, quadratic and quintic trend.
        for trendcols in (['X1'], ['X1', 'X2'], ['X1', 'X2', 'X3', 'X4', 'X5']):
            regressors = exog[['dummy', 'X0'] + trendcols]
            modellist.append(sm.OLS(rets,regressors).fit(cov_type='HAC',cov_kwds={'maxlags':12*10}))

    return modellist


# Write the three LaTeX tables.  Each spec is
# (methods, output file, label of the dummy row, column groups).
# The R-squared label is a raw string: '\s' is not a valid escape sequence in
# a normal string literal and triggers a warning on recent Python versions.
for _methods, _file, _dummylabel, _groups in [
    (methods_table, 'postpubdecline_returns_ls10_table.tex', 'Post-Publication Dummy',
     {'Lasso - Returns':[1,3],'Enet - Returns':[4,6]}),
    (methods_table2, 'postpubdecline_returns_ls10_table2.tex', 'Post-Publication Dummy',
     {'Forest - Returns':[1,3],'GBRT - Returns':[4,6]}),
    (methods_table3, 'postpubdecline_returns_ls10_table3.tex', 'Post-Pub. Dummy',
     {'NN1 - Returns':[1,3],'NN2 - Returns':[4,6],'NN3 - Returns':[7,9]}),
]:
    pystout(models=returnmodels(_methods),
            file=tablespath+_file,
            exogvars = ['dummy'],
            digits=2,
            endog_names=['Linear','Quadratic','Quintic']*len(_methods),
            varlabels={'dummy':_dummylabel},
            mgroups=_groups,
            stars={.1:'*',.05:'**',.01:'***'},
            modstat={'nobs':'Obs','rsquared_adj':r'Adj. R\sym{2}'})

In [None]:
# Methods reported in each of the three regression tables.
methods_table = ['lasso', 'enet']
methods_table2 = ['forest', 'gbrt']
# The third table has NN1 / NN2 / NN3 column groups; the last entry was a
# duplicated 'nn2', which made the NN3 columns repeat the NN2 results.
methods_table3 = ['nn1', 'nn2', 'nn3']

# NOTE(review): returnmodels() in this cell reads the global `nyears`, not
# `maxnyears` -- confirm which window length is intended.
maxnyears = 15

def returnmodels(methodslist):
    """Fit post-publication break regressions on the ``yearrets_rank`` series.

    For every method the series (scaled by 100) is cut to a window of up to
    ``12*nyears`` months on each side of the method's publication date and
    regressed by OLS, with HAC standard errors (120 lags), on a
    post-publication dummy plus a linear, quadratic and quintic time trend.
    ``nyears`` is read from the notebook globals.

    Parameters
    ----------
    methodslist : list of str
        Keys into ``methodpubdates`` and ``yearrets_rank``.

    Returns
    -------
    list
        Three fitted results per method, ordered linear, quadratic, quintic.
    """

    modellist = []

    for method in methodslist:

        # Index of the first sample date on or after the publication date.
        pubidx = np.where(dtdates>=methodpubdates[method])[0][0]

        # Window lengths, truncated where the sample is shorter than nyears.
        nmonths_before = min(12*nyears, pubidx)
        nmonths_after = min(12*nyears, T - pubidx)

        rets = 100 * yearrets_rank[method][pubidx-nmonths_before:pubidx+nmonths_after]

        # The window holds nmonths_before pre-publication observations, so the
        # dummy switches on at that offset (12*nyears is only correct when the
        # full pre-publication window is available).
        dummy = np.zeros_like(rets)
        dummy[nmonths_before:] = 1

        X0 = np.ones_like(rets)
        # Months relative to publication.  Float dtype keeps the higher powers
        # from overflowing where the default integer is 32-bit; centering on
        # the publication month leaves the dummy coefficient unchanged and
        # only improves the conditioning of the polynomial terms.
        X1 = np.arange(len(rets), dtype=float) - nmonths_before

        exog = np.array([dummy, X0, X1, X1**2, X1**3, X1**4, X1**5]).T
        exog = pd.DataFrame(exog, columns=['dummy', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5'])

        # Nested specifications: linear, quadratic and quintic trend.
        for trendcols in (['X1'], ['X1', 'X2'], ['X1', 'X2', 'X3', 'X4', 'X5']):
            regressors = exog[['dummy', 'X0'] + trendcols]
            modellist.append(sm.OLS(rets,regressors).fit(cov_type='HAC',cov_kwds={'maxlags':12*10}))

    return modellist


# Write the three LaTeX tables.  Each spec is
# (methods, output file, label of the dummy row, column groups).
# The R-squared label is a raw string: '\s' is not a valid escape sequence in
# a normal string literal and triggers a warning on recent Python versions.
for _methods, _file, _dummylabel, _groups in [
    (methods_table, 'postpubdecline_returns_rank_table.tex', 'Post-Publication Dummy',
     {'Lasso - Returns':[1,3],'Enet - Returns':[4,6]}),
    (methods_table2, 'postpubdecline_returns_rank_table2.tex', 'Post-Publication Dummy',
     {'Forest - Returns':[1,3],'GBRT - Returns':[4,6]}),
    (methods_table3, 'postpubdecline_returns_rank_table3.tex', 'Post-Pub. Dummy',
     {'NN1 - Returns':[1,3],'NN2 - Returns':[4,6],'NN3 - Returns':[7,9]}),
]:
    pystout(models=returnmodels(_methods),
            file=tablespath+_file,
            exogvars = ['dummy'],
            digits=2,
            endog_names=['Linear','Quadratic','Quintic']*len(_methods),
            varlabels={'dummy':_dummylabel},
            mgroups=_groups,
            stars={.1:'*',.05:'**',.01:'***'},
            modstat={'nobs':'Obs','rsquared_adj':r'Adj. R\sym{2}'})

In [None]:
# Methods reported in each of the three regression tables.
methods_table = ['lasso', 'enet']
methods_table2 = ['forest', 'gbrt']
# The third table has NN1 / NN2 / NN3 column groups; the last entry was a
# duplicated 'nn2', which made the NN3 columns repeat the NN2 results.
methods_table3 = ['nn1', 'nn2', 'nn3']

# NOTE(review): returnmodels() in this cell reads the global `nyears`, not
# `maxnyears` -- confirm which window length is intended.
maxnyears = 15

def returnmodels(methodslist):
    """Fit post-publication break regressions on the ``yearrets_l10`` series.

    For every method the series (scaled by 100) is cut to a window of up to
    ``12*nyears`` months on each side of the method's publication date and
    regressed by OLS, with HAC standard errors (120 lags), on a
    post-publication dummy plus a linear, quadratic and quintic time trend.
    ``nyears`` is read from the notebook globals.

    Parameters
    ----------
    methodslist : list of str
        Keys into ``methodpubdates`` and ``yearrets_l10``.

    Returns
    -------
    list
        Three fitted results per method, ordered linear, quadratic, quintic.
    """

    modellist = []

    for method in methodslist:

        # Index of the first sample date on or after the publication date.
        pubidx = np.where(dtdates>=methodpubdates[method])[0][0]

        # Window lengths, truncated where the sample is shorter than nyears.
        nmonths_before = min(12*nyears, pubidx)
        nmonths_after = min(12*nyears, T - pubidx)

        # NOTE(review): the results of this cell are written to
        # 'postpubdecline_ff3alphas_ls10_*' tables labelled '3Y-FF3-Alpha',
        # yet the series read here is the return series yearrets_l10.  This
        # looks like a copy-paste slip; the matching alpha series is not
        # visible here, so the data source is left unchanged -- please confirm.
        rets = 100 * yearrets_l10[method][pubidx-nmonths_before:pubidx+nmonths_after]

        # The window holds nmonths_before pre-publication observations, so the
        # dummy switches on at that offset (12*nyears is only correct when the
        # full pre-publication window is available).
        dummy = np.zeros_like(rets)
        dummy[nmonths_before:] = 1

        X0 = np.ones_like(rets)
        # Months relative to publication.  Float dtype keeps the higher powers
        # from overflowing where the default integer is 32-bit; centering on
        # the publication month leaves the dummy coefficient unchanged and
        # only improves the conditioning of the polynomial terms.
        X1 = np.arange(len(rets), dtype=float) - nmonths_before

        exog = np.array([dummy, X0, X1, X1**2, X1**3, X1**4, X1**5]).T
        exog = pd.DataFrame(exog, columns=['dummy', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5'])

        # Nested specifications: linear, quadratic and quintic trend.
        for trendcols in (['X1'], ['X1', 'X2'], ['X1', 'X2', 'X3', 'X4', 'X5']):
            regressors = exog[['dummy', 'X0'] + trendcols]
            modellist.append(sm.OLS(rets,regressors).fit(cov_type='HAC',cov_kwds={'maxlags':12*10}))

    return modellist


# Write the three LaTeX tables.  Each spec is
# (methods, output file, label of the dummy row, column groups).
# The R-squared label is a raw string: '\s' is not a valid escape sequence in
# a normal string literal and triggers a warning on recent Python versions.
for _methods, _file, _dummylabel, _groups in [
    (methods_table, 'postpubdecline_ff3alphas_ls10_table.tex', 'Post-Publication Dummy',
     {'Lasso - 3Y-FF3-Alpha':[1,3],'Enet - 3Y-FF3-Alpha':[4,6]}),
    (methods_table2, 'postpubdecline_ff3alphas_ls10_table2.tex', 'Post-Publication Dummy',
     {'Forest - 3Y-FF3-Alpha':[1,3],'GBRT - 3Y-FF3-Alpha':[4,6]}),
    (methods_table3, 'postpubdecline_ff3alphas_ls10_table3.tex', 'Post-Pub. Dummy',
     {'NN1 - 3Y-FF3-Alpha':[1,3],'NN2 - 3Y-FF3-Alpha':[4,6],'NN3 - 3Y-FF3-Alpha':[7,9]}),
]:
    pystout(models=returnmodels(_methods),
            file=tablespath+_file,
            exogvars = ['dummy'],
            digits=2,
            endog_names=['Linear','Quadratic','Quintic']*len(_methods),
            varlabels={'dummy':_dummylabel},
            mgroups=_groups,
            stars={.1:'*',.05:'**',.01:'***'},
            modstat={'nobs':'Obs','rsquared_adj':r'Adj. R\sym{2}'})

In [None]:
# Methods reported in each of the three regression tables.
methods_table = ['lasso', 'enet']
methods_table2 = ['forest', 'gbrt']
# The third table has NN1 / NN2 / NN3 column groups; the last entry was a
# duplicated 'nn2', which made the NN3 columns repeat the NN2 results.
methods_table3 = ['nn1', 'nn2', 'nn3']

# NOTE(review): returnmodels() in this cell reads the global `nyears`, not
# `maxnyears` -- confirm which window length is intended.
maxnyears = 15

def returnmodels(methodslist):
    """Fit post-publication break regressions on the ``ff3alphas_3y_rank`` series.

    For every method the series (scaled by 100) is cut to a window of up to
    ``12*nyears`` months on each side of the method's publication date and
    regressed by OLS, with HAC standard errors (120 lags), on a
    post-publication dummy plus a linear, quadratic and quintic time trend.
    ``nyears`` is read from the notebook globals.

    Parameters
    ----------
    methodslist : list of str
        Keys into ``methodpubdates`` and ``ff3alphas_3y_rank``.

    Returns
    -------
    list
        Three fitted results per method, ordered linear, quadratic, quintic.
    """

    modellist = []

    for method in methodslist:

        # Index of the first sample date on or after the publication date.
        pubidx = np.where(dtdates>=methodpubdates[method])[0][0]

        # Window lengths, truncated where the sample is shorter than nyears.
        nmonths_before = min(12*nyears, pubidx)
        nmonths_after = min(12*nyears, T - pubidx)

        rets = 100 * ff3alphas_3y_rank[method][pubidx-nmonths_before:pubidx+nmonths_after]

        # The window holds nmonths_before pre-publication observations, so the
        # dummy switches on at that offset (12*nyears is only correct when the
        # full pre-publication window is available).
        dummy = np.zeros_like(rets)
        dummy[nmonths_before:] = 1

        X0 = np.ones_like(rets)
        # Months relative to publication.  Float dtype keeps the higher powers
        # from overflowing where the default integer is 32-bit; centering on
        # the publication month leaves the dummy coefficient unchanged and
        # only improves the conditioning of the polynomial terms.
        X1 = np.arange(len(rets), dtype=float) - nmonths_before

        exog = np.array([dummy, X0, X1, X1**2, X1**3, X1**4, X1**5]).T
        exog = pd.DataFrame(exog, columns=['dummy', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5'])

        # Nested specifications: linear, quadratic and quintic trend.
        for trendcols in (['X1'], ['X1', 'X2'], ['X1', 'X2', 'X3', 'X4', 'X5']):
            regressors = exog[['dummy', 'X0'] + trendcols]
            modellist.append(sm.OLS(rets,regressors).fit(cov_type='HAC',cov_kwds={'maxlags':12*10}))

    return modellist


# Write the three LaTeX tables.  Each spec is
# (methods, output file, label of the dummy row, column groups).
# The R-squared label is a raw string: '\s' is not a valid escape sequence in
# a normal string literal and triggers a warning on recent Python versions.
for _methods, _file, _dummylabel, _groups in [
    (methods_table, 'postpubdecline_ff3alphas_rank_table.tex', 'Post-Publication Dummy',
     {'Lasso - 3Y-FF3-Alpha':[1,3],'Enet - 3Y-FF3-Alpha':[4,6]}),
    (methods_table2, 'postpubdecline_ff3alphas_rank_table2.tex', 'Post-Publication Dummy',
     {'Forest - 3Y-FF3-Alpha':[1,3],'GBRT - 3Y-FF3-Alpha':[4,6]}),
    (methods_table3, 'postpubdecline_ff3alphas_rank_table3.tex', 'Post-Pub. Dummy',
     {'NN1 - 3Y-FF3-Alpha':[1,3],'NN2 - 3Y-FF3-Alpha':[4,6],'NN3 - 3Y-FF3-Alpha':[7,9]}),
]:
    pystout(models=returnmodels(_methods),
            file=tablespath+_file,
            exogvars = ['dummy'],
            digits=2,
            endog_names=['Linear','Quadratic','Quintic']*len(_methods),
            varlabels={'dummy':_dummylabel},
            mgroups=_groups,
            stars={.1:'*',.05:'**',.01:'***'},
            modstat={'nobs':'Obs','rsquared_adj':r'Adj. R\sym{2}'})

# Market Prescience

In [None]:

def shortvsreturnpercentilewithbinsplot(chosenyears, ax, alphaparam=0.01, title=""):
    """Plot short ratio against realized-return percentile for the chosen years.

    Draws on ``ax`` a random subsample of the raw observations, the overall
    mean short ratio, and the mean short ratio in 20 equal-width bins of the
    return percentile with 95% confidence intervals.

    Parameters
    ----------
    chosenyears : sequence of int
        Years whose dates are included.
    ax : matplotlib axes
        Axes to draw on.
    alphaparam : float, optional
        Opacity of the scattered raw observations.
    title : str, optional
        Panel title.
    """
    perioddates = dates[np.isin(dates_years, chosenyears)]

    retpct = returns_pctile.loc[perioddates].to_numpy().flatten()
    ratio = shortratios.loc[perioddates].to_numpy().flatten()

    # Usable observations: both the short ratio and the return percentile are known.
    valid = ~np.isnan(ratio) & ~np.isnan(retpct)

    # Scatter at most 50,000 randomly chosen valid points.
    keep = np.full(np.count_nonzero(valid), False)
    keep[:5*10**4] = True
    np.random.shuffle(keep)
    scattered = valid.copy()
    scattered[valid] = keep

    # 20 equal-width bins spanning the observed return-percentile range.
    edges = np.linspace(np.min(retpct[valid]), np.max(retpct[valid]), 21)
    centers = (edges[1:] + edges[:-1]) / 2

    x, y = retpct[valid], ratio[valid]
    binmeans, binstds, bincounts = (
        sp.stats.binned_statistic(x, y, statistic, bins=edges)[0]
        for statistic in ('mean', 'std', 'count')
    )

    # Half-width of the 95% confidence interval of each bin mean.
    halfwidths = 1.96 * binstds / np.sqrt(bincounts)

    xs = retpct[scattered]
    ax.plot(xs, ratio[scattered], 'ko', alpha=alphaparam)
    ax.plot(xs, np.full_like(xs, np.nanmean(ratio)), 'k-', label='Overall average')
    ax.errorbar(centers, binmeans, yerr=halfwidths, fmt='C0o', label='Vigintile avg. (95% CI)')
    ax.set_xlabel('Monthly Return, Percentile')
    ax.set_ylabel('Monthly Short Ratio')
    ax.set_yscale('log')
    ax.set_ylim([0.005, 1])
    ax.set_title(title)
    ax.legend(loc='upper left', frameon=False)


# 2x2 grid: one panel of short ratio vs. realized-return percentile per sub-period.
fig, ax = plt.subplots(2, 2, figsize=(12*1.33,12))
ax = ax.flatten()

for axis in ax:
    for _side in ('right', 'top'):
        axis.spines[_side].set_visible(False)

_periods = [
    ("1995-1999", list(range(1995, 2000))),
    ("2000-2004", list(range(2000, 2005))),
    ("2005-2009", list(range(2005, 2010))),
    ("2010-2015", list(range(2010, 2016))),
]
for _axis, (_title, _years) in zip(ax, _periods):
    shortvsreturnpercentilewithbinsplot(_years, _axis, alphaparam=0.002, title=_title)

fig.tight_layout(pad=2.0)
fig.suptitle("Short Interest and Actual Returns - Cross-Section Analysis", size=18, y=1.02)

plt.savefig(graphpath+'shortratiovsreturns_1995to2015.png', dpi=300, bbox_inches='tight')
plt.show()
    

In [None]:
slopes = np.array([])
intercepts = np.array([])
std_errs = np.array([])

slopes_pct = np.array([])
intercepts_pct = np.array([])
std_errs_pct = np.array([])

diffs_q1q4 = np.array([])
std_errs_q1q4 = np.array([])

diffs_d1d10 = np.array([])
std_errs_d1d10 = np.array([])

years = np.unique(dates_years)
yearsmask = np.full_like(years, True).astype('bool')

for year in years:

    chosendates = dates[np.isin(dates_years, year)]

    rets = returns.loc[chosendates].to_numpy().flatten()
    rets_pctile = returns.loc[chosendates].to_numpy().flatten()

    shortratio = shortratios.loc[chosendates].to_numpy().flatten()
    shortratio_pctile = shortratios_pctile.loc[chosendates].to_numpy().flatten()

    mask = ~np.isnan(shortratio) & ~np.isnan(rets)

    if np.count_nonzero(mask) == 0 :

        yearsmask[years==year] = False

        continue


    slope, intercept, r_value, p_value, std_err = sp.stats.linregress(shortratio[mask], rets[mask])

    slopes = np.append(slopes, slope)
    intercepts = np.append(intercepts, intercept)
    std_errs = np.append(std_errs, std_err)


    slope_pct, intercept_pct, r_value_pct, p_value_pct, std_err_pct = sp.stats.linregress(shortratio_pctile[mask], rets_pctile[mask])

    slopes_pct = np.append(slopes_pct, slope_pct)
    intercepts_pct = np.append(intercepts, intercept_pct)
    std_errs_pct = np.append(std_errs_pct, std_err_pct)


    means, _, _ = sp.stats.binned_statistic(rets_pctile[mask], shortratio[mask], 'mean', bins=10)
    stds, _, _ = sp.stats.binned_statistic(rets_pctile[mask], shortratio[mask], 'std', bins=10)
    counts, _, _ = sp.stats.binned_statistic(rets_pctile[mask], shortratio[mask], 'count', bins=10)

    diff_q1q4 = means[-1] - means[0]
    std_err_q1q4 = np.sqrt(((stds[-1]**2)/counts[-1])+((stds[0]**2)/counts[0]))

    diffs_q1q4 = np.append(diffs_q1q4, diff_q1q4)
    std_errs_q1q4 = np.append(std_errs_q1q4, std_err_q1q4)


    means, _, _ = sp.stats.binned_statistic(rets_pctile[mask], shortratio[mask], 'mean', bins=10)
    stds, _, _ = sp.stats.binned_statistic(rets_pctile[mask], shortratio[mask], 'std', bins=10)
    counts, _, _ = sp.stats.binned_statistic(rets_pctile[mask], shortratio[mask], 'count', bins=10)

    diff_d1d10 = means[-1] - means[0]
    std_err_d1d10 = np.sqrt(((stds[-1]**2)/counts[-1])+((stds[0]**2)/counts[0]))

    diffs_d1d10 = np.append(diffs_d1d10, diff_d1d10)
    std_errs_d1d10 = np.append(std_errs_d1d10, std_err_d1d10)


fig, ax = plt.subplots(2, 2, figsize=(10*1.66,10))
ax = ax.flatten()

for axis in ax:
    axis.spines['right'].set_visible(False)
    axis.spines['top'].set_visible(False)

ax[0].errorbar(years[yearsmask], slopes, yerr=1.96*std_errs, fmt='C0o')
ax[0].plot(years[yearsmask], np.zeros_like(years[yearsmask]), 'k')
ax[0].set_ylabel("Regression Coefficient")
ax[0].set_title("Linear Regression: Short Ratio vs Return")
ax[0].set_ylim([-0.06, 0.06])
    
ax[1].errorbar(years[yearsmask], slopes_pct, yerr=1.96*std_errs_pct, fmt='C2o')
ax[1].plot(years[yearsmask], np.zeros_like(years[yearsmask]), 'k')
ax[1].set_ylabel("Regression Coefficient")
ax[1].set_title("Percentile Regression: Short Ratio vs Return")
ax[1].set_ylim([-0.0003, 0.0003])

ax[2].errorbar(years[yearsmask], diffs_q1q4, yerr=1.96*std_errs_q1q4, fmt='C4o')
ax[2].plot(years[yearsmask], np.zeros_like(years[yearsmask]), 'k')
ax[2].set_ylim([-0.1, 0.1])
ax[2].set_ylabel("4th Quantile Mean - 1st Quantile Mean")
ax[2].set_title("Difference of Mean Short Ratio \n between 4th and 1st Return Quantile", pad=-20)

ax[3].errorbar(years[yearsmask], diffs_d1d10, yerr=1.96*std_errs_d1d10, fmt='C6o')
ax[3].plot(years[yearsmask], np.zeros_like(years[yearsmask]), 'k')
ax[3].set_ylim([-0.1, 0.1])
ax[3].set_ylabel("10th Decile Mean - 1st Decile Mean")
ax[3].set_title("Difference of Mean Short Ratio \n between 10th and 1st Return Decile", pad=-10)

def addpubdates(ax, y):
    """Annotate ``ax`` with dashed year markers and boxed method labels.

    For every entry in the table below a black dashed vertical line is drawn
    at the marker year, and the method name is written in a rounded text box
    centred horizontally on the label position at height ``y``.

    Parameters
    ----------
    ax : object exposing ``axvline`` and ``text`` (e.g. a matplotlib Axes)
        The axis to annotate.
    y : float
        Vertical data coordinate at which all labels are placed.
    """
    # (x of the vertical line, x of the label, label text).
    # The first and last labels sit slightly to the right of their line.
    markers = (
        (1974, 1976, 'Neural Networks'),
        (1986, 1986, 'Lasso'),
        (1995, 1995, 'Forest'),
        (2002, 2002, 'GBRT'),
        (2005, 2006, 'Enet'),
    )

    for line_x, label_x, label in markers:
        ax.axvline(x=line_x, ymax=1, color='k', linestyle='--')
        ax.text(label_x, y, label, bbox=dict(boxstyle="round", fc="1"), ha='center')
    
# Add the year markers to every panel; each label height sits just above the
# lower y-limit of its panel.
for panel_axis, label_height in zip(ax, (-0.055, -0.00027, -0.09, -0.09)):
    addpubdates(panel_axis, label_height)

# Lay out the panels, add the overall title above them, then save and display.
fig.tight_layout(pad=2.0)
fig.suptitle("Market Prescience - Time Series Analysis", size=18, y=1.02)
plt.savefig(graphpath + 'shortratiovsreturns_tsanalysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# For every date after the cutoff, correlate the cross-section of short ratios
# with (a) the realised returns and (b) each model's predicted returns, using
# both Pearson and Spearman coefficients, then plot the resulting time series.
assocmethods = ['ols', 'tree', 'nn3']
assoclabels = ['Ordinary Least Squares', 'Regression Tree', '3-Layer Neural Network']

# Keep only dates strictly after the numeric cutoff 19800000
# (presumably dates are YYYYMMDD integers -- confirm against where `dates` is built).
datesmask = np.asarray(~(dates <= 19800000))

# Accumulate in plain lists and convert once at the end; np.append inside the
# loop would re-copy the whole array on every iteration.
pearson_rets = []
spearman_rets = []
pearson_prediction = {method: [] for method in assocmethods}
spearman_prediction = {method: [] for method in assocmethods}

for date in dates[datesmask]:

    rets = returns.loc[date].to_numpy().flatten()
    shortratio = shortratios_i.loc[date].to_numpy().flatten()

    # Only entries where both the short ratio and the realised return are present.
    # NOTE(review): NaNs in `prediction` are not masked here; if a model has a
    # missing prediction for a masked-in entry the coefficient will be NaN -- confirm
    # that predictions are complete wherever returns are.
    mask = ~np.isnan(shortratio) & ~np.isnan(rets)
    shortratio_valid = shortratio[mask]

    pearson_rets.append(sp.stats.pearsonr(shortratio_valid, rets[mask])[0])
    spearman_rets.append(sp.stats.spearmanr(shortratio_valid, rets[mask])[0])

    for method in assocmethods:

        prediction = predictions[method].loc[date].to_numpy().flatten()

        pearson_prediction[method].append(sp.stats.pearsonr(shortratio_valid, prediction[mask])[0])
        spearman_prediction[method].append(sp.stats.spearmanr(shortratio_valid, prediction[mask])[0])

pearson_rets = np.array(pearson_rets)
spearman_rets = np.array(spearman_rets)
for method in assocmethods:
    pearson_prediction[method] = np.array(pearson_prediction[method])
    spearman_prediction[method] = np.array(spearman_prediction[method])


colours = ['C0', 'C2', 'C4', 'C6']


def _plot_association_series(true_series, predicted_series, ylabel, filename):
    """Plot one association measure over time, save it, and show it.

    Reads the notebook-level names ``dtdates``, ``dates``, ``datesmask``,
    ``assocmethods``, ``assoclabels``, ``colours``, ``graphpath`` and the
    smoothing helper ``ma`` (defined elsewhere in the notebook).

    Parameters
    ----------
    true_series : array
        Per-date coefficient between short ratio and realised returns.
    predicted_series : dict
        Maps each entry of ``assocmethods`` to its per-date coefficient array.
    ylabel : str
        Label for the y axis (name of the association measure).
    filename : str
        File name, relative to ``graphpath``, under which the figure is saved.

    Returns
    -------
    (fig, ax) : the created matplotlib figure and axis.
    """
    plot_dates = dtdates[datesmask]

    fig, ax = plt.subplots(1, figsize=(8*1.66, 8))

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    # Faint raw series plus a solid smoothed series (window of 12*3 observations).
    ax.plot(plot_dates, true_series, 'k', alpha=0.2)
    ax.plot(plot_dates, ma(true_series, window_size=12*3, centered=True), 'k', label="True Returns (3-year avg.)")

    for colour, method, label in zip(colours, assocmethods, assoclabels):
        ax.plot(plot_dates, predicted_series[method], colour, alpha=0.2)
        ax.plot(plot_dates, ma(predicted_series[method], window_size=12*3, centered=True), colour, label="Pred. rets. : " + label + " (3-year avg.)")

    # Zero reference line.
    ax.plot(plot_dates, np.zeros_like(dates[datesmask]), 'k--', linewidth=1.5)
    ax.legend(frameon=False)
    ax.set_title("Relationship between True or Predicted Returns and Monthly Short Interest", size=18)
    ax.set_ylabel(ylabel)

    plt.savefig(graphpath + filename, bbox_inches='tight')
    plt.show()

    return fig, ax


# Pearson figure. It used to be written to 'marketprescience_spearman.pdf';
# it now gets a file name that matches its content.
fig, ax = _plot_association_series(
    pearson_rets, pearson_prediction,
    "Pearson Correlation Coefficient", 'marketprescience_pearson.pdf')

# Spearman figure. The true-returns series is now spearman_rets (it previously
# re-plotted pearson_rets), and the figure is saved instead of only shown.
fig, ax = _plot_association_series(
    spearman_rets, spearman_prediction,
    "Spearman Rho", 'marketprescience_spearman.pdf')

In [None]:
# Spot check for a single method and a single date: print pairwise Pearson
# correlations between realised returns, predicted returns and short ratio,
# then show the corresponding scatter plots.
method = 'ols'
print(method)

date = dates[-100]
print(date)

rets = returns.loc[date].to_numpy().flatten()
shortratio = shortratios_i.loc[date].to_numpy().flatten()
prediction = predictions[method].loc[date].to_numpy().flatten()

# Prediction error: realised minus predicted return.
prederror = rets - prediction

# Entries where both short ratio and realised return are present.
# NOTE(review): NaNs in `prediction` are not excluded -- confirm predictions
# are complete wherever returns are, otherwise the statistics below are NaN.
mask = ~np.isnan(shortratio) & ~np.isnan(rets)

print(sp.stats.pearsonr(rets[mask], shortratio[mask]))
print(sp.stats.pearsonr(prediction[mask], shortratio[mask]))
print(sp.stats.pearsonr(rets[mask], prediction[mask]))

# The printed label now names the arguments that are actually correlated
# (it previously said rets[mask] where shortratio[mask] is used).
print("sp.stats.pearsonr(shortratio[mask], rets[mask] - prediction[mask])")
print(sp.stats.pearsonr(shortratio[mask], prederror[mask]))


fig, ax = plt.subplots(1, 4, figsize=(6*4.2,6))
ax = ax.flatten()

# First three panels: short ratio (log scale) against returns, predictions
# and prediction errors, all on the same x-range.
scatter_specs = [(rets, 'rets'), (prediction, 'prediction'), (prederror, 'prederror')]

for panel_axis, (xvalues, xname) in zip(ax[:3], scatter_specs):
    panel_axis.plot(xvalues, shortratio, 'o', alpha=0.05)
    panel_axis.set_yscale('log')
    panel_axis.set_xlim([-0.75, 0.75])
    panel_axis.set_xlabel(xname)
    panel_axis.set_ylabel('shortratio')

# Last panel: predicted against realised returns.
ax[3].plot(rets, prediction, 'o', alpha=0.05)
ax[3].set_xlim([-0.5, 0.5])
ax[3].set_xlabel('rets')
ax[3].set_ylabel('prediction')

plt.show()