In [None]:
import os
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline


In [None]:
# Instrument under analysis; the two values select the input file
# data/<ticker>-<exchange>.csv and the ticker-specific price column names.
ticker, exchange = 'SNEX', 'nsdq'


In [None]:
# Load the raw export for the configured ticker and tidy it into the
# date-indexed analysis frame `df` used by the cells below.
df = (
    pd.read_csv(f'data/{ticker}-{exchange}.csv')
    # Only the close price is analysed; the other price columns, dividends
    # and the 'hidden' column are discarded.
    .drop(columns=[f'open {ticker} Price in USD', f'high {ticker} Price in USD',
                   f'low {ticker} Price in USD', 'Dividends', 'hidden'])
    .rename(columns={'Exchange Reported SI': 'SI',
                     f'close {ticker} Price in USD': 'Close',
                     'Days to cover 3m (on loan)': 'Days To Cover'})
    # Rows without a close price cannot be correlated against anything.
    .dropna(subset=['Close'])
    .assign(Date=lambda d: pd.to_datetime(d['Date']))
    # 10-row rolling mean of the close (the "2 week" series).
    .assign(**{'Close 2 week rolling': lambda d: d['Close'].rolling(10).mean()})
    .set_index('Date')
)

# Start the frame at the first row that has a short-interest value.
first_SI_index = df['SI'].first_valid_index()

# The final row is discarded (NOTE(review): presumably an incomplete last
# record - confirm). The explicit copy makes `df` an independent frame, so
# adding the 'index' column below does not write into a slice of the loaded
# data (the pattern behind pandas' SettingWithCopyWarning).
df = df.loc[first_SI_index:].iloc[:-1].copy()
df['index'] = np.arange(len(df))
df.info()


In [None]:
# Close price and its rolling mean on the left axis; days-to-cover on a
# secondary right-hand axis because it lives on a different scale.
fig, ax = plt.subplots(figsize=(25, 10))
ax3 = ax.twinx()

# Alternative right-axis series, kept for quick switching:
# df['SI'].dropna().plot(ax=ax3, legend=True, linestyle='-', marker='.')
# df['Cost To Borrow'].dropna().plot(ax=ax3, legend=True)
days_to_cover = df['Days To Cover'].dropna()
days_to_cover.plot(ax=ax3, legend=True)

rolling_close, daily_close = df['Close 2 week rolling'], df['Close']
rolling_close.plot(ax=ax, style='r-', legend=True)
daily_close.plot(ax=ax, style='g-', legend=True)


In [None]:
def overallPearsonR(col):
    """Print the Pearson correlation between ``df[col]`` and ``df['Close']``.

    Reads the notebook-level ``df``. Rows where either of the two values is
    missing are left out before the test is run. The result is printed and
    nothing is returned.

    Parameters
    ----------
    col : str
        Name of the column in ``df`` to compare with ``'Close'``.
    """
    both_present = df[col].notna() & df['Close'].notna()
    complete_rows = df[both_present]
    coefficient, p_value = stats.pearsonr(complete_rows[col], complete_rows['Close'])
    print(f'{col} against Close Pearson r: {coefficient}, p-value: {p_value}')


# Report the overall correlation with Close for each short-interest metric.
for col in ('SI', 'Days To Cover', 'Cost To Borrow'):
    overallPearsonR(col)


In [None]:
# Overall Pearson test of SI against the 10-row rolling mean of the close,
# using only rows where both values are present.
both_present = df['SI'].notna() & df['Close 2 week rolling'].notna()
dropped_df = df[both_present]
r, p = stats.pearsonr(
    dropped_df['SI'],
    dropped_df['Close 2 week rolling'],
)
print(f'SI against 2 week rolling close Pearson r: {r}, p-value: {p}')


In [None]:
# Local (rolling-window) correlation between the close and short interest.
# Missing values are interpolated first; `df_interpolated` is reused by the
# lagged cross-correlation cells further down.
df_interpolated = df.interpolate()

# window 40 = 2 months?
window_rows = 40
rolling_r = df['Close'].rolling(window_rows).corr(df_interpolated['SI'])

fig, ax = plt.subplots(2, 1, figsize=(25, 15))
price_ax, corr_ax = ax

# Top panel: reported SI points on a secondary axis over the smoothed close.
ax3 = price_ax.twinx()
si_axes = df['SI'].dropna().plot(ax=ax3, linestyle='-', marker='.')
si_axes.legend(loc='upper left')
df['Close'].rolling(10).mean().plot(ax=price_ax, style='r-', legend=True)

# Bottom panel: rolling correlation, aligned to the top panel's x-range.
rolling_r.plot(ax=corr_ax, xlim=price_ax.get_xlim())

In [None]:
# time lagged cross correlation
def crosscorr(datax, datay, lag=0, wrap=False):
    """Lag-N cross correlation between two series.

    ``datay`` is shifted by ``lag`` positions and correlated (Pearson, via
    ``Series.corr``) against ``datax``.

    Parameters
    ----------
    datax, datay : pandas.Series objects of equal length
    lag : int, default 0
        Number of positions to shift ``datay``; may be negative.
    wrap : bool, default False
        If False, positions vacated by the shift are filled with NaNs and are
        therefore ignored by the correlation. If True, the shift is circular:
        values pushed off one end re-enter at the other end.

    Returns
    ----------
    crosscorr : float
    """
    if wrap:
        # np.roll is a circular shift that is correct for positive, negative
        # and zero lags. The earlier slice-assignment approach only filled the
        # gap for positive lags and failed for lag == 0.
        shiftedy = pd.Series(np.roll(datay.values, lag), index=datay.index)
        return datax.corr(shiftedy)
    return datax.corr(datay.shift(lag))


# Correlate Close with interpolated SI at every lag in [-60, 59].
d1 = df['Close']
d2 = df_interpolated['SI']
lags = range(-60, 60)
rs = [crosscorr(d1, d2, lag) for lag in lags]

# use absolute value as the series could be positively or negatively correlated
peak_index = np.argmax(np.abs(rs))
centre_index = len(rs) / 2
offset = np.floor(centre_index) - peak_index
print(rs[peak_index])

fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(rs)
ax.axvline(np.ceil(centre_index), color='k', linestyle='--', label='Center')
ax.axvline(peak_index, color='r', linestyle='--', label='Peak synchrony')
ax.set(title=f'Offset = {offset}', xlabel='Offset', ylabel='Pearson r')

# Tick positions are list indices; the labels translate them back to lags.
tick_positions = list(range(0, 121, 20))
ax.set_xticks(tick_positions)
ax.set_xticklabels([position - 60 for position in tick_positions])
plt.legend()


In [None]:
# Cost To Borrow time lagged cross correlation
d1 = df['Close']
d2 = df_interpolated['Cost To Borrow']
rs = [crosscorr(d1, d2, shift_by) for shift_by in range(-60, 60)]

# use absolute value as the series could be positively or negatively correlated
strongest = np.argmax(np.abs(rs))
half_length = len(rs) / 2
offset = np.floor(half_length) - strongest
print(rs[strongest])

fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(rs)
# Dashed guides: black marks the centre of the lag list, red the strongest lag.
for x_position, colour, label in (
    (np.ceil(half_length), 'k', 'Center'),
    (strongest, 'r', 'Peak synchrony'),
):
    ax.axvline(x_position, color=colour, linestyle='--', label=label)
ax.set(title=f'Offset = {offset}', xlabel='Offset', ylabel='Pearson r')
ax.set_xticks([0, 20, 40, 60, 80, 100, 120])
ax.set_xticklabels([-60, -40, -20, 0, 20, 40, 60])
plt.legend()

In [None]:
# Windowed time lagged cross correlation
#
# The windows are cut by row number, so a positionally indexed copy is used.
# The shared, date-indexed `df` is deliberately left untouched: rebinding it
# here and restoring it at the end would leave the notebook in a broken state
# whenever this cell is interrupted or raises part-way through.
df_positional = df.reset_index()

num_splits = 7
samples_per_split = df_positional.shape[0]/num_splits
rss = []
for t in range(0, num_splits):
    # Label-based slicing on the integer index; the bounds are floats and the
    # slice includes both ends.
    window_start = (t)*samples_per_split
    window_end = (t+1)*samples_per_split
    d1 = df_positional['Close'].loc[window_start:window_end]
    # NOTE(review): this uses the raw SI column, whereas the earlier lagged
    # cross-correlation cells use df_interpolated['SI'] - confirm intended.
    d2 = df_positional['SI'].loc[window_start:window_end]
    rs = [crosscorr(d1, d2, lag) for lag in range(-60, 60)]
    rss.append(rs)
rss = pd.DataFrame(rss)

# One heatmap row per window, one column per lag.
f, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(rss, cmap='RdBu_r', ax=ax)
ax.set(title='Windowed Time Lagged Cross Correlation', xlabel='Offset', ylabel='Window epochs')
ax.set_xticks([0, 20, 40, 60, 80, 100, 120])
ax.set_xticklabels([-60, -40, -20, 0, 20, 40, 60]);

In [None]:
# Rolling window time lagged cross correlation
# A fixed-size window slides over the rows of df; each position contributes
# one row of lagged correlations to the heatmap.
window_size = 70 #samples
t_start = 0
t_end = t_start + window_size
step_size = 20
lags = range(-60, 60)
rss = []
while t_end < len(df):
    window = slice(t_start, t_end)
    d1 = df['Close'].iloc[window]
    d2 = df['SI'].iloc[window]
    rs = [crosscorr(d1, d2, lag, wrap=False) for lag in lags]
    rss.append(rs)
    t_start += step_size
    t_end += step_size
rss = pd.DataFrame(rss)

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(rss, cmap='RdBu_r', ax=ax)
ax.set(title='Rolling Windowed Time Lagged Cross Correlation', xlabel='Offset', ylabel='Epochs')
# Tick positions are column numbers; the labels translate them back to lags.
tick_positions = list(range(0, 121, 20))
ax.set_xticks(tick_positions)
ax.set_xticklabels([position - 60 for position in tick_positions])

In [None]:
from scipy.signal import hilbert, butter, filtfilt
from scipy.fftpack import fft,fftfreq,rfft,irfft,ifft
import numpy as np
import seaborn as sns
import pandas as pd
import scipy.stats as stats
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    The cut-offs are divided by the Nyquist frequency (half of ``fs``) before
    being handed to ``scipy.signal.butter``.

    Parameters
    ----------
    lowcut, highcut : float
        Lower and upper band edges, in the same unit as ``fs``.
    fs : float
        Sampling frequency.
    order : int, default 5
        Filter order passed to ``butter``.

    Returns
    -------
    tuple
        The ``(b, a)`` coefficient arrays returned by ``butter``.
    """
    nyquist = 0.5 * fs
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='band')


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter ``data`` with a Butterworth filter.

    The coefficients come from ``butter_bandpass`` and are applied with
    ``scipy.signal.filtfilt``.

    Parameters
    ----------
    data : array-like
        Signal to filter.
    lowcut, highcut, fs, order
        Forwarded unchanged to ``butter_bandpass``.

    Returns
    -------
    The filtered signal as returned by ``filtfilt``.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return filtfilt(numerator, denominator, data)

# Band-pass settings shared by both signals.
# NOTE(review): how fs = 30. relates to the row spacing of df is not visible
# here - confirm the intended units of the cut-offs.
lowcut  = .01
highcut = .5
fs = 30.
order = 1
filter_settings = dict(lowcut=lowcut, highcut=highcut, fs=fs, order=order)

# Missing values are interpolated before the series are handed to the filter.
d1 = df['Close'].interpolate().values
d2 = df['SI'].interpolate().values
y1 = butter_bandpass_filter(d1, **filter_settings)
y2 = butter_bandpass_filter(d2, **filter_settings)

# Phase angle (radians) of the analytic signal of each filtered series.
al1 = np.angle(hilbert(y1), deg=False)
al2 = np.angle(hilbert(y2), deg=False)
# 1 when the two phase angles coincide, smaller as they move apart.
phase_gap = np.abs(al1 - al2)
phase_synchrony = 1 - np.sin(phase_gap / 2)
N = len(al1)

# Plot results
f, ax = plt.subplots(3, 1, figsize=(14, 7), sharex=True)
filtered_ax, angle_ax, synchrony_ax = ax

for signal, colour, name in ((y1, 'r', 'y1'), (y2, 'b', 'y2')):
    filtered_ax.plot(signal, color=colour, label=name)
filtered_ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), ncol=2)
filtered_ax.set(xlim=[0, N], title='Filtered Timeseries Data')

for angles, colour in ((al1, 'r'), (al2, 'b')):
    angle_ax.plot(angles, color=colour)
angle_ax.set(ylabel='Angle', title='Angle at each Timepoint', xlim=[0, N])

synchrony_ax.plot(phase_synchrony)
synchrony_ax.set(ylim=[0, 1.1], xlim=[0, N], title='Instantaneous Phase Synchrony',
                 xlabel='Time', ylabel='Phase Synchrony')
plt.tight_layout()
plt.show()


In [None]:
# Average SI, DTC and annualised return
avg_SI = df['SI'].mean()
print(avg_SI)
avg_DTC = df['Days To Cover'].mean()
print(avg_DTC)

# Cumulative return of the close relative to the first row of df.
ret_series = (1 + df['Close'].pct_change()).cumprod() - 1
# Take the last value by position: `ret_series[0]` on a date-indexed Series
# relies on pandas' deprecated integer-position fallback.
overall_return = ret_series.iloc[-1]

# Calendar days between the first and last dates in df.
period_days = (df.index[-1].date() - df.index[0].date()).days
# Compound the overall return to a 365-day year.
annualised_return = (1 + overall_return)**(365/period_days) - 1
print(annualised_return)


# Add CSV files named `<ticker>-<exchange>.csv` to the `data` folder, then run the cells below

In [None]:
# Columns of the per-ticker summary table, in the order in which the values
# are written for each ticker.
STATS_COLUMNS = ['SI Pearson r', 'SI p-value (r)', 'SI offset', 'SI Corr at offset',
                 'DTC Pearson r', 'DTC p-value (r)', 'DTC offset', 'DTC Corr at offset',
                 'CTB Pearson r', 'CTB p-value (r)', 'CTB offset', 'CTB Corr at offset',
                 'Avg SI', 'Avg DTC', 'Annualised return']

# Continue from a previously saved table when one exists; otherwise start an
# empty one, so a first run no longer needs the bootstrap code un-commented.
if os.path.exists('stats.csv'):
    stats_df = pd.read_csv('stats.csv', index_col='ticker')
else:
    stats_df = pd.DataFrame(columns=STATS_COLUMNS)
    stats_df.index.name = 'ticker'


In [None]:
def correl(csv_file):
    """Compute the correlation summary for one data file and add it to ``stats_df``.

    The file ``data/<ticker>-<exchange>.csv`` is loaded and cleaned, then a
    15-value row is stored in the notebook-level ``stats_df`` under the key
    ``'<ticker>_<exchange>'``. For each of SI, Days To Cover and Cost To
    Borrow the row holds the overall Pearson r and p-value against Close, and
    the offset and value of the strongest lagged correlation. It also holds
    the average SI, the average Days To Cover and the annualised return.

    Nothing is stored when the key is already present in ``stats_df``. Entries
    that do not end in ``.csv`` are ignored silently. The file name is printed
    and the file skipped when the name is not of the form
    ``<ticker>-<exchange>.csv`` or when fewer than 10 usable rows are
    available.

    Parameters
    ----------
    csv_file : str
        File name (not path) of an entry in the ``data`` folder.
    """
    # os.listdir also returns entries that are not data files.
    if not csv_file.lower().endswith('.csv'):
        return
    name_parts = csv_file[:-4].split('-')
    if len(name_parts) != 2:
        print(csv_file)
        return
    ticker, exchange = name_parts
    row_key = f'{ticker}_{exchange}'
    if row_key in stats_df.index:
        return

    df = (
        pd.read_csv(f'data/{ticker}-{exchange}.csv')
        .drop(columns=[f'open {ticker} Price in USD', f'high {ticker} Price in USD',
                       f'low {ticker} Price in USD', 'Dividends', 'hidden'])
        .rename(columns={'Exchange Reported SI': 'SI',
                         f'close {ticker} Price in USD': 'Close',
                         'Days to cover 3m (on loan)': 'Days To Cover'})
        .dropna(subset=['Close'])
        .assign(Date=lambda d: pd.to_datetime(d['Date']))
        .assign(**{'Close 2 week rolling': lambda d: d['Close'].rolling(10).mean()})
        .set_index('Date')
    )
    # Start at the first row that has a short-interest value.
    df = df.loc[df['SI'].first_valid_index():]
    if len(df) < 10:
        print(csv_file)
        return
    # Drop the final row; the copy keeps the column assignment below from
    # writing into a slice of the loaded frame.
    df = df.iloc[:-1].copy()
    df['index'] = np.arange(len(df))

    df_interpolated = df.interpolate()

    # Overall Pearson correlation of each metric with Close, using only rows
    # where both values are present.
    pearson = {}
    for col in ('SI', 'Cost To Borrow', 'Days To Cover'):
        paired = df.dropna(subset=[col, 'Close'])
        if len(paired) < 10:
            print(csv_file)
            return
        coefficient, p_value = stats.pearsonr(paired[col], paired['Close'])
        pearson[col] = (coefficient, p_value)
    r, p = pearson['SI']
    ctb_r, ctb_p = pearson['Cost To Borrow']
    dtc_r, dtc_p = pearson['Days To Cover']

    d1 = df['Close']
    lags = range(-60, 60)

    def _peak_lagged_corr(series):
        """Return (offset, correlation) of the strongest lagged correlation with Close."""
        rs = np.array([d1.corr(series.shift(lag)) for lag in lags], dtype=float)
        if np.isnan(rs).all():
            return np.nan, np.nan
        # Lags with too little overlap give NaN; np.argmax would report the
        # first NaN as the "peak", so NaNs are ignored here.
        # use absolute value as the series could be positively or negatively correlated
        peak = np.nanargmax(np.abs(rs))
        return np.floor(len(rs)/2) - peak, rs[peak]

    si_offset, si_max_corr = _peak_lagged_corr(df_interpolated['SI'])
    dtc_offset, dtc_max_corr = _peak_lagged_corr(df_interpolated['Days To Cover'])
    ctb_offset, ctb_max_corr = _peak_lagged_corr(df_interpolated['Cost To Borrow'])

    avg_SI = df['SI'].mean()
    avg_DTC = df['Days To Cover'].mean()
    ret_series = (1 + df['Close'].pct_change()).cumprod() - 1
    # Positional access; integer keys on a date-indexed Series are deprecated.
    overall_return = ret_series.iloc[-1]
    period_days = (df.index[-1].date() - df.index[0].date()).days
    annualised_return = (1 + overall_return)**(365/period_days) - 1

    # Assigning to a new label through .loc appends the whole row. Chained
    # indexing (.loc[key][column] = ...) would raise KeyError for a new key.
    stats_df.loc[row_key] = [r, p, si_offset, si_max_corr, dtc_r,
                             dtc_p, dtc_offset, dtc_max_corr, ctb_r, ctb_p, ctb_offset, ctb_max_corr,
                             avg_SI, avg_DTC, annualised_return]


# Run the summary for every entry found in the data folder.
data_folder = 'data'
filenames = os.listdir(data_folder)
for file in filenames:
    correl(file)


In [None]:
# Display the accumulated per-ticker statistics table.
stats_df

In [None]:
# Write the table back to stats.csv, the same file it was loaded from.
stats_df.to_csv('stats.csv')