In [3]:
import os
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline


In [None]:
# Symbol and exchange code; together they select the input file
# data/{ticker}-{exchange}.csv and the '<field> {ticker} Price in USD' columns.
ticker = 'SNEX'
exchange = 'nsdq'


In [None]:
# Load the export for the configured ticker and reduce it to the columns
# the analysis below works with.
unused_columns = [f'{field} {ticker} Price in USD' for field in ('open', 'high', 'low')]
unused_columns += ['Dividends', 'hidden']
df = (
    pd.read_csv(f'data/{ticker}-{exchange}.csv')
    .drop(columns=unused_columns)
    .rename(columns={'Exchange Reported SI': 'SI',
                     f'close {ticker} Price in USD': 'Close'})
    .dropna(subset=['Close'])
)
df['Date'] = pd.to_datetime(df['Date'])
# 10-row rolling mean of Close, computed before the frame is trimmed so the
# first retained rows can already carry a value.
df['Close 2 week rolling'] = df['Close'].rolling(10).mean()
df = df.set_index('Date')
# Discard everything before the first row that has an SI value.
first_SI_index = df['SI'].first_valid_index()
df = df.loc[first_SI_index:]
# Positional counter 0..len(df)-1 stored as a regular column.
df['index'] = np.arange(len(df))


In [None]:
# Closing price (left y-axis) against exchange-reported short interest
# (right y-axis) over time.
fig, ax = plt.subplots(figsize=(25, 10))
ax3 = ax.twinx()  # SI gets its own y-axis
# SI has gaps, so drop the missing rows to get a connected line.
df['SI'].dropna().plot(ax=ax3, linestyle='-', marker='.', label='SI')
df['Close 2 week rolling'].plot(ax=ax, style='r-', label='Close 2 week rolling')
df['Close'].plot(ax=ax, style='g-', label='Close')
ax.set(title=f'{ticker}: close price vs. short interest',
       xlabel='Date', ylabel='Close (USD)')
ax3.set_ylabel('SI')
# Each twin axis would otherwise draw its own legend box; merge the handles
# of both axes into a single legend.
handles, labels = ax.get_legend_handles_labels()
handles3, labels3 = ax3.get_legend_handles_labels()
ax.legend(handles + handles3, labels + labels3, loc='upper left')
plt.show()


In [None]:
# Overall correlation between short interest and the closing price.
# Series.corr only uses rows where both values are present, so it yields the
# same number as df.corr().loc['SI', 'Close'] without building the full matrix.
pd_r = df['SI'].corr(df['Close'])
print(f"Pandas Pearson r: {pd_r}")

# scipy's pearsonr does not skip missing values, so drop incomplete rows first;
# it additionally reports the p-value.
dropped_df = df.dropna(subset=['SI', 'Close'])
r, p = stats.pearsonr(dropped_df['SI'], dropped_df['Close'])
print(f'Scipy Pearson r: {r}, p-value: {p}')


In [None]:
# Overall correlation between short interest and the 10-row rolling mean of
# the closing price.
# Series.corr only uses rows where both values are present, so it yields the
# same number as df.corr().loc['SI', 'Close 2 week rolling'] without building
# the full matrix.
pd_r = df['SI'].corr(df['Close 2 week rolling'])
print(f"Pandas Pearson r: {pd_r}")

# scipy's pearsonr does not skip missing values, so drop incomplete rows first;
# it additionally reports the p-value.
dropped_df = df.dropna(subset=['SI', 'Close 2 week rolling'])
r, p = stats.pearsonr(dropped_df['SI'], dropped_df['Close 2 week rolling'])
print(f'Scipy Pearson r: {r}, p-value: {p}')


In [None]:
# Local correlation: Pearson r between Close and SI inside a moving window.
# SI has gaps, so fill them by interpolation to get a value on every row.
df_interpolated = df.interpolate()
# 40-row window — presumably about two months of trading days; TODO confirm.
rolling_r = df['Close'].rolling(40).corr(df_interpolated['SI'])
fig, ax = plt.subplots(2, 1, figsize=(25, 15))
ax3 = ax[0].twinx()  # SI gets its own y-axis in the top panel
df['SI'].dropna().plot(ax=ax3, legend=True, linestyle='-', marker='.')
# Label the smoothed series explicitly; without it the legend reads 'Close'.
df['Close'].rolling(10).mean().plot(ax=ax[0], style='r-', legend=True,
                                    label='Close 2 week rolling')
ax[0].set(title='Close (10-row rolling mean) vs. SI', ylabel='Close (USD)')
ax3.set_ylabel('SI')
rolling_r.plot(ax=ax[1])
ax[1].set(title='Rolling 40-row Pearson r: Close vs. interpolated SI',
          ylabel='Pearson r')
plt.show()

In [None]:
# time lagged cross correlation
def crosscorr(datax, datay, lag=0, wrap=False):
    """Lag-N cross correlation (Pearson r) between two series.

    ``datay`` is moved by ``lag`` rows (positive = towards later rows,
    negative = towards earlier rows) and then correlated with ``datax``.

    Parameters
    ----------
    datax, datay : pandas.Series objects of equal length
    lag : int, default 0
        Number of rows to shift ``datay`` by.
    wrap : bool, default False
        If False, rows vacated by the shift are NaN and therefore excluded
        from the correlation. If True, the shift is circular: values pushed
        off one end re-enter at the other, for any sign of ``lag``.

    Returns
    ----------
    crosscorr : float
    """
    if wrap:
        # np.roll is circular for positive, zero and negative lags alike.
        # Slice assignment after shift() only handled lag > 0: lag == 0 tried
        # to write a full-length array into an empty slice, and lag < 0 left
        # the vacated tail as NaN.
        shiftedy = pd.Series(np.roll(datay.to_numpy(), lag), index=datay.index)
        return datax.corr(shiftedy)
    return datax.corr(datay.shift(lag))


# Pearson r between Close and interpolated SI for every lag in [-60, 60).
# Position i in `rs` corresponds to lag i - 60.
d1 = df['Close']
d2 = df_interpolated['SI']
rs = [crosscorr(d1, d2, lag) for lag in range(-60, 60)]
# Use the absolute value, as the series could be positively or negatively
# correlated; the same peak position drives the offset, the printed value and
# the marker on the plot.
peak_idx = int(np.argmax(np.abs(rs)))
offset = np.floor(len(rs)/2) - peak_idx
print(rs[peak_idx])
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(rs)
ax.axvline(np.ceil(len(rs)/2), color='k', linestyle='--', label='Center')
# Mark the |r| peak; argmin(rs) would only coincide with it when the
# strongest correlation is negative.
ax.axvline(peak_idx, color='r', linestyle='--', label='Peak synchrony')
ax.set(title=f'Offset = {offset}', xlabel='Offset', ylabel='Pearson r')
# Relabel list positions as lags.
ax.set_xticks([0, 20, 40, 60, 80, 100, 120])
ax.set_xticklabels([-60, -40, -20, 0, 20, 40, 60])
plt.legend()


In [None]:
# Windowed time lagged cross correlation: split the rows into `num_splits`
# contiguous windows and compute the lag profile inside each one.
num_splits = 7
# Integer row boundaries (ceil of t * len(df) / num_splits). Positional
# half-open slices keep adjacent windows disjoint even when a boundary is a
# whole number, and make a reset_index()/set_index() round trip on `df`
# unnecessary.
n_rows = len(df)
split_edges = [(t * n_rows + num_splits - 1) // num_splits
               for t in range(num_splits + 1)]
rss = []
for start, stop in zip(split_edges[:-1], split_edges[1:]):
    d1 = df['Close'].iloc[start:stop]
    # NOTE(review): this uses the raw SI column, not df_interpolated['SI'] as
    # the full-series lag plot does — confirm that is intended.
    d2 = df['SI'].iloc[start:stop]
    rs = [crosscorr(d1, d2, lag) for lag in range(-60, 60)]
    rss.append(rs)
rss = pd.DataFrame(rss)
f, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(rss, cmap='RdBu_r', ax=ax)
ax.set(title='Windowed Time Lagged Cross Correlation', xlabel='Offset', ylabel='Window epochs')
# Relabel column positions as lags.
ax.set_xticks([0, 20, 40, 60, 80, 100, 120])
ax.set_xticklabels([-60, -40, -20, 0, 20, 40, 60])

In [None]:
# Rolling window time lagged cross correlation: slide a fixed-size window
# over the rows and compute the lag profile inside each position.
window_size = 70  # samples
step_size = 20
rss = []
# `+ 1` so that a window ending exactly on the last row is still evaluated
# (the iloc end bound is exclusive).
for t_start in range(0, len(df) - window_size + 1, step_size):
    t_end = t_start + window_size
    d1 = df['Close'].iloc[t_start:t_end]
    d2 = df['SI'].iloc[t_start:t_end]
    rs = [crosscorr(d1, d2, lag, wrap=False) for lag in range(-60, 60)]
    rss.append(rs)
rss = pd.DataFrame(rss)

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(rss, cmap='RdBu_r', ax=ax)
ax.set(title='Rolling Windowed Time Lagged Cross Correlation', xlabel='Offset', ylabel='Epochs')
# Relabel column positions as lags.
ax.set_xticks([0, 20, 40, 60, 80, 100, 120])
ax.set_xticklabels([-60, -40, -20, 0, 20, 40, 60])

In [None]:
from scipy.signal import hilbert, butter, filtfilt
from scipy.fftpack import fft,fftfreq,rfft,irfft,ifft
import numpy as np
import seaborn as sns
import pandas as pd
import scipy.stats as stats
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    The cutoffs are given in the same units as the sampling rate ``fs`` and
    are normalised by the Nyquist frequency (``fs / 2``) before being passed
    to ``scipy.signal.butter``.

    Returns the ``(b, a)`` coefficient arrays produced by ``butter``.
    """
    nyquist = fs / 2.0
    normalised_band = [lowcut / nyquist, highcut / nyquist]
    return butter(order, normalised_band, btype='band')


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass ``data`` between ``lowcut`` and ``highcut``.

    The coefficients come from ``butter_bandpass`` and are applied with
    ``scipy.signal.filtfilt``; the filtered array is returned.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return filtfilt(numerator, denominator, data)

# Band-pass settings passed to butter_bandpass_filter.
# NOTE(review): fs = 30 and the cutoffs are taken as given — confirm they are
# appropriate for the sampling interval of this data.
lowcut  = .01
highcut = .5
fs = 30.
order = 1
# Fill gaps by interpolation so both inputs have a value on every row.
d1 = df['Close'].interpolate().values
d2 = df['SI'].interpolate().values
y1 = butter_bandpass_filter(d1,lowcut=lowcut,highcut=highcut,fs=fs,order=order)
y2 = butter_bandpass_filter(d2,lowcut=lowcut,highcut=highcut,fs=fs,order=order)

# Instantaneous phase angle (radians) of each filtered signal.
al1 = np.angle(hilbert(y1),deg=False)
al2 = np.angle(hilbert(y2),deg=False)
# Synchrony score computed from the absolute phase difference; evaluated once
# and reused for the plot below.
phase_synchrony = 1-np.sin(np.abs(al1-al2)/2)
N = len(al1)

# Plot results
f,ax = plt.subplots(3,1,figsize=(14,7),sharex=True)
ax[0].plot(y1,color='r',label='y1')
ax[0].plot(y2,color='b',label='y2')
ax[0].legend(bbox_to_anchor=(0., 1.02, 1., .102),ncol=2)
ax[0].set(xlim=[0,N], title='Filtered Timeseries Data')
ax[1].plot(al1,color='r')
ax[1].plot(al2,color='b')
ax[1].set(ylabel='Angle',title='Angle at each Timepoint',xlim=[0,N])
ax[2].plot(phase_synchrony)
ax[2].set(ylim=[0,1.1],xlim=[0,N],title='Instantaneous Phase Synchrony',xlabel='Time',ylabel='Phase Synchrony')
plt.tight_layout()
plt.show()


# Add CSV files (named `<ticker>-<exchange>.csv`) to the `data` folder, then run the cells below

In [21]:
# Per-ticker summary table, indexed by '<ticker>_<exchange>'.
# Reload the table saved by a previous run if there is one; otherwise start
# from an empty table with the expected columns so the notebook also runs on
# a fresh checkout.
if os.path.exists('stats.csv'):
    stats_df = pd.read_csv('stats.csv', index_col='ticker')
else:
    stats_df = pd.DataFrame(columns=['Pearson r', 'p-value (r)', 'offset', 'Corr at offset'])
    stats_df.index.name = 'ticker'

In [24]:
def correl(csv_file, data_dir='data', max_lag=60):
    """Compute SI-vs-Close statistics for one data file and store them in ``stats_df``.

    Adds a row keyed ``'<ticker>_<exchange>'`` to the module-level ``stats_df``
    holding: Pearson r, its p-value, the offset of the strongest lagged
    correlation, and that correlation value. Nothing is returned.

    Parameters
    ----------
    csv_file : str
        File name of the form ``'<ticker>-<exchange>.csv'`` inside ``data_dir``.
        Entries that are not ``.csv`` files are ignored; tickers already
        present in ``stats_df`` are skipped.
    data_dir : str, default 'data'
        Directory containing ``csv_file``.
    max_lag : int, default 60
        Lags in ``range(-max_lag, max_lag)`` are evaluated.

    Raises
    ------
    ValueError
        If a ``.csv`` name does not contain a ``'-'`` separating ticker and
        exchange.
    """
    stem, ext = os.path.splitext(csv_file)
    if ext.lower() != '.csv':
        # os.listdir also returns non-data entries; skip them instead of
        # failing while unpacking the name.
        return
    ticker, sep, exchange = stem.rpartition('-')
    if not sep or not ticker or not exchange:
        raise ValueError(f"expected '<ticker>-<exchange>.csv', got {csv_file!r}")

    key = f'{ticker}_{exchange}'
    if key in stats_df.index:
        return

    df = pd.read_csv(os.path.join(data_dir, csv_file))
    df = df.rename(columns={'Exchange Reported SI': 'SI',
                            f'close {ticker} Price in USD': 'Close'})
    df = df.dropna(subset=['Close'])
    df['Date'] = pd.to_datetime(df['Date'])
    # Only SI and Close feed the statistics below.
    df = df.set_index('Date')[['SI', 'Close']]
    # Discard everything before the first row that has an SI value.
    df = df.loc[df['SI'].first_valid_index():]

    # Overall correlation on rows where both values are present.
    paired = df.dropna(subset=['SI', 'Close'])
    r, p = stats.pearsonr(paired['SI'], paired['Close'])

    # Lagged correlation of Close against gap-filled SI; position i in `rs`
    # corresponds to lag i - max_lag.
    # NOTE(review): a shift by `lag` leaves len(df) - |lag| overlapping rows,
    # so large lags on short files rest on few points — confirm acceptable.
    close = df['Close']
    si_interpolated = df['SI'].interpolate()
    rs = [close.corr(si_interpolated.shift(lag)) for lag in range(-max_lag, max_lag)]
    # Rank by |r| since the relationship may be positive or negative.
    peak_idx = int(np.argmax(np.abs(rs)))
    offset = np.floor(len(rs)/2) - peak_idx
    max_corr = rs[peak_idx]

    stats_df.loc[key] = [r, p, offset, max_corr]


# Feed every entry of the data folder to correl().
for csv_name in os.listdir('data'):
    correl(csv_name)

In [25]:
# Display the accumulated per-ticker statistics table.
stats_df

Unnamed: 0_level_0,Pearson r,p-value (r),offset,Corr at offset
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RWT_nyse,-0.41929,0.0005079879,-52.0,-0.506159
SNEX_nsdq,-0.751796,3.719796e-11,12.0,-0.795084
BANC_nyse,0.083008,0.5109356,-59.0,-0.37328
ARI_nyse,0.489038,3.570953e-05,60.0,0.547201
PIPR_nyse,-0.685817,3.870794e-07,-59.0,-0.884944
PFS_nyse,0.411268,0.0006656473,10.0,0.397264
FBP_nyse,0.427112,0.0003877423,-59.0,0.522115
COOP_nsdq,0.079101,0.5445354,-48.0,0.261588
PMT_nyse,-0.500677,2.162409e-05,-42.0,-0.52587
HMST_nsdq,0.158363,0.2076831,60.0,0.397611


In [26]:
# Write the table (including its index) to stats.csv in the working directory.
stats_df.to_csv('stats.csv')