# Correlation Testing

In [1]:

import os
from pricepredict import PricePredict
from datetime import datetime, timedelta
import pandas as pd

model_dir = '../models/'
chart_dir = '../charts/'
preds_dir = '../predictions/'
ppo_dir = '../ppo/'
ppo_save_dir = '../ppo_save/'

def read_ppos():
    # Get all daily PPO objects from the ppo dir
    read_ppos = {}
    for file in os.listdir('../ppo'):
        # Check if filename has _D_ in it and ends with .dill
        if '_D_' in file and file.endswith('.dill'):
            # Get the symbol name from the file name (first chars before _D_)
            symbol = file.split('_D_')[0]
            print (symbol," ", end='')
            # Load the PPO object from the file
            with open(f'../ppo/{file}', 'rb') as f:
                pp_obj = f.read()
            # unserialize the PPO object
            read_ppos[symbol] = PricePredict.unserialize(pp_obj)
            
    return read_ppos


2025-01-13 00:21:05.974841: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-13 00:21:05.990675: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-13 00:21:06.011680: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-13 00:21:06.016425: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-13 00:21:06.028734: I tensorflow/core/platform/cpu_feature_guar

In [2]:

def create_ppos(symbols: [str]):
    # Create a PricePredict object for each symbol
    ppos = {}
    for sym in symbols:
        ppo = PricePredict(sym, period=PricePredict.PeriodDaily,
                           model_dir=model_dir,
                           chart_dir=chart_dir,
                           preds_dir=preds_dir,)
        end_dt = datetime.now()
        # Load up over 5 years of data
        start_dt = end_dt - timedelta(days=365 * 5)
        end_date = end_dt.strftime('%Y-%m-%d')
        start_date = start_dt.strftime('%Y-%m-%d')

        # Fetch data for the ppo
        try:
            ppo.fetch_data_yahoo(ppo.ticker, start_date, end_date)
        except Exception as e:
            print(f'Error fetching data for {sym}')
            continue

        ppos[sym] = ppo

    return ppos


In [2]:

dally_ppos = read_ppos()

print(f'Loaded {len(dally_ppos)} daily PPO objects')
print(f'Daily Symbols: {dally_ppos.keys()}')



Loaded 0 daily PPO objects
Daily Symbols: dict_keys([])


In [4]:
from tqdm import tqdm

# symbols = ['AAPL', '000001.SS', 'EURUSD=X', 'IBM', 'TSLA', 'SYK', 'RTX', 'QCOM', 'PACB', 'MDLZ']
# ppos = create_ppos(symbols)

all_symbols = sorted(dally_ppos.keys())
print(f'Loaded {len(dally_ppos)} daily PPO objects')
all_ptp = None
# Loop through the ppos sorted by key (symbol)
for pc_period in tqdm(range(7, 271, 2), "Pair Trading Period"):
    # print(f'.', end='', flush=True)
    sym1_pb = tqdm(all_symbols, f'Corr Period: {pc_period}', leave=False)
    for sym1 in all_symbols:
        # print(f'===== {ppos[symbol].ticker} =====')
        # Generate correlations between each symbol and all other symbols
        sym1_pb.update(1)
        sym2_pb = tqdm(all_symbols, f'{sym1}: Corr Period: {pc_period}', leave=False)
        for sym2 in all_symbols:
            if sym1 != sym2:
                # Get the corr between the two symbols
                    try:
                        corr = all_symbols[sym1].periodic_correlation(all_symbols[sym2], pc_period_len=pc_period)
                    except Exception as e:
                        print(f'Error calculating correlation between {sym1} and {sym2}\n{e}')
                        continue
    
                    if corr['coint_stationary']:
                        corr_dict = {'potential_pair': f'{sym1}:{sym2}',
                                     'corr_start_date': corr['start_date'], 'corr_end_date': corr['end_date'],
                                     'period_days': corr['corr_period_len'],
                                     'coint_stasn': corr['coint_stationary'],
                                     'coint_pval':  corr['coint_test']['p_val'],
                                     'adf_pval': corr['adf_test']['p_val']}
                        ptp = pd.DataFrame(corr_dict, index=[0])
                        if all_ptp is None:
                            all_ptp = ptp
                        else:
                            all_ptp = pd.concat([all_ptp, ptp])
                        print(corr_dict)
                    
            sym2_pb.update(1)
            
all_ptp


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [34]:
import inspect
dally_ppos['SEDG'].orig_downloaded_data.__len__()
dally_ppos['SEDG'].date_start, dally_ppos['SEDG'].date_end
# dally_ppos['AAPL'].fetch_data_yahoo('SEDG', '2020-12-31', '2021-01-01')


('2020-01-01', '2021-01-01')