<a href="https://colab.research.google.com/github/codingFerryman/crypto_market_hierarchy_structure/blob/main/src/crypto_correlation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
import pandas as pd
import numpy as np
from scipy.signal import correlate
from scipy.stats import spearmanr
from scipy.stats import pearsonr
import glob
import os
from pathlib import Path
from utils import check_integrity, datestring_to_timestamp, load_data

###Preprocessing and Load data

+ Preprocessing missing value

+ Load data

+ Build up list of original data vectors

+ Get the name list of cryptocurrency

#### Load data, Build up list of original data vectors, Get the name list of cryptocurrency

In [None]:
# Get file paths
def get_file_path(folder_path):
  paths = glob.glob(os.path.join(folder_path, "*"))
  paths = sorted(paths)
  return paths

In [59]:
# create the list of crytocurrencies price records, 
# each item means the list of original price vectors of each cryptocurrency
# each vector contains all the close prices of per day / per week (determing with time scale)
def get_interval(timescale):
  if timescale == '3h':
    return 8
  if timescale == '30m':
    return 1

def build_data_list(paths, timescale):
  interval = get_interval(timescale)
  
  data_list = []
  name_list = []
  
  for i in range(0, len(paths)):
    idx = 0
    shift_idx = interval - 1
    cryptocurrency_price_list = []
    
    cryptocurrency_df = load_data(start_from_timestamp, end_before_timestamp, paths[i], fill_na=True)
    
    #record the name of crytocurrency

    name_list.append(cryptocurrency_df.index[0][1])

    #discard other columns except close price
    close_price_df = cryptocurrency_df["close"]
    #print(close_price_df.head(10))
  
    while shift_idx <= len(close_price_df):
      vector = close_price_df.iloc[idx:shift_idx + 1]
      #print(vector)
      #Calculate the price ratio of changing
      vector = vector.to_frame()
      vector = vector.pct_change().fillna(0)

      cryptocurrency_price_list.append(np.asarray(vector))
      idx += interval
      shift_idx += interval

    #print(len(cryptocurrency_price_list))
    data_list.append(cryptocurrency_price_list)  

  return data_list, name_list


###Define functions to calculate correlation matrix

+ Pearson correlation coefficient
+ Spearman rank-order correlation coefficient

In [53]:
# Calculate correlation matrix - Pearson correlation coefficient
def cm_pearson(x, y):
  pccs = pearsonr(x,y)
  return pccs[0]

# Calculate correlation matrix - Spearman rank-order correlation coefficient
def cm_spearman(x, y):
  sccs = spearmanr(x, y)
  return sccs[0]

###Calculate Correlation matrix

In [68]:
def calculate_cm(data_list, func):
    N = len(data_list)
    cm_res_list = []
    cm_res = np.zeros((N,N))
    for day in range(0, len(data_list[0])):
      for i in range(N):
        for j in range(N):
          data_list[i][day] = data_list[i][day].squeeze()
          data_list[j][day] = data_list[j][day].squeeze()
          res = func(data_list[i][day], data_list[j][day]) 
          cm_res[i,j] = res
      
      cm_res_list.append(cm_res)
    
    return cm_res_list

In [70]:
# test code

data_path = "../data"
start_from = "2021-04-12"
end_before = "2021-06-15"
interval = "3h"

start_from_timestamp = datestring_to_timestamp(start_from)
end_before_timestamp = datestring_to_timestamp(end_before)
data_3h_path_list = list(Path(data_path, interval).iterdir())

coins_1D_pass = []
for file_path in data_3h_path_list:
    _coin_code = file_path.name.split("_")[0]
    if check_integrity(start_from, end_before, file_path) is not None:
        coins_1D_pass.append(file_path)

#print(len(coins_1D_pass))
# load csv and build list
crypto_data_list, name_list = build_data_list(coins_1D_pass, '3h')

# calculate correlation matrix

correlation_matrix_pearson = calculate_cm(crypto_data_list, cm_pearson)
#print(len(crypto_data_list))
#print(len(crypto_data_list[0]))
print(len(correlation_matrix_pearson))
print(correlation_matrix_pearson)

64
[array([[ 1.        ,  0.72191111,  0.67501888, ...,  0.39107018,
         0.09529164,  0.94013164],
       [ 0.72191111,  1.        ,  0.75500237, ...,  0.5118613 ,
         0.52936746,  0.85134086],
       [ 0.67501888,  0.75500237,  1.        , ...,  0.61916045,
         0.33137815,  0.70360944],
       ...,
       [ 0.39107018,  0.5118613 ,  0.61916045, ...,  1.        ,
        -0.15858355,  0.32194608],
       [ 0.09529164,  0.52936746,  0.33137815, ..., -0.15858355,
         1.        ,  0.41416047],
       [ 0.94013164,  0.85134086,  0.70360944, ...,  0.32194608,
         0.41416047,  1.        ]]), array([[ 1.        ,  0.72191111,  0.67501888, ...,  0.39107018,
         0.09529164,  0.94013164],
       [ 0.72191111,  1.        ,  0.75500237, ...,  0.5118613 ,
         0.52936746,  0.85134086],
       [ 0.67501888,  0.75500237,  1.        , ...,  0.61916045,
         0.33137815,  0.70360944],
       ...,
       [ 0.39107018,  0.5118613 ,  0.61916045, ...,  1.        ,
     

### Save correlation matrix to files in format of Numpy array