<a href="https://colab.research.google.com/github/codingFerryman/crypto_market_hierarchy_structure/blob/main/src/crypto_correlation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
import pandas as pd
import numpy as np
from scipy.signal import correlate
from scipy.stats import spearmanr
from scipy.stats import pearsonr
import glob
import os
from pathlib import Path
from utils import check_integrity, datestring_to_timestamp, load_data

###Preprocessing and Load data

+ Preprocessing missing value

+ Load data

+ Build up list of original data vectors

+ Get the name list of cryptocurrency

#### Load data, Build up list of original data vectors, Get the name list of cryptocurrency

In [74]:
# Get file paths
def get_file_path(folder_path):
  paths = glob.glob(os.path.join(folder_path, "*"))
  paths = sorted(paths)
  return paths

In [134]:
# create the list of crytocurrencies price records, 
# each item means the list of original price vectors of each cryptocurrency
# each vector contains all the close prices of per day / per week (determing with time scale)
def get_interval(timescale):
  if timescale == '3h':
    return 8
  if timescale == '30m':
    return 48
  if timescale == '6h':
    return 4

def get_days(timescale):
  if timescale == '1d':
    return 1
  if timescale =='1w':
    return 7

def build_data_list(paths, interval, timescale, start_from_timestamp, end_before_timestamp):
  interval = get_interval(interval)
  timescale = get_days(timescale)
  data_list = []
  name_list = []
  
  for i in range(0, len(paths)):
    idx = 0
    shift_idx = interval*timescale - 1
    cryptocurrency_price_list = []
    
    cryptocurrency_df = load_data(start_from_timestamp, end_before_timestamp, paths[i], fill_na=True)
    
    #record the name of crytocurrency

    name_list.append(cryptocurrency_df.index[0][1])

    #discard other columns except close price
    close_price_df = cryptocurrency_df["close"]
    #print(close_price_df.head(10))
  
    while shift_idx <= len(close_price_df):
      vector = close_price_df.iloc[idx:shift_idx + 1]
      print(len(vector))
      #Calculate the price ratio of changing
      vector = vector.to_frame()
      vector = vector.pct_change().fillna(0)

      cryptocurrency_price_list.append(np.asarray(vector))
      idx += interval*timescale
      shift_idx += interval*timescale

    #print(len(cryptocurrency_price_list))
    data_list.append(cryptocurrency_price_list)  

  return data_list, name_list


In [191]:
def build_data_list_once(paths, interval, timescale, start_from_timestamp, end_before_timestamp):
  interval = get_interval(interval)
  timescale = get_days(timescale)
  
  name_list = []
  cryptocurrency_price_list = []
  for i in range(0, len(paths)):
    idx = 0
    shift_idx = interval*timescale - 1
    
    
    cryptocurrency_df = load_data(start_from_timestamp, end_before_timestamp, paths[i], fill_na=True)
    
    #record the name of crytocurrency

    name_list.append(cryptocurrency_df.index[0][1])

    #discard other columns except close price
    close_price_df = cryptocurrency_df["close"]
  
    vector = close_price_df.iloc[idx:shift_idx + 1]
  
    #Calculate the price ratio of changing
    vector = vector.to_frame()
    vector = vector.pct_change().fillna(0)
   
    flat_vector = vector['close'].values
    #print(flat_vector)
    cryptocurrency_price_list.append(np.asarray(flat_vector)) 

  return cryptocurrency_price_list, name_list

###Define functions to calculate correlation matrix

+ Pearson correlation coefficient
+ Spearman rank-order correlation coefficient

In [78]:
# Calculate correlation matrix - Pearson correlation coefficient
def cm_pearson(x, y):
  pccs = pearsonr(x,y)
  return pccs[0]

# Calculate correlation matrix - Spearman rank-order correlation coefficient
def cm_spearman(x, y):
  sccs = spearmanr(x, y)
  return sccs[0]

###Calculate Correlation matrix

In [79]:
def calculate_cm(data_list, func):
    N = len(data_list)
    cm_res_list = []
    cm_res = np.zeros((N,N))
    for day in range(0, len(data_list[0])):
      for i in range(N):
        for j in range(N):
          data_list[i][day] = data_list[i][day].squeeze()
          data_list[j][day] = data_list[j][day].squeeze()
          res = func(data_list[i][day], data_list[j][day]) 
          cm_res[i,j] = res
      
      cm_res_list.append(cm_res)
    
    return cm_res_list

In [205]:
def calculate_cm_once(data_list, func):
    N = len(data_list)
    cm_res = np.zeros((N,N))
    for i in range(N):
        for j in range(N):
            res = func(data_list[i], data_list[j]) 
            cm_res[i,j] = res
    #print(cm_res.shape)        
    return cm_res

In [None]:
# test code

data_path = "../data"
start_from = "2021-04-12"
end_before = "2021-06-15"
interval = "6h"

start_from_timestamp = datestring_to_timestamp(start_from)
end_before_timestamp = datestring_to_timestamp(end_before)
data_3h_path_list = list(Path(data_path, interval).iterdir())
print(len(data_3h_path_list))
coins_1D_pass = []
for file_path in data_3h_path_list:
    _coin_code = file_path.name.split("_")[0]#
    try:  
        if check_integrity(start_from, end_before, file_path) is not None:
            coins_1D_pass.append(file_path)
    except KeyError: print(file_path)
    

print(len(coins_1D_pass))
# load csv and build list
crypto_data_list, name_list = build_data_list(coins_1D_pass, '6h', '1w',start_from_timestamp, end_before_timestamp)

# calculate correlation matrix

correlation_matrix_pearson = calculate_cm(crypto_data_list, cm_pearson)
print(len(crypto_data_list))
print(len(correlation_matrix_pearson))
print(correlation_matrix_pearson[0])


### Save correlation matrix to files in format of Numpy array

In [116]:
np.save('0412_0615_1W_6h_69.npy', correlation_matrix_pearson, allow_pickle=True)

In [117]:
# save namelist
print(name_list)
textfile = open("0412-0615_1W_6h_namelist.txt", "w")
for element in name_list:
    textfile.write(element + "\n")
textfile.close()

['VSY', 'XAUT', 'CHZ', 'DAI', 'YFI', 'VET', 'DOT', 'BAL', 'BSV', 'BOSON', 'NEO', 'SUN', 'EGLD', 'BTC', 'ETC', 'ESS', 'ETP', 'RBT', 'BCHN', 'XSN', 'UOP', 'LINK', 'CLO', 'ZRX', 'SOL', 'ANT', 'AVAX', 'MLN', 'EOS', 'IQX', 'FIL', 'OMG', 'FET', 'XRP', 'XDC', 'XVG', 'SNX', 'PLU', 'ENJ', 'KSM', 'QTM', 'LTC', 'AAVE', 'KNC', 'BTT', 'XTZ', 'ZIL', 'COMP', 'CEL', 'BTG', 'SUSHI', 'LEO', 'BAT', 'HEZ', 'XMR', 'JST', 'UOS', 'WBT', 'ZEC', 'XLM', 'XRA', 'ETH', 'MKR', 'BAND', 'GTX', 'UNI', 'TRX', 'SAN', 'ADA']


In [None]:
# calculate correlation and check integrity by timescale

# timescale: daily
data_path = "../data"

# datarange
date_range = pd.date_range(start="2021-08-16",end="2021-10-18", freq='D').strftime('%Y-%m-%d')
#print(len(date_range))
interval = '30m'

cm_list = []
name_lists = []

for i in range(0, len(date_range)-1):
    start_from = date_range[i]
    end_before = date_range[i + 1]
  
    start_from_timestamp = datestring_to_timestamp(start_from)
    end_before_timestamp = datestring_to_timestamp(end_before)

    data_list = list(Path(data_path, interval).iterdir())

    checked_list = []
    for file_path in data_list:
        if check_integrity(start_from, end_before, file_path) is not None:
            checked_list.append(file_path)

    # load csv and build list
    crypto_daily_list, name_list = build_data_list_once(checked_list, '30m', '1d',start_from_timestamp, end_before_timestamp)

    # calculate correlation matrix
    correlation_matrix_pearson = calculate_cm_once(crypto_daily_list, cm_pearson)
    print(correlation_matrix_pearson.shape)

    cm_list.append(correlation_matrix_pearson)
    name_lists.append(name_list)
    


In [None]:
print(len(cm_list))
for item in cm_list:
    print(item.shape)

In [None]:
print(name_lists)
for item in name_lists:
    print(len(item))

In [218]:
#save to .npy file
np.save('../cm_data/0816_1018_1d_30m.npy', cm_list, allow_pickle=True)

  return array(a, dtype, copy=False, order=order, subok=True)


In [219]:
# save namelist
import json

with open('../cm_data/0816-1018_1d_30m_namelist.json', 'w') as f:
    json.dump(name_lists, f)

In [None]:
# time: monthly 

data_path = "../data"

# datarange
date_range = pd.date_range(start="2021-08-16",end="2021-10-18", freq='7D').strftime('%Y-%m-%d')
print(date_range)
interval = '3h'

cm_list = []
name_lists = []

for i in range(0, len(date_range)-1):
    start_from = date_range[i]
    end_before = date_range[i + 1]
  
    start_from_timestamp = datestring_to_timestamp(start_from)
    end_before_timestamp = datestring_to_timestamp(end_before)

    data_list = list(Path(data_path, interval).iterdir())
    #print(len(data_list))
    checked_list = []
    for file_path in data_list:
        if check_integrity(start_from, end_before, file_path) is not None:
            checked_list.append(file_path)
    #print(len(checked_list))
    
   
    # load csv and build list
    crypto_daily_list, name_list = build_data_list_once(checked_list, '3h', '1w',start_from_timestamp, end_before_timestamp)
    print(len(crypto_daily_list))
    print(crypto_daily_list[0].shape)
    
    
    # calculate correlation matrix
    correlation_matrix_pearson = calculate_cm_once(crypto_daily_list, cm_pearson)
    print(correlation_matrix_pearson.shape)
    print("-----------------")
    
    cm_list.append(correlation_matrix_pearson)
    name_lists.append(name_list)


In [None]:
print(len(cm_list))
for item in cm_list:
    print(item.shape)

In [None]:
print(len(name_lists))
for item in name_lists:
    print(len(item))

In [245]:
#save to .npy file
np.save('../cm_data/0816_1018_1W_3h.npy', cm_list, allow_pickle=True)

In [246]:
# save namelist

with open('../cm_data/0816-1018_1W_3h_namelist.json', 'w') as f:
    json.dump(name_lists, f)