# Goal: Create the covariance, correlation, and distance matrix

This cell is for importing necessary modules.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import heapq # for minimum spanning tree
import pickle # for saving the data
import os
import itertools

This cell is for reading the data file.

In [2]:
PSE_data = pd.read_csv(
    "daily.csv"
)
PSE_data['date_id'] = pd.to_datetime(PSE_data['date_id'])
PSE_data = PSE_data.set_index("date_id")
df_filtered = PSE_data.drop(columns=[col for col in PSE_data.columns if not (col.startswith('PH_') and col.endswith('_P')) and col != 'date_id'])

This code is for removing the other rows that are not within the last 10 years of the data. I also used the date June 25, 2013 as the start date so that:
- the window size will not cut the amount of days
- when I calculate log returns, I will have exactly 10 years worth of data. 

In [3]:
df_filtered = df_filtered.loc['2013-06-25':'2023-09-26']

## Returns from raw prices
This block is for cleaning the data and removing columns with too many null values.

In [4]:
# def has_high_null_percentage(column, null_threshold):
#     return column.isnull().mean() > null_threshold

# # def has_consecutive_constant_values(column, threshold):
# #     constant_streak = column.groupby((column != column.shift()).cumsum()).transform('size')
# #     return constant_streak.max() > threshold

# # def has_end_constant_values(column, end_threshold):
# #     end_section = column.iloc[-int(len(column) * 0.05):]
# #     return end_section.nunique() == 1

# # def has_few_variations(column, variation_threshold):
# #     return column.nunique() < variation_threshold

# null_percentage_threshold = round(df_filtered.shape[0] * 0.85)
# # consecutive_constant_threshold = int(len(df_filtered) * 0.35) 
# # end_constant_threshold = 0.01
# # variation_threshold = 50

# columns_to_drop = []

# for col in df_filtered.columns:
#     if col == "date_id":  
#         continue
    
#     if (
#         has_high_null_percentage(df_filtered[col], null_percentage_threshold) # or
#         # has_consecutive_constant_values(df_filtered[col], consecutive_constant_threshold) or
#         # has_end_constant_values(df_filtered[col], end_constant_threshold) or
#         # has_few_variations(df_filtered[col], variation_threshold)
#     ):
#         columns_to_drop.append(col)

# df_filtered = df_filtered.drop(columns=columns_to_drop)

In [5]:
def has_high_null_percentage(column, null_threshold):
    return column.isnull().mean() > null_threshold

null_percentage_threshold = round(df_filtered.shape[0] * 0.85)
columns_to_drop = []

for col in df_filtered.columns:
    if col == "date_id":  
        continue
    if has_high_null_percentage(df_filtered[col], null_percentage_threshold):
        columns_to_drop.append(col)

df_filtered = df_filtered.drop(columns=columns_to_drop)

In [6]:
df_filtered

Unnamed: 0_level_0,PH_PIP_P,PH_ASA_P,PH_ABS_P,PH_AGN_P,PH_APC_P,PH_CHP_P,PH_CEU_P,PH_CIR_P,PH_CAA_P,PH_EEQ_P,...,PH_MJC_P,PH_PCK_P,PH_MRP_P,PH_LOT_P,PH_BAG_P,PH_H2O_P,PH_PRC_P,PH_SSN_P,PH_SIN_P,PH_SHK_P
date_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-06-25,,6.0,37.0,5.0,1.0,,12.0,13.0,46.0,12.0,...,1.0,6.0,8.0,5.0,11.0,7.0,9.0,15.0,0.0,
2013-06-26,,7.0,38.0,6.0,1.0,,12.0,13.0,47.0,13.0,...,1.0,6.0,8.0,5.0,11.0,6.0,9.0,15.0,0.0,
2013-06-27,,7.0,39.0,6.0,1.0,,12.0,12.0,49.0,13.0,...,1.0,6.0,8.0,5.0,11.0,7.0,10.0,14.0,0.0,
2013-06-28,,7.0,38.0,6.0,1.0,,12.0,13.0,49.0,13.0,...,1.0,6.0,8.0,5.0,11.0,7.0,10.0,15.0,0.0,
2013-07-01,,7.0,40.0,6.0,1.0,,12.0,12.0,49.0,13.0,...,1.0,6.0,10.0,5.0,11.0,7.0,10.0,15.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-20,2.0,11.0,3.0,3.0,0.0,1.0,8.0,2.0,44.0,5.0,...,1.0,4.0,7.0,4.0,5.0,1.0,7.0,2.0,1.0,9.0
2023-09-21,2.0,11.0,3.0,3.0,0.0,1.0,8.0,2.0,44.0,5.0,...,1.0,4.0,7.0,4.0,5.0,1.0,7.0,2.0,1.0,9.0
2023-09-22,2.0,11.0,3.0,3.0,0.0,1.0,8.0,2.0,44.0,5.0,...,1.0,4.0,7.0,3.0,5.0,1.0,7.0,2.0,1.0,9.0
2023-09-25,2.0,11.0,3.0,3.0,0.0,1.0,8.0,2.0,44.0,5.0,...,1.0,4.0,7.0,4.0,5.0,1.0,7.0,2.0,1.0,9.0


This cell is for calculating log returns.

In [7]:
PSE_log_returns = df_filtered.copy()
numeric_cols = PSE_log_returns.columns.difference(["date_id"])
PSE_log_returns[numeric_cols] = PSE_log_returns[numeric_cols].replace(0, np.nan)
PSE_log_returns[numeric_cols] = np.log(PSE_log_returns[numeric_cols] / PSE_log_returns[numeric_cols].shift(1))

I removed the the row for September 25, 2013, and I filled all the nan values with zero.

In [8]:
PSE_log_returns = PSE_log_returns.loc['2013-06-26':'2023-09-26']
PSE_log_returns.columns = PSE_log_returns.columns.str.replace(r"_P", "").str.replace(r"PH_", "")

# this was for testing
least_null_cols = PSE_log_returns.isna().sum().nsmallest(3).index
trial = PSE_log_returns[least_null_cols]
trial

Unnamed: 0_level_0,ASA,ABS,AGN
date_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-06-26,0.154151,0.026668,0.182322
2013-06-27,0.000000,0.025975,0.000000
2013-06-28,0.000000,-0.025975,0.000000
2013-07-01,0.000000,0.051293,0.000000
2013-07-02,0.000000,0.000000,0.000000
...,...,...,...
2023-09-20,0.000000,0.000000,0.000000
2023-09-21,0.000000,0.000000,0.000000
2023-09-22,0.000000,0.000000,0.000000
2023-09-25,0.000000,0.000000,0.000000


In [9]:
returns_df = PSE_log_returns.copy()
PSE_log_returns = PSE_log_returns.to_numpy()

## Create the matrices

This has the functions to create the covariance and correlation matrices.

In [10]:
# from Prof. Jose's code
def returns_to_covar(returns):
    num_col = skips
    test = np.zeros(shape = (num_col, num_col))
    
    for i, j in list(itertools.combinations(np.arange(num_col), 2)):
        row_mask = (~np.isnan(returns[:, i]))&(~np.isnan(returns[:, j]))
        pi = returns[:, i][row_mask]
        pj = returns[:, j][row_mask]
        test[i, j] = np.sum((pi - np.mean(pi))*(pj - np.mean(pj)))/len(pi)
        #print(i, j)
        
    test = np.where(test,test,test.T)
    for i in range(num_col):
        test[i, i] = np.nanstd(returns[:, i])**2
    
    return test

# from: https://gist.github.com/wiso/ce2a9919ded228838703c1c7c7dad13b
def covar_to_correl(cov_matrix):
    std_devs = np.sqrt(np.diag(cov_matrix))
    outer_v = np.outer(std_devs, std_devs)
    correlation = cov_matrix / outer_v
    correlation[cov_matrix == 0] = 0
    return correlation

This code is to create and save all the matrices (returns, covar, correl, and dist matrices) of each timestamp into one pickle file per date.

In [12]:
skips = len(returns_df.columns)
window = 110
counter = 0

for i in range(len(returns_df.index)):
    # this is to get the matrix
    returns = PSE_log_returns[i:i+window]

    # this is to get the covar matrix
    covar_matrix = returns_to_covar(returns)
    counter += 1
    # print(f"Covar matrix:\n{covar_matrix}")
    
    # this is to get the correl matrix
    correl_matrix = covar_to_correl(covar_matrix)
    # print(f"Correl matrix:\n{correl_matrix}")
    
    # this is to get the distance matrix
    distance_matrix = 1 - np.square(correl_matrix)
    # print(f"Distance matrix:\n{distance_matrix}")

    # print(f"index:{test_df.index[counter]}")

    # this is to save it in a pickle file
    # tables = {"PSE log returns": returns,
    #           "covariance matrix": covar_matrix, 
    #           "correlation matrix": correl_matrix, 
    #           "distance matrix": distance_matrix}
    
    # folder_path = "matrices"
    # name = returns_df.index[counter-1]
    # file_path = os.path.join(folder_path, f"{window} {name}.pickl")

    # pd.to_pickle(tables, file_path)

  test[i, j] = np.sum((pi - np.mean(pi))*(pj - np.mean(pj)))/len(pi)
  correlation = cov_matrix / outer_v


In [14]:
with open('matrices/110 2023-09-26 00:00:00.pickl', 'rb') as f:
    abc123 = pickle.load(f)
abc123

{'PSE log returns': array([[ 0.        ,  0.        ,  0.        ,  0.        ,         nan,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,         nan,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,         nan,
                 nan,  0.        ,  0.01709443,         nan,  0.        ,
          0.        ,  0.00634923,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.00542007,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,         nan,  0.        ,  0.        ,  0.03077166,
          0.        ,  0.        ,  0.        ,  0.        ,  0.05406722,
          0.        ,  0.        ,  0.        ,         nan,  0.        ,
          0.       

# Goal: Test the code

In [None]:
test = pd.DataFrame({
    'a':[1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1, 2, 2, 2, 2, 2, 2],
    'b':[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6],
    'c':[np.nan, 1, 9, np.nan, 1, 9, np.nan, 1, 9, np.nan, 1, 9, np.nan, 1, 9, np.nan, 1, 9]
},
    index=pd.date_range(20200601, periods=18, freq='d')
)

returns_to_covar(test)

### Prof Jose

boolean arrays, less for loops <br>
covariance cant have null values nanstd <br>
masked array<br>
dendrogram <br>

In [None]:
A = np.array([1, 2, 3, 4, 5, 6, np.nan])

In [None]:
print(np.std(A))
print(np.max(A))
print(np.mean(A))

In [None]:
print(np.nanstd(A))
print(np.nanmax(A))
print(np.nanmean(A))

In [None]:
# masked array
A[A > 3]

In [None]:
A > 3