# Goal: Create the correlation matrix

This cell is for importing necessary modules.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import heapq # for minimum spanning tree
import pickle # for saving the data
import os

This cell is for reading the data file.

In [2]:
# Load the raw daily table and index it by the parsed `date_id` column.
PSE_data = pd.read_csv("daily.csv")
PSE_data['date_id'] = pd.to_datetime(PSE_data['date_id'])
PSE_data = PSE_data.set_index("date_id")

# Keep only columns named PH_<...>_P (and `date_id`, should it still be a column).
is_kept = [
    (col.startswith('PH_') and col.endswith('_P')) or col == 'date_id'
    for col in PSE_data.columns
]
df_filtered = PSE_data.loc[:, is_kept]

This cell keeps only the rows that fall within the last 10 years of the data. The start date is September 25, 2013 (one row before the 10-year window begins) because computing log returns consumes the first row, which leaves exactly 10 years' worth of returns.

In [3]:
# Label-based slice on the date index; both end points are inclusive.
window_start, window_end = '2013-09-25', '2023-09-26'
df_filtered = df_filtered.loc[window_start:window_end]

## Cleaning data
This block is for cleaning the data and removing columns with too many null values.

In [4]:
def has_high_null_percentage(column, null_threshold):
    """Tell whether the share of missing entries in `column` exceeds `null_threshold`.

    Parameters
    ----------
    column : pandas.Series
        The values to inspect.
    null_threshold : float
        Cut-off on the same scale as `column.isnull().mean()`, i.e. a
        fraction between 0 and 1.

    Returns
    -------
    bool
        True when the null fraction is strictly greater than the threshold.
    """
    null_fraction = column.isnull().mean()
    return null_fraction > null_threshold

# def has_consecutive_constant_values(column, threshold):
#     constant_streak = column.groupby((column != column.shift()).cumsum()).transform('size')
#     return constant_streak.max() > threshold

# def has_end_constant_values(column, end_threshold):
#     end_section = column.iloc[-int(len(column) * 0.05):]
#     return end_section.nunique() == 1

# def has_few_variations(column, variation_threshold):
#     return column.nunique() < variation_threshold

# Largest tolerated share of missing values per column, as a fraction in [0, 1].
# has_high_null_percentage compares this with `column.isnull().mean()`, which is
# also a fraction. The earlier value, round(n_rows * 0.85), was a row count that a
# fraction can never exceed, so the filter silently dropped nothing.
# NOTE(review): 0.85 is read here as "drop columns that are more than 85% null";
# confirm that this is the intended cut-off.
null_percentage_threshold = 0.85
# consecutive_constant_threshold = int(len(df_filtered) * 0.35)
# end_constant_threshold = 0.01
# variation_threshold = 50

columns_to_drop = []

for col in df_filtered.columns:
    # `date_id` is never a candidate for removal.
    if col == "date_id":
        continue

    if (
        has_high_null_percentage(df_filtered[col], null_percentage_threshold) # or
        # has_consecutive_constant_values(df_filtered[col], consecutive_constant_threshold) or
        # has_end_constant_values(df_filtered[col], end_constant_threshold) or
        # has_few_variations(df_filtered[col], variation_threshold)
    ):
        columns_to_drop.append(col)

df_filtered = df_filtered.drop(columns=columns_to_drop)

In [5]:
# Show the price table after the null-column filter (large frames render truncated).
df_filtered

Unnamed: 0_level_0,PH_PIP_P,PH_ASA_P,PH_ABS_P,PH_AGN_P,PH_APC_P,PH_CHP_P,PH_CEU_P,PH_CIR_P,PH_CAA_P,PH_EEQ_P,...,PH_MJC_P,PH_PCK_P,PH_MRP_P,PH_LOT_P,PH_BAG_P,PH_H2O_P,PH_PRC_P,PH_SSN_P,PH_SIN_P,PH_SHK_P
date_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-09-25,,7.0,33.0,6.0,1.0,,11.0,12.0,46.0,10.0,...,1.0,6.0,11.0,6.0,13.0,5.0,10.0,12.0,0.0,
2013-09-26,,6.0,33.0,6.0,1.0,,11.0,12.0,46.0,10.0,...,1.0,6.0,11.0,6.0,12.0,6.0,10.0,11.0,0.0,
2013-09-27,,6.0,33.0,6.0,1.0,,11.0,12.0,46.0,10.0,...,1.0,6.0,11.0,6.0,13.0,5.0,10.0,11.0,0.0,
2013-09-30,,6.0,33.0,6.0,1.0,,11.0,12.0,46.0,10.0,...,1.0,6.0,11.0,6.0,12.0,5.0,10.0,11.0,0.0,
2013-10-01,,7.0,33.0,6.0,1.0,,11.0,12.0,46.0,10.0,...,1.0,6.0,10.0,6.0,13.0,5.0,10.0,11.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-20,2.0,11.0,3.0,3.0,0.0,1.0,8.0,2.0,44.0,5.0,...,1.0,4.0,7.0,4.0,5.0,1.0,7.0,2.0,1.0,9.0
2023-09-21,2.0,11.0,3.0,3.0,0.0,1.0,8.0,2.0,44.0,5.0,...,1.0,4.0,7.0,4.0,5.0,1.0,7.0,2.0,1.0,9.0
2023-09-22,2.0,11.0,3.0,3.0,0.0,1.0,8.0,2.0,44.0,5.0,...,1.0,4.0,7.0,3.0,5.0,1.0,7.0,2.0,1.0,9.0
2023-09-25,2.0,11.0,3.0,3.0,0.0,1.0,8.0,2.0,44.0,5.0,...,1.0,4.0,7.0,4.0,5.0,1.0,7.0,2.0,1.0,9.0


## Calculate returns from raw prices; log returns are probably easier, but you can try others like standardized returns if you have time

This cell is for calculating log returns.

In [6]:
# Every column except `date_id` is treated as a price series.
numeric_cols = df_filtered.columns.difference(["date_id"])

# Replace zero prices with NaN before forming ratios; a zero would otherwise
# give an infinite value under the logarithm.
prices = df_filtered[numeric_cols].replace(0, np.nan)

# Log return ln(P_t / P_{t-1}); the first row has no predecessor and becomes NaN.
PSE_log_returns = df_filtered.copy()
PSE_log_returns[numeric_cols] = np.log(prices / prices.shift(1))

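I removed the row for September 25, 2013, and I filled all the NaN values with zero. (Note: the cell below only drops that row and renames the columns; it does not contain a `fillna(0)` call.)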

In [7]:
# Drop the first row (2013-09-25): its log return is NaN because it has no previous price.
# `.copy()` makes the result independent of the frame it was sliced from before relabelling.
PSE_log_returns = PSE_log_returns.loc['2013-09-26':'2023-09-26'].copy()

# Strip only the leading "PH_" and the trailing "_P" from each column name.
# An unanchored replace of "_P" also removes the "_P" that begins tickers starting
# with P, e.g. "PH_PIP_P" became "PHIP" instead of "PIP".
# NOTE: pickle files written before this fix still carry the old names (PHIP, PHCK, PHRC).
PSE_log_returns.columns = (
    PSE_log_returns.columns
    .str.replace(r"^PH_", "", regex=True)
    .str.replace(r"_P$", "", regex=True)
)

# this was for testing: the three columns with the fewest missing values
least_null_cols = PSE_log_returns.isna().sum().nsmallest(3).index
trial = PSE_log_returns[least_null_cols]
trial

Unnamed: 0_level_0,ASA,ABS,AGN
date_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-09-26,-0.154151,0.0,0.0
2013-09-27,0.000000,0.0,0.0
2013-09-30,0.000000,0.0,0.0
2013-10-01,0.154151,0.0,0.0
2013-10-02,0.000000,0.0,0.0
...,...,...,...
2023-09-20,0.000000,0.0,0.0
2023-09-21,0.000000,0.0,0.0
2023-09-22,0.000000,0.0,0.0
2023-09-25,0.000000,0.0,0.0


## Finding an appropriate window size and creating a covariance matrix

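The cell below is for creating the covariance matrix. I used a window size of 6 months. (Note: the commented-out code below uses `rolling(window=55)`, i.e. a window of 55 rows; confirm that this matches the intended 6-month window.)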

In [8]:
# code is from: https://youtu.be/oRbESNj83mY?si=5qT_HAsnko5lPIFJ
# code should already be in a pickl file

# covar_matrix = PSE_log_returns.rolling(window=55).cov()
# covar_matrix

This is to create the covariance matrix pickle file.

In [9]:
# this is to save covar_matrix in a pickle

# with open("covar_matrix.pickl", 'wb') as file:
#     pickle.dump(covar_matrix, file)

This is to open the pickle file.

In [10]:
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
with open('covar_matrix.pickl', 'rb') as covar_file:
    covar_matrix = pickle.load(covar_file)
covar_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,PHIP,ASA,ABS,AGN,APC,CHP,CEU,CIR,CAA,EEQ,...,MJC,PHCK,MRP,LOT,BAG,H2O,PHRC,SSN,SIN,SHK
date_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013-09-26,PHIP,,,,,,,,,,,...,,,,,,,,,,
2013-09-26,ASA,,,,,,,,,,,...,,,,,,,,,,
2013-09-26,ABS,,,,,,,,,,,...,,,,,,,,,,
2013-09-26,AGN,,,,,,,,,,,...,,,,,,,,,,
2013-09-26,APC,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,H2O,0.0,0.0,0.000000,0.000000,,0.0,0.000000e+00,0.000000,0.000000,0.0,...,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.0,0.000000e+00
2023-09-26,PHRC,0.0,0.0,0.000036,0.001426,,0.0,-6.930625e-06,0.000021,-0.000015,0.0,...,0.0,-7.418235e-04,0.0,4.918081e-03,0.0,0.0,1.197040e-02,0.0,0.0,-1.800733e-20
2023-09-26,SSN,0.0,0.0,0.000000,0.000000,,0.0,0.000000e+00,0.000000,0.000000,0.0,...,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.0,0.000000e+00
2023-09-26,SIN,0.0,0.0,0.000000,0.000000,,0.0,0.000000e+00,0.000000,0.000000,0.0,...,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.0,0.000000e+00


### Prof Jose

boolean arrays, less for loops <br>
covariance can't have null values; use NaN-aware functions such as `nanstd` <br>
masked array<br>
dendrogram <br>

In [11]:
# Small float array with one missing value, used to compare plain and NaN-aware reductions.
A = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan])

In [12]:
# The plain reductions propagate NaN: a single missing value makes each result nan.
for reduce_fn in (np.std, np.max, np.mean):
    print(reduce_fn(A))

nan
nan
nan


In [13]:
# The nan* variants skip missing values and reduce over the remaining entries.
for nan_reduce_fn in (np.nanstd, np.nanmax, np.nanmean):
    print(nan_reduce_fn(A))

1.707825127659933
6.0
3.5


In [14]:
# Boolean-mask indexing: keep the elements for which the comparison is True.
# The NaN entry compares False, so it is left out of the result.
is_above_three = A > 3
A[is_above_three]

array([4., 5., 6.])

In [15]:
# Element-wise comparison: one boolean per element (the NaN entry yields False).
A > 3

array([False, False, False,  True,  True,  True, False])

# Goal:  Create the correlation and distance matrices for returns of PSE stocks over different windows and create the Minimum Spanning Tree

## Turn the covariance matrix into correlation matrix. Turn the correlation matrix into a distance matrix. Create the Minimum Spanning Tree. 

In [16]:
 # from: https://gist.github.com/wiso/ce2a9919ded228838703c1c7c7dad13b
def covar_to_correl(cov_matrix):
    """Convert a covariance matrix into a correlation matrix.

    Each entry is divided by the product of the two standard deviations taken
    from the diagonal. Entries whose covariance is exactly zero are set to 0
    (this also covers the 0/0 case of zero-variance series), and the diagonal
    is forced to 1.

    The computation is done on a fresh NumPy array rather than by writing
    through `DataFrame.values`, which is only safe when pandas hands back a
    writable view of its data (it is read-only under Copy-on-Write).

    Parameters
    ----------
    cov_matrix : pandas.DataFrame or array-like
        Square covariance matrix. It is not modified.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        A DataFrame with the same index and columns when a DataFrame was
        passed in, otherwise a NumPy array.
    """
    cov = np.asarray(cov_matrix, dtype=float)

    # abs() guards against tiny negative variances caused by round-off.
    std_devs = np.sqrt(np.abs(np.diag(cov)))
    outer_v = np.outer(std_devs, std_devs)

    # Division by a zero standard deviation is expected here and repaired below.
    with np.errstate(divide="ignore", invalid="ignore"):
        correlation = cov / outer_v  # new array, so the input stays untouched

    correlation[cov == 0] = 0
    np.fill_diagonal(correlation, 1)

    if isinstance(cov_matrix, pd.DataFrame):
        return pd.DataFrame(
            correlation, index=cov_matrix.index, columns=cov_matrix.columns
        )
    return correlation

# from: https://www.geeksforgeeks.org/prims-minimum-spanning-tree-mst-greedy-algo-5/
def prim_mst(distance_matrix):
    """Build a minimum spanning tree from a square distance matrix with Prim's algorithm.

    Nodes are named after the DataFrame's columns, and row ``i`` is taken to
    describe the same node as column ``i``. Distances are looked up by
    position, so the row index does not need to carry the same labels as the
    columns (for example a ``(date, ticker)`` MultiIndex on the rows works).

    Parameters
    ----------
    distance_matrix : pandas.DataFrame
        Square matrix of pairwise distances. NaN entries are treated as
        missing edges.

    Returns
    -------
    list of tuple
        ``(u, v, weight)`` edges in the order they were added, starting from
        the first node. Empty when the input is not a DataFrame or has no
        rows. If NaN entries disconnect the graph, only the component that
        contains the first node is spanned.

    Raises
    ------
    ValueError
        If the matrix is not square.
    """
    if not isinstance(distance_matrix, pd.DataFrame):
        return []  # Kept for backward compatibility with existing callers

    n = len(distance_matrix.index)
    if n == 0:  # Handle empty DataFrame case
        return []

    weights = distance_matrix.to_numpy(dtype=float)
    labels = list(distance_matrix.columns)
    if weights.shape[0] != weights.shape[1]:
        raise ValueError(
            f"distance_matrix must be square, got shape {weights.shape}"
        )

    visited = {0}  # positions of the nodes already in the tree
    mst_edges = []
    min_heap = []

    def push_edges_from(u):
        """Queue every usable edge from position `u` to a node not yet in the tree."""
        for v in range(n):
            if v in visited:
                continue
            weight = weights[u, v]
            if not np.isnan(weight):
                # Ordered by weight, then by the two labels; the trailing
                # position is used for lookups after popping.
                heapq.heappush(min_heap, (weight, labels[u], labels[v], v))

    push_edges_from(0)

    # Stop early once every node has been reached; remaining heap entries are stale.
    while min_heap and len(visited) < n:
        weight, u_label, v_label, v = heapq.heappop(min_heap)
        if v in visited:
            continue
        visited.add(v)
        mst_edges.append((u_label, v_label, weight))
        push_edges_from(v)

    return mst_edges

### All matrices in one file

The code below was to collect all the correlation and distance matrices into one data frame.

In [17]:
# whole_correl = pd.DataFrame()
# whole_dist = pd.DataFrame()
# skips = len(covar_matrix.columns)

# for i in range(0, len(covar_matrix.index), skips):
#     # this is to get the covar matrix
#     covar1 = covar_matrix.iloc[i:i+skips]
    
#     # this is to make the correlation matrix 
#     correl_matrix = covar_to_correl(covar1)
#     whole_correl = pd.concat([whole_correl, correl_matrix], axis=0)
    
#     # this is to make the distance matrix
#     distance_matrix = 1 - np.square(correl_matrix)
#     whole_dist = pd.concat([whole_dist, distance_matrix], axis=0)

#     # this is to create the MST
#     # d = distance_matrix.reset_index()
#     # to_dist = d.iloc[:, 1:]
    
#     # if to_dist.isnull().values.any():
#     #     continue
#     # else:
#     #     mst = prim_mst(to_dist)
        
#     #     if mst:
#     #         print("Minimum Spanning Tree Edges:")
#     #         for u, v, weight in mst:
#     #             print(f"({u}, {v}): {weight}")
#     #     else:
#     #         print("Could not compute MST. Check the input distance matrix.")

This cell is for storing the correlation and distance matrices into a pickle file.

In [18]:
# # for correlation matrix
# with open('correl_matrix.pickl', 'wb') as g:
#     pickle.dump(whole_correl, g)

# # for distance matrix
# with open('distance_matrix.pickl', 'wb') as f:
#     pickle.dump(whole_dist, f)

This cell is for opening correlation and distance matrices from the pickle file.

In [19]:
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
with open('distance_matrix.pickl', 'rb') as distance_file:
    distance_matrix = pickle.load(distance_file)
distance_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,PHIP,ASA,ABS,AGN,APC,CHP,CEU,CIR,CAA,EEQ,...,MJC,PHCK,MRP,LOT,BAG,H2O,PHRC,SSN,SIN,SHK
date_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013-09-26,PHIP,0.0,,,,,,,,,,...,,,,,,,,,,
2013-09-26,ASA,,0.0,,,,,,,,,...,,,,,,,,,,
2013-09-26,ABS,,,0.000000,,,,,,,,...,,,,,,,,,,
2013-09-26,AGN,,,,0.000000,,,,,,,...,,,,,,,,,,
2013-09-26,APC,,,,,0.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,H2O,1.0,1.0,1.000000,1.000000,,1.0,1.000000,1.000000,1.000000,1.0,...,1.0,1.000000,1.0,1.000000,1.0,0.0,1.0,1.0,1.0,1.0
2023-09-26,PHRC,1.0,1.0,0.999986,0.941563,,1.0,0.999988,0.999996,0.999993,1.0,...,1.0,0.949221,1.0,0.851546,1.0,1.0,0.0,1.0,1.0,1.0
2023-09-26,SSN,1.0,1.0,1.000000,1.000000,,1.0,1.000000,1.000000,1.000000,1.0,...,1.0,1.000000,1.0,1.000000,1.0,1.0,1.0,0.0,1.0,1.0
2023-09-26,SIN,1.0,1.0,1.000000,1.000000,,1.0,1.000000,1.000000,1.000000,1.0,...,1.0,1.000000,1.0,1.000000,1.0,1.0,1.0,1.0,0.0,1.0


In [20]:
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
with open('correl_matrix.pickl', 'rb') as correl_file:
    correl_matrix = pickle.load(correl_file)
correl_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,PHIP,ASA,ABS,AGN,APC,CHP,CEU,CIR,CAA,EEQ,...,MJC,PHCK,MRP,LOT,BAG,H2O,PHRC,SSN,SIN,SHK
date_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013-09-26,PHIP,1.0,,,,,,,,,,...,,,,,,,,,,
2013-09-26,ASA,,1.0,,,,,,,,,...,,,,,,,,,,
2013-09-26,ABS,,,1.000000,,,,,,,,...,,,,,,,,,,
2013-09-26,AGN,,,,1.000000,,,,,,,...,,,,,,,,,,
2013-09-26,APC,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,H2O,0.0,0.0,0.000000,0.000000,,0.0,0.000000e+00,0.000000,0.000000,0.0,...,0.0,0.000000e+00,0.0,0.000000e+00,0.0,1.0,0.000000e+00,0.0,0.0,0.000000e+00
2023-09-26,PHRC,0.0,0.0,0.003719,0.241738,,0.0,-3.518163e-03,0.002019,-0.002651,0.0,...,0.0,-2.253425e-01,0.0,3.852978e-01,0.0,0.0,1.000000e+00,0.0,0.0,-3.630060e-18
2023-09-26,SSN,0.0,0.0,0.000000,0.000000,,0.0,0.000000e+00,0.000000,0.000000,0.0,...,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.000000e+00,1.0,0.0,0.000000e+00
2023-09-26,SIN,0.0,0.0,0.000000,0.000000,,0.0,0.000000e+00,0.000000,0.000000,0.0,...,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.0,0.000000e+00,0.0,1.0,0.000000e+00


### Each timestamp as a file

This code saves the covariance, correlation, and distance matrices of each timestamp into its own pickle file (one file per date) inside the `matrices` folder.

In [21]:
# covar_matrix stacks one square block per date, so each block has as many
# rows as there are columns.
skips = len(covar_matrix.columns)

# pd.to_pickle does not create missing directories, so make sure the output
# folder exists before the first write.
folder_path = "matrices"
os.makedirs(folder_path, exist_ok=True)

# NOTE: `correl_matrix` and `distance_matrix` are reassigned on every iteration,
# so after this cell they hold the matrices of the last date only, not the
# full tables loaded from the pickle files in the earlier cells.
for i in range(0, len(covar_matrix.index), skips):
    # this is to get the covar matrix of a single date
    covar1 = covar_matrix.iloc[i:i+skips]

    # this is to make the correlation matrix
    correl_matrix = covar_to_correl(covar1)

    # this is to make the distance matrix: d = 1 - rho^2
    distance_matrix = 1 - np.square(correl_matrix)

    # this is to save it into a pickle file
    tables = {"covariance matrix": covar1,
              "correlation matrix": correl_matrix,
              "distance matrix": distance_matrix}

    # Every row of the block shares one date; use it as the file name.
    name = covar1.index.get_level_values("date_id")[0].strftime("%Y-%m-%d")
    file_path = os.path.join(folder_path, f"{name}.pickl")

    pd.to_pickle(tables, file_path)