## Imports
This section collects all the imports used in the notebook.

In [1]:
import os
import pickle
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# for choosing random dates
import random
from datetime import datetime, timedelta

# for getting the wasserstein distance
import itertools
import persim
import ripser
from persim import wasserstein
from scipy.stats import wasserstein_distance

# to remove the warnings
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

In [2]:
# Folder (relative to the notebook) that holds the pickled distance matrices.
# NOTE(review): the prose in the next section mentions a `test_files` folder — confirm which name is current.
directory = "matrices"

## Choosing Random Dates
This section picks five random weekdays from the non-crash years preceding the crash year. The files for these dates have been added to a folder called `test_files`.

In [3]:
# Fixed seed so the same five dates are drawn on every run.
random.seed(1050)
start_date = datetime(2013, 11, 1)
end_date = datetime(2019, 11, 30)
delta = (end_date - start_date).days

# Walk every calendar day in [start_date, end_date] and keep Monday-Friday
# only (weekday() returns 0-4 for those days).
all_days = (start_date + timedelta(days=offset) for offset in range(delta + 1))
weekdays = [day for day in all_days if day.weekday() < 5]

# Sample without replacement, then list the picks chronologically.
random_dates = random.sample(weekdays, 5)

for picked in sorted(random_dates):
    print(picked.strftime("%Y-%m-%d"))

2016-05-12
2016-07-22
2017-02-14
2017-05-11
2018-08-17


## Wasserstein distance from ChatGPT
This section computes the Wasserstein distance between the test files. The code was generated by ChatGPT, to see whether it could produce code that accurately calculates the Wasserstein distance.

This code uses persim. 

In [4]:
# Build one H0 persistence diagram per pickled distance matrix in `directory`,
# then fill a symmetric table with persim's Wasserstein distance for every pair.
diagrams = []
file_names = []

for name in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, name)
    
    if os.path.isfile(file_path) and name.endswith('.pickl'):
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only point
            # `directory` at files you trust.
            with open(file_path, 'rb') as f:
                data = pickle.load(f)
                dist_matrix = data.get('distance matrix')
                if dist_matrix is None:
                    print(f"Distance matrix not found in {name}")
                    # Skip this file: without the `continue`, None would be
                    # handed to ripser below and the whole cell would fail.
                    continue
        except (pickle.UnpicklingError, EOFError, KeyError) as e:
            print(f"Error loading {name}: {e}")
            continue
    
        # maxdim=0 requests only the H0 diagram; 'dgms'[0] is that diagram.
        dgm = ripser.ripser(dist_matrix, distance_matrix=True, maxdim=0)['dgms'][0]
        diagrams.append(dgm)
        file_names.append(name)
        
distance_matrix = pd.DataFrame(
    0.0, 
    index=file_names, 
    columns=file_names
)

# Only the upper triangle (including the diagonal) is computed; each value is
# mirrored into the lower triangle.
for i in range(len(diagrams)):
    for j in range(i, len(diagrams)):
        # Keep only rows whose second column (death) is finite.
        d1 = diagrams[i][~np.isinf(diagrams[i][:, 1])]
        d2 = diagrams[j][~np.isinf(diagrams[j][:, 1])]
        
        dist = wasserstein(d1, d2, matching=False)
        distance_matrix.iloc[i, j] = dist
        distance_matrix.iloc[j, i] = dist

distance_matrix.round(4)

Unnamed: 0,110 2016-05-12 00_00_00.pickl,110 2016-07-22 00_00_00.pickl,110 2017-02-14 00_00_00.pickl,110 2017-05-11 00_00_00.pickl,110 2018-08-17 00_00_00.pickl
110 2016-05-12 00_00_00.pickl,0.0,16.5349,4.2957,5.9091,8.9897
110 2016-07-22 00_00_00.pickl,16.5349,0.0,12.8848,21.8747,10.9728
110 2017-02-14 00_00_00.pickl,4.2957,12.8848,0.0,9.3582,5.4162
110 2017-05-11 00_00_00.pickl,5.9091,21.8747,9.3582,0.0,13.9968
110 2018-08-17 00_00_00.pickl,8.9897,10.9728,5.4162,13.9968,0.0


This code uses the wasserstein_distance function from scipy. 

In [5]:
# Build one H0 persistence diagram per pickled distance matrix, reduce each
# diagram to the lifespans (death - birth) of its finite bars, and compare the
# lifespan samples pairwise with scipy's 1-D Wasserstein distance.
diagrams = []
file_names = []

for name in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, name)

    # Ignore anything that is not a regular '.pickl' file.
    if not (os.path.isfile(file_path) and name.endswith('.pickl')):
        continue

    try:
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
            dist_matrix = data.get('distance matrix')
    except (pickle.UnpicklingError, EOFError, KeyError) as e:
        print(f"Error loading {name}: {e}")
        continue

    if dist_matrix is None:
        print(f"Distance matrix not found in {name}")
        continue

    # maxdim=0 requests only the H0 diagram; 'dgms'[0] is that diagram.
    dgm = ripser.ripser(dist_matrix, distance_matrix=True, maxdim=0)['dgms'][0]
    diagrams.append(dgm)
    file_names.append(name)

distance_matrix = pd.DataFrame(0.0, index=file_names, columns=file_names)

# Lifespans of the finite bars, computed once per diagram.
lifespans = []
for diag in diagrams:
    finite_bars = diag[~np.isinf(diag[:, 1])]
    lifespans.append(finite_bars[:, 1] - finite_bars[:, 0])

# Visit each unordered pair (i <= j) once and mirror the value.
for i, j in itertools.combinations_with_replacement(range(len(diagrams)), 2):
    dist = wasserstein_distance(lifespans[i], lifespans[j])
    distance_matrix.iloc[i, j] = dist
    distance_matrix.iloc[j, i] = dist

distance_matrix.round(4)

Unnamed: 0,110 2016-05-12 00_00_00.pickl,110 2016-07-22 00_00_00.pickl,110 2017-02-14 00_00_00.pickl,110 2017-05-11 00_00_00.pickl,110 2018-08-17 00_00_00.pickl
110 2016-05-12 00_00_00.pickl,0.0,0.0893,0.0244,0.0245,0.0484
110 2016-07-22 00_00_00.pickl,0.0893,0.0,0.0699,0.1116,0.0895
110 2017-02-14 00_00_00.pickl,0.0244,0.0699,0.0,0.0441,0.0386
110 2017-05-11 00_00_00.pickl,0.0245,0.1116,0.0441,0.0,0.0596
110 2018-08-17 00_00_00.pickl,0.0484,0.0895,0.0386,0.0596,0.0


## Wasserstein distance for the test files
This section computes the Wasserstein distance between the test files, based on the plot of the H0 connected components. If the 5 files are good representatives, their pairwise Wasserstein distances should be close to zero.

In [None]:
# For every pickled distance matrix in `directory`, count the H0 classes that
# are still alive (infinite death) at 50 thresholds in [0, 1], min-max
# normalise that b0 curve, and store it in `b0_lists`.
b0_lists = []
# Filled in step with b0_lists so that entry i of both lists refers to the
# same file; entries that are skipped never get a label.
file_names = []

thresh_list = np.linspace(0, 1, 50)

for name in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, name)

    # Skip sub-folders (e.g. '.ipynb_checkpoints') and unrelated files, so a
    # stale `dist_matrix` from a previous iteration or cell is never reused.
    if not (os.path.isfile(file_path) and name.endswith('.pickl')):
        continue

    try:
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # `directory` at files you trust.
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
            dist_matrix = data.get('distance matrix')
    except (pickle.UnpicklingError, EOFError, KeyError) as e:
        print(f"Error loading {name}: {e}")
        continue

    if dist_matrix is None:
        print(f"Distance matrix not found in {name}")
        continue

    # One Rips filtration per threshold. Only the H0 diagram is read, so
    # maxdim=0 is sufficient and avoids computing unused H1/H2 diagrams.
    b0_data = []
    for t in thresh_list:
        dgm0 = ripser.ripser(dist_matrix, distance_matrix=True, thresh=t, maxdim=0)['dgms'][0]
        # b0 = number of H0 bars whose death (second column) is infinite.
        b0 = int(np.isinf(dgm0[:, 1]).sum())
        b0_data.append((t, b0))

    b0_df = pd.DataFrame(b0_data, columns=['threshold', 'b0'])

    # Min-max normalise over the b0 values themselves. min()/max() on the
    # (threshold, b0) tuples would order by threshold and return the b0 at the
    # first/last threshold instead of the smallest/largest b0.
    b0_min = b0_df['b0'].min()
    b0_span = b0_df['b0'].max() - b0_min
    if b0_span == 0:
        # Constant curve: avoid 0/0 and use a flat zero curve.
        b0_df['min max'] = 0.0
    else:
        b0_df['min max'] = (b0_df['b0'] - b0_min) / b0_span

    b0_lists.append(b0_df['min max'].to_list())
    file_names.append(name)

In [None]:
# Draw each normalised b0 curve in its own figure (plt.show() runs once per
# curve). The loop variable is not named `list`, because at notebook top level
# that would replace the builtin `list` for every cell executed afterwards.
for b0_curve in b0_lists:
    plt.plot(b0_curve)
    plt.show()

In [None]:
# Symmetric table of scipy Wasserstein distances between the curves stored in
# b0_lists, labelled by file_names.
distance_matrix = pd.DataFrame(0.0, index=file_names, columns=file_names)

# Visit each unordered pair (i <= j) once and mirror the value.
n_curves = len(b0_lists)
for i, j in itertools.combinations_with_replacement(range(n_curves), 2):
    dist = wasserstein_distance(b0_lists[i], b0_lists[j])
    distance_matrix.iloc[i, j] = dist
    distance_matrix.iloc[j, i] = dist

# Remove the Jupyter checkpoint folder's row and column when it is present.
checkpoint_label = '.ipynb_checkpoints'
if checkpoint_label in distance_matrix.index:
    distance_matrix = (
        distance_matrix
        .drop(checkpoint_label, axis=0)
        .drop(checkpoint_label, axis=1)
    )

distance_matrix.round(4)

Since most of the pairwise Wasserstein distances are close to zero, the 5 randomly chosen dates are good representatives of the non-crash years prior to the crash year.