In [51]:
import pandas as pd
import numpy as np
from gudhi.representations import Landscape
from tqdm import tqdm
import os

# Persistence-landscape settings shared by CalculateVoidSpace below.
num_landscapes = 5  # how many landscape functions are produced per diagram
resolution = 100    # how many sample points each landscape function has

# One transformer instance, reused for every persistence diagram.
landscape = Landscape(num_landscapes=num_landscapes, resolution=resolution)

class CalculateVoidSpace:
    """Select, for every (Folder, mol_num), the transition with the dominant landscape.

    On construction the Rips persistence table is read from ``rips_frame_path``
    and the result of :meth:`create_maximal_void_space_frame` is stored in
    ``self.void_frame``.

    The class relies on the module-level ``landscape`` transformer and the
    ``num_landscapes`` / ``resolution`` constants defined alongside it.
    """

    def __init__(self, rips_frame_path: str = "../TMD/frames/rips_2_frame.parquet"):
        """Load the Rips table and build (or load from cache) ``self.void_frame``.

        Args:
            rips_frame_path: Path of the parquet file holding the persistence pairs.
        """
        self.rips_frame_path = rips_frame_path
        rips_frame = self.collect_frame()
        self.void_frame = self.create_maximal_void_space_frame(rips_frame)

    def collect_frame(self) -> pd.DataFrame:
        """Read the parquet file and keep only the columns used downstream.

        Returns:
            A frame with the columns ``path``, ``Folder``, ``mol_num``,
            ``transition``, ``pers_id``, ``birth`` and ``death``.
        """
        frame = pd.read_parquet(self.rips_frame_path, engine='pyarrow')
        return frame[['path', 'Folder', 'mol_num', 'transition', 'pers_id', 'birth', 'death']]

    def find_largest_array(self, arrays: np.ndarray):
        """Return the index of the row that is the column-wise maximum most often.

        Each column "votes" for the row holding its largest value; the row with
        the most votes wins. When several rows have the same number of votes the
        smallest row index is returned.

        NOTE(review): ``np.argmax`` reports the first occurrence on ties, so a
        column in which all rows are equal (for example all zeros) votes for
        row 0 -- confirm that this bias is acceptable for the analysis.

        Args:
            arrays: 2-D array, one candidate per row.

        Returns:
            The winning row index (a numpy integer).
        """
        winners = np.argmax(arrays, axis=0)
        unique, counts = np.unique(winners, return_counts=True)
        # `unique` is sorted, and argmax picks the first maximum, so ties
        # resolve to the smallest row index.
        return unique[np.argmax(counts)]

    def get_max_landscape(self, values: np.ndarray):
        """Compute the landscapes of one diagram and return the dominant one.

        Args:
            values: Array of (birth, death) pairs for a single diagram.

        Returns:
            A tuple ``(max_landscape, max_landscape_idx)``: the sampled values of
            the dominant landscape as a list, and its index among the
            ``num_landscapes`` landscapes.
        """
        # fit_transform returns one flattened vector per diagram; un-flatten it
        # into one row per landscape function.
        landscape_rips = np.array(landscape.fit_transform([values])).reshape((num_landscapes, resolution))
        max_landscape_idx = self.find_largest_array(landscape_rips)
        max_landscape = landscape_rips[max_landscape_idx].tolist()
        return max_landscape, max_landscape_idx

    def create_maximal_void_space_frame(
        self,
        rips_frame: pd.DataFrame,
        outpath: str = "/workspaces/TwistedMaterials/notebooks/void_space.csv",
    ):
        """Build the table of dominant transitions, cached as CSV at ``outpath``.

        If ``outpath`` already exists it is read and returned without any
        computation. Otherwise the dominant landscape is computed for every
        (Folder, mol_num, transition) group, the dominant transition is chosen
        per (Folder, mol_num), and the result is written to ``outpath``.

        Args:
            rips_frame: Frame with at least the columns ``Folder``, ``mol_num``,
                ``transition``, ``birth`` and ``death``.
            outpath: CSV file used both as cache and as output location.

        Returns:
            A frame with the columns ``Folder``, ``mol_num`` and ``transition``,
            sorted by ``Folder`` and ``mol_num``.
        """
        if os.path.exists(outpath):
            return pd.read_csv(outpath)

        result_columns = ['Folder', 'mol_num', 'transition']
        meta_columns = result_columns + ['max_landscape_idx']

        # Stage 1: one dominant landscape per (Folder, mol_num, transition).
        categorical_features = []
        data_features = []
        for key, row_labels in tqdm(rips_frame.groupby(result_columns).groups.items()):
            values = rips_frame.loc[row_labels, ['birth', 'death']].values
            max_landscape, max_landscape_idx = self.get_max_landscape(values)
            categorical_features.append(list(key) + [max_landscape_idx])
            data_features.append(max_landscape)
        category_frame = pd.DataFrame(categorical_features, columns=meta_columns)
        data_frame = pd.DataFrame(data_features)
        total_frame = pd.concat([category_frame, data_frame], axis=1)

        # Stage 2: per (Folder, mol_num), pick the transition whose landscape
        # dominates. Only the sampled landscape values take part in the vote;
        # max_landscape_idx is carried in total_frame but not used here.
        maximal_void_spaces = []
        for _, row_labels in tqdm(total_frame.groupby(['Folder', 'mol_num']).groups.items()):
            subset_frame = total_frame.loc[row_labels, :]
            data = subset_frame.iloc[:, len(meta_columns):].values
            largest_landscape_idx = self.find_largest_array(data)
            winner = subset_frame.iloc[largest_landscape_idx, :len(result_columns)]
            maximal_void_spaces.append(winner.values.tolist())

        maximal_void_space_frame = pd.DataFrame(maximal_void_spaces, columns=result_columns)
        # Rows were extracted from mixed-dtype rows, so restore a numeric dtype
        # before sorting on mol_num.
        maximal_void_space_frame['mol_num'] = pd.to_numeric(maximal_void_space_frame['mol_num'])
        maximal_void_space_frame = (
            maximal_void_space_frame
            .sort_values(by=['Folder', 'mol_num'])
            .reset_index(drop=True)
        )

        # Write to the requested location (the cache check above reads the same
        # path) and make sure its folder exists so the computed result is not lost.
        out_dir = os.path.dirname(outpath)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        maximal_void_space_frame.to_csv(outpath, index=False)

        return maximal_void_space_frame

In [52]:
# Instantiating runs the whole pipeline: the Rips parquet is read and the
# void-space table is either loaded from its CSV cache or recomputed.
voidspace = CalculateVoidSpace()
voidframe = voidspace.void_frame
# Bare last expression so the notebook renders the frame.
voidframe

Unnamed: 0,Folder,mol_num,transition
0,IVDW_10,0,249
1,IVDW_10,1,1177
2,IVDW_10,2,1058
3,IVDW_10,3,0
4,IVDW_10,4,1079
...,...,...,...
91,IVDW_4,27,369
92,IVDW_4,28,779
93,IVDW_4,29,779
94,IVDW_4,30,459
