# Precompute all features

In [2]:
%load_ext autoreload
%autoreload 2
import pickle
import nibabel as nib
import numpy as np 
import matplotlib.pyplot as plt
from skimage import measure # For marching cubes
import polyscope as ps # For mesh display
from persim import plot_diagrams, PersistenceImager
import pandas as pd
import os
import sys
import skimage
import skimage.io
sys.path.append("../src")
import glob
from geomstats import *
from topostats import *
from kernels import *
from utils3d import *


def load_dictionary(metadata_path, skip_ids=("UCSF-PDGM-0541",)):
    """
    Load the patient metadata CSV into a dictionary keyed by patient ID.

    Parameters
    ----------
    metadata_path: str
        Path to the metadata CSV file; it must have an "ID" column
    skip_ids: iterable of str
        Zero-padded patient IDs to leave out of the result.  Defaults to
        patient 541, which is skipped because its segmentation file is empty

    Returns
    -------
    dict: str -> pandas.Series
        Maps each zero-padded patient ID to its row of the CSV
    """
    df = pd.read_csv(metadata_path)
    data = {}
    for _, row in df.iterrows():
        # File paths have an extra 0 in the ID
        patient_id = row["ID"].replace("M-", "M-0")
        data[patient_id] = row
    for patient_id in skip_ids:
        # pop() with a default, so metadata that lacks the ID does not raise
        data.pop(patient_id, None)
    return data

def argsort(seq):
    """
    Return the indices that put seq in ascending order, as an integer
    numpy array.  Works on any indexable sequence whose elements can be
    compared (e.g. a list of strings); equal elements keep their
    original relative order.
    """
    order = list(range(len(seq)))
    order.sort(key=lambda i: seq[i])
    return np.array(order, dtype=int)

# Load the metadata, then attach to each patient the cropped binary volume
# ("B<name>") and the voxel coordinates ("X<name>") of every tumor label
metadata_path = "../Data/UCSF-PDGM-metadata_v2.csv"
all_data_path = "../Data/UCSF-PDGM-v3"
data = load_dictionary(metadata_path)

patients = list(data.keys())
diagnosis = [data[p]["Final pathologic diagnosis (WHO 2021)"] for p in patients]
dead = np.array([data[p]["1-dead 0-alive"] for p in patients])
# Sort by dead/alive first, then re-sort that order by diagnosis.  The second
# sort is a stable Python sort, so patients end up grouped by diagnosis with
# alive (0) before dead (1) inside each group
idx = np.argsort(dead)
idx = idx[argsort([diagnosis[i] for i in idx])]
patients = [patients[i] for i in idx]

iso_names = {1:"Necrotic", 2:"Edema", 4:"Main Tumor"} # What the labels actually mean
iso_levels = [2, 4, 1] # Column order of the labels
tumor_types = ["Edema", "Main Tumor", "Necrotic"]

for folder_name in os.listdir(all_data_path):
    print(".", end="")
    patient_folder_path = os.path.join(all_data_path, folder_name)
    # Drop the 6-character folder suffix to get the patient ID
    patient = folder_name[:-6]

    # Build the file name from the folder name itself rather than from a
    # fixed-width slice of the full path, which silently assumed that every
    # folder name is exactly 20 characters long
    tumor_seg_path = os.path.join(patient_folder_path, patient + "_tumor_segmentation.nii.gz")
    if patient not in data or not os.path.exists(tumor_seg_path):
        continue
    tumor_seg_nifti = nib.load(tumor_seg_path)
    tumor_seg_mat = tumor_seg_nifti.get_fdata()

    for level in iso_levels:
        binary = tumor_seg_mat == level
        level_name = iso_names[level]
        B = crop_binary_volume(binary)
        data[patient]["B{}".format(level_name)] = B
        X = binary_volume_2coords(binary)
        data[patient]["X{}".format(level_name)] = X

# Drop patients from the metadata for which no segmentation was loaded above
to_delete = [p for p in data if "XEdema" not in data[p].keys()]
print(to_delete)
for p in to_delete:
    del data[p]
print(len(data))

.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................['UCSF-PDGM-0138', 'UCSF-PDGM-0175', 'UCSF-PDGM-0181', 'UCSF-PDGM-0278', 'UCSF-PDGM-0289', 'UCSF-PDGM-0315']
494


## Total Persistences

(TODO Later: Grab and sort)

In [None]:
# For each persistence type, save one feature vector per patient holding the
# total persistence of every precomputed diagram, over all tumor types
for pers_type in ["alpha", "cubical"]:
    out_dir = "../preprocessed/{}_total".format(pers_type)
    os.makedirs(out_dir, exist_ok=True)
    for p in data:
        fileout = "{}/{}.pkl".format(out_dir, p)
        if os.path.exists(fileout):
            # Already computed on a previous run
            continue
        # Use precomputed persistence diagrams
        with open("../preprocessed/{}.pkl".format(p), "rb") as fin:
            res = pickle.load(fin)
        pers = []
        for tumor_type in tumor_types:
            PDs = res["{}_{}_PDs".format(tumor_type, pers_type)]
            for PD in PDs:
                if PD.size > 0:
                    # Sum of (column 1 - column 0) over all points; presumably
                    # the columns are (birth, death) -- TODO confirm
                    pers.append(np.sum(PD[:, 1]-PD[:, 0]))
                else:
                    # Empty diagram contributes zero persistence
                    pers.append(0)
        # Context manager so the file is flushed and closed deterministically
        with open(fileout, "wb") as fout:
            pickle.dump({"x":np.array(pers)}, fout)

## Shape Histograms

In [None]:
# Save, per patient, the shape histograms of all tumor types concatenated
# in the order of tumor_types
r_max = 75
n_shells = 20

out_dir = "../preprocessed/shapehist"
os.makedirs(out_dir, exist_ok=True)
for p in data:
    fileout = "{}/{}.pkl".format(out_dir, p)
    if os.path.exists(fileout):
        # Already computed on a previous run
        continue
    hists = np.array([])
    for tumor_type in tumor_types:
        X = data[p]["X{}".format(tumor_type)]
        h = get_shape_hist(X, n_shells=n_shells, r_max=r_max)
        if hists.size == 0:
            hists = h
        else:
            hists = np.concatenate((hists, h))
    # Context manager so the file is flushed and closed deterministically
    with open(fileout, "wb") as fout:
        pickle.dump({"x":hists}, fout)

## Shape Shell Histograms

In [3]:
# Save, per patient, the shape shell histograms of all tumor types
# concatenated in the order of tumor_types
r_max = 75
n_shells = 20
subdiv = 1 # How many times to subdivide the sphere for sector points

out_dir = "../preprocessed/shapeshellhist"
os.makedirs(out_dir, exist_ok=True)
for p in data:
    fileout = "{}/{}.pkl".format(out_dir, p)
    if os.path.exists(fileout):
        # Already computed on a previous run
        continue
    hists = np.array([])
    for tumor_type in tumor_types:
        X = data[p]["X{}".format(tumor_type)]
        h = get_shape_shell_hist(X, n_shells=n_shells, r_max=r_max, subdiv=subdiv)
        # Progress/debug output: shape of each per-tumor-type histogram
        print(h.shape)
        if hists.size == 0:
            hists = h
        else:
            hists = np.concatenate((hists, h))
    # Context manager so the file is flushed and closed deterministically
    with open(fileout, "wb") as fout:
        pickle.dump({"x":hists}, fout)

## Shape PCA Histograms

In [None]:
# Save, per patient, the shape PCA histograms of all tumor types
# concatenated in the order of tumor_types
r_max = 75
n_shells = 10

out_dir = "../preprocessed/shapehistpca"
os.makedirs(out_dir, exist_ok=True)
for p in data:
    fileout = "{}/{}.pkl".format(out_dir, p)
    if os.path.exists(fileout):
        # Already computed on a previous run
        continue
    hists = np.array([])
    for tumor_type in tumor_types:
        X = data[p]["X{}".format(tumor_type)]
        h = get_shape_pca_hist(X, n_shells=n_shells, r_max=r_max)
        if hists.size == 0:
            hists = h
        else:
            hists = np.concatenate((hists, h))
    # Context manager so the file is flushed and closed deterministically
    with open(fileout, "wb") as fout:
        pickle.dump({"x":hists}, fout)

## D2 Histogram

In [None]:
# Save, per patient, the D2 histograms of all tumor types concatenated in
# the order of tumor_types
r_max = 75
n_bins = 40 # Number of bins in the histogram between [0, d_max]
n_samples = 10000 # Number of random samples

out_dir = "../preprocessed/d2"
os.makedirs(out_dir, exist_ok=True)
for p in data:
    fileout = "{}/{}.pkl".format(out_dir, p)
    if os.path.exists(fileout):
        # Already computed on a previous run
        continue
    hists = np.array([])
    for tumor_type in tumor_types:
        X = data[p]["X{}".format(tumor_type)]
        # d_max is passed as twice r_max
        h = get_d2_hist(X, d_max=r_max*2, n_bins=n_bins, n_samples=n_samples)
        if hists.size == 0:
            hists = h
        else:
            hists = np.concatenate((hists, h))
    # Context manager so the file is flushed and closed deterministically
    with open(fileout, "wb") as fout:
        pickle.dump({"x":hists}, fout)

## Spin Images

In [None]:
# Save, per patient, the flattened spin images of all tumor types
# concatenated in the order of tumor_types
r_max = 75
n_angles = 50
dim = 64

out_dir = "../preprocessed/spinimages"
os.makedirs(out_dir, exist_ok=True)
for p in data:
    fileout = "{}/{}.pkl".format(out_dir, p)
    if os.path.exists(fileout):
        # Already computed on a previous run
        continue
    imgs = np.array([])
    for tumor_type in tumor_types:
        X = data[p]["X{}".format(tumor_type)]
        # Flatten each image so the per-type results can be joined end to end
        img = get_spin_image(X, n_angles, r_max, dim).flatten()
        if imgs.size == 0:
            imgs = img
        else:
            imgs = np.concatenate((imgs, img))
    # Context manager so the file is flushed and closed deterministically
    with open(fileout, "wb") as fout:
        pickle.dump({"x":imgs}, fout)

## Connected Components

In [None]:
# Save, per patient, the largest connected-component counts of each tumor
# type in a fixed-length vector
max_components = 10

out_dir = "../preprocessed/connectedcomponents"
os.makedirs(out_dir, exist_ok=True)
for p in data:
    fileout = "{}/{}.pkl".format(out_dir, p)
    if os.path.exists(fileout):
        # Already computed on a previous run
        continue
    # One slot of max_components entries per tumor type.  Sized from
    # tumor_types, the list actually iterated below; unused entries stay zero
    all_counts = np.zeros(max_components*len(tumor_types))
    for i, tumor_type in enumerate(tumor_types):
        B = data[p]["B{}".format(tumor_type)]
        B = crop_binary_volume(B)
        if B.size > 0:
            labels = label_volume_components(B, cluster_cutoff=1)
            # Largest counts first
            counts = sorted(get_label_counts(labels).flatten(), reverse=True)
            # Keep at most max_components values in this tumor type's slot
            n_keep = min(max_components, len(counts))
            i1 = i*max_components
            all_counts[i1:i1+n_keep] = counts[:n_keep]
    # Context manager so the file is flushed and closed deterministically
    with open(fileout, "wb") as fout:
        pickle.dump({"x":all_counts}, fout)