In [1]:
# Basic imports and configuration for the project.
import os
import sys
import pandas as pd  # type: ignore

from util import (
    identify_wsi_based_on_prefix,
    update_flag_and_check_missing,
    get_wsi_files_by_biopsy_id_and_stain,
    check_if_file_exists,
    check_if_file_openable
)

# Extend the system path to include the parent directory for module imports.
sys.path.append(os.path.abspath('..'))

# Import constants and utility functions from utils module.
from utils.utils_constants import (
    NEPTUNE_PAT_INFO_PATH as PAT_INFO_PATH,
    NEPTUNE_WSI_INFO_PATH as WSI_INFO_PATH,
    VESSEL_NEPTUNE_PAT_INFO_PATH,
    VESSEL_SEGMENTATION_REF_PATH,
    INNER_SEGMENTATION_REF_DIR,
    TRI_CASE_DIR
)

# Emit INFO-and-above records with a timestamp and level prefix so that
# messages can be traced while the notebook runs.
import logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Stain type shared by every cell below.
STAIN = "TRI"

In [2]:
# Load the data
# pat_df: table read from the CSV at PAT_INFO_PATH (alias of NEPTUNE_PAT_INFO_PATH);
#         later cells look rows up by its 'BiopsyID' column.
# wsi_df: table read from the Excel workbook at WSI_INFO_PATH (alias of
#         NEPTUNE_WSI_INFO_PATH); later cells pass it to the WSI lookup helpers.
pat_df = pd.read_csv(PAT_INFO_PATH)
wsi_df = pd.read_excel(WSI_INFO_PATH)

In [3]:
# Selecting cases in vessel segmentation task (Jayapandian & Chen, KI, 2020)
# The first three rows of the reference sheet are skipped before parsing.
seg_ref = pd.read_excel(VESSEL_SEGMENTATION_REF_PATH, skiprows=3)

# Keep only rows that have both a file name and a value in the STAIN column.
has_name_and_count = seg_ref["File_Name"].notna() & seg_ref[STAIN].notna()
seg_ref = seg_ref.loc[has_name_and_count]

n_arteries = int(seg_ref[STAIN].sum())
n_slides = len(seg_ref)
print(f"Total {n_arteries} Arteries Annotated in {STAIN} Stained {n_slides} Slides")

# Normalise the file-name prefixes: trim whitespace and use "_" instead of "-".
seg_prefixes = [name.strip().replace("-", "_") for name in seg_ref["File_Name"]]
seg_biopsies, seg_biopsy_to_wsi_map = identify_wsi_based_on_prefix(wsi_df, seg_prefixes, STAIN)

Total 837 Arteries Annotated in TRI Stained 58 Slides
Notice: Prefix '11_26609_023_507' is associated with 3 TRI Stained WSI File(s).
Notice: Prefix '11_26609_023_510' is associated with 3 TRI Stained WSI File(s).
Notice: Prefix '11_26609_025_503' is associated with 2 TRI Stained WSI File(s).
Notice: Prefix '11_26609_027_506' is associated with 2 TRI Stained WSI File(s).
Notice: Prefix '12_26609_021_508' is associated with no unique biopsy ID(s).
Notice: Prefix '12_26609_024_502' is associated with 2 TRI Stained WSI File(s).
Notice: Prefix '12_26609_027_511' is associated with 3 TRI Stained WSI File(s).
Notice: Prefix '12_26609_027_513' is associated with 2 TRI Stained WSI File(s).
Notice: Prefix '12_26609_028_012' is associated with 2 TRI Stained WSI File(s).
Notice: Prefix '12_26609_032_002' is associated with 2 TRI Stained WSI File(s).
Notice: Prefix '12_26609_033_511' is associated with 0 TRI Stained WSI File(s).
Notice: Prefix '13_26609_027_521' is associated with 2 TRI Stained WS

In [4]:
# Selecting cases in intra-arterial segmentation task (Zhou, JMI, 2024)
# Each "*.geojson" file in INNER_SEGMENTATION_REF_DIR contributes one prefix
# (the file name without its extension).
#  - os.listdir returns entries in arbitrary order, so the prefixes are sorted
#    to make the helper's input (and its printed notices) reproducible.
#  - Only the trailing extension is stripped; str.replace would also delete a
#    ".geojson" substring occurring in the middle of a name.
GEOJSON_EXT = ".geojson"
inner_seg_prefixes = sorted(
    fname[:-len(GEOJSON_EXT)]
    for fname in os.listdir(INNER_SEGMENTATION_REF_DIR)
    if fname.endswith(GEOJSON_EXT)
)
inner_seg_biopsies, inner_seg_biopsy_to_wsi_map = identify_wsi_based_on_prefix(wsi_df, inner_seg_prefixes, STAIN)



In [5]:
# Selecting cases in Tubule task (Fan)
# Collect the distinct biopsy IDs of WSI rows whose USE_Tubule flag equals 1.
is_tubule_case = wsi_df["USE_Tubule"] == 1
tubule_biopsies = set(wsi_df.loc[is_tubule_case, "biopsyid"].unique())

In [6]:
# Flag, in the patient table, the biopsies used by each task.
# Every (flag column, biopsy-ID collection) pair goes through the same helper,
# applied in the order listed; the helper's return value replaces pat_df.
flag_sources = (
    ("USE_Tubule", tubule_biopsies),
    ("Used_in_Vessel_Seg", seg_biopsies),
    ("Used_in_Inner_Structure_Seg", inner_seg_biopsies),
)
for flag_column, flagged_biopsies in flag_sources:
    pat_df = update_flag_and_check_missing(pat_df, flagged_biopsies, flag_column)

Missing biopsy IDs for USE_Tubule: {'0_2245', '0_2910', '0_1641', '0_1678'}
Missing biopsy IDs for Used_in_Vessel_Seg: {'0_3244', '0_1558', '0_3086', '0_3206', '0_1554', '0_2237', '0_1577', '0_770', '0_3130', '0_1518', '0_3188', '0_3168', '0_3214'}
Missing biopsy IDs for Used_in_Inner_Structure_Seg: {'0_3282', '0_2992', '0_2972'}


In [7]:
# Update clinical DataFrame with WSI file names for vessel and inner structure segmentation.

# Create the target column up front. Previously it only came into existence as
# a side effect of the first matching assignment below, so when none of the
# mapped biopsy IDs occurred in pat_df the fallback loop failed with KeyError.
if 'WSI_Selected' not in pat_df.columns:
    pat_df['WSI_Selected'] = None

# Merge seg_biopsy_to_wsi_map and inner_seg_biopsy_to_wsi_map, with
# inner_seg_biopsy_to_wsi_map taking precedence in case of overlapping keys.
segmentation_wsi_map = {**seg_biopsy_to_wsi_map, **inner_seg_biopsy_to_wsi_map}
for biopsy_id, wsi_file in segmentation_wsi_map.items():
    biopsy_rows = pat_df['BiopsyID'] == biopsy_id
    # Biopsy IDs that do not occur in the patient table are skipped.
    if biopsy_rows.any():
        pat_df.loc[biopsy_rows, 'WSI_Selected'] = wsi_file

nonexistent_files = []
# For additional biopsies, select a TRI file if not already assigned
for biopsy_id in pat_df["BiopsyID"].values:
    biopsy_rows = pat_df['BiopsyID'] == biopsy_id
    # Skip this biopsy if any of its rows already has WSI_Selected assigned.
    if not pat_df.loc[biopsy_rows, 'WSI_Selected'].isnull().all():
        continue
    filenames = get_wsi_files_by_biopsy_id_and_stain(wsi_df, STAIN, biopsy_id)
    for filename in filenames:
        file_exists = check_if_file_exists(TRI_CASE_DIR, filename)
        # Only try to open files that are present on disk.
        file_openable = file_exists and check_if_file_openable(TRI_CASE_DIR, filename)
        if file_openable:
            # Assign first openable file to WSI_Selected and stop checking further
            pat_df.loc[biopsy_rows, 'WSI_Selected'] = filename
            break  # Found a suitable file, exit the loop
        elif not file_exists:
            # Assign the filename for further action but note it's nonexistent.
            # NOTE(review): the search stops at the first missing file, so any
            # later candidates for this biopsy are never checked - confirm intended.
            pat_df.loc[biopsy_rows, 'WSI_Selected'] = filename
            nonexistent_files.append(filename)  # Keep track of nonexistent files
            break  # Exit the loop after assignment
        else:
            # File exists but is not openable; print a message and continue checking other files
            print(f"Cannot open: {filename}")

# Check if there are any nonexistent files recorded
if nonexistent_files:
    # Join the list of nonexistent files into a single string for better readability in the print statement
    files_list_str = ', '.join(nonexistent_files)
    print(f"The following files need to be found and uploaded: {files_list_str}")


Cannot open: 13_26609_027_520 L03 TRI.svs
The following files need to be found and uploaded: 0_2245_A_0056525.svs, 0_2910_A_0053486.svs, 0_1641_A_0047866.svs, 0_1678_A_0047884.svs, 17_26609_035_507 L3 TRI.svs, 11_26609_023_507 L12 TRI.ndpi, 12_26609_028_012 L05 TRI.ndpi, 11_26609_027_506 L03 TRI.svs


In [8]:
# Summary statistics over pat_df, which by this point is expected to carry the
# "Used_in_Vessel_Seg", "Used_in_Inner_Structure_Seg" and "WSI_Selected" columns.

# Calculate statistics
total_biopsies = pat_df.shape[0]
used_in_vessel_seg = pat_df["Used_in_Vessel_Seg"].sum()
used_in_inner_seg = pat_df["Used_in_Inner_Structure_Seg"].sum()

# Count rows that have a selected WSI and both outcome columns populated.
required_columns = ["WSI_Selected", "ESRDorEGFR40BX_LR", "DaysBXtoESRDorEGFR40_LR"]
has_all_required = pat_df[required_columns].notna().all(axis=1)
files_assigned_w_clinical_info = int(has_all_required.sum())

# Print the statistics
print(f"Total Biopsies: {total_biopsies}")
print(f"Biopsies Used in Vessel Segmentation: {used_in_vessel_seg}")
print(f"Biopsies Used in Inner Structure Segmentation: {used_in_inner_seg}")
print(f"Biopsies with a File Assigned for Vessel Project: {files_assigned_w_clinical_info}")

Total Biopsies: 314
Biopsies Used in Vessel Segmentation: 57
Biopsies Used in Inner Structure Segmentation: 20
Biopsies with a File Assigned for Vessel Project: 247


In [9]:
# De-identification: give every selected WSI a fake name of the form
# 'Biopsy_{index}_WSI_001', e.g. 'Biopsy_001_WSI_001'.
#  - {index} is the row's DataFrame index label plus one, zero-padded to three
#    digits, so rows without a selected WSI still consume a number.
#  - The suffix is always 'WSI_001' because only one WSI is selected per biopsy
#    for now; the format leaves room for several WSIs per biopsy later.
#  - Rows whose 'WSI_Selected' is null get None instead of a name.
pat_df['WSI_Selected_Fake_Name'] = [
    None if pd.isna(selected_wsi) else f"Biopsy_{idx+1:03d}_WSI_001"
    for idx, selected_wsi in zip(pat_df.index, pat_df['WSI_Selected'])
]

In [10]:
# Keep only biopsies that have a selected WSI and both outcome columns populated.
pat_df = pat_df.dropna(
    subset=["WSI_Selected", "ESRDorEGFR40BX_LR", "DaysBXtoESRDorEGFR40_LR"]
)

In [11]:
# Save the updated clinical DataFrame to the specified file
# Written as CSV to VESSEL_NEPTUNE_PAT_INFO_PATH; index=False leaves the
# DataFrame index out of the file.
pat_df.to_csv(VESSEL_NEPTUNE_PAT_INFO_PATH, index=False)