Breast cancer stage prediction from pathological whole slide images with hierarchical image pyramid transformers.
Project developed under the "High Risk Breast Cancer Prediction Contest Phase 2" 
by Nightingale, Association for Health Learning & Inference (AHLI)
and Providence St. Joseph Health

Copyright (C) 2023 Zsolt Bedohazi, Andras Biricz, Istvan Csabai

In [None]:
import argparse
import glob
import os
import random
from importlib.resources import path
import cv2
import matplotlib.pyplot as plt
import numpy as np
import tifffile
from openslide import OpenSlide
import PIL.Image
import pandas as pd

In [None]:
# Folder whose 'stitches/', 'masks/' and 'patches/' subfolders are listed by
# the cells below. Paths are built by plain string concatenation, so the
# trailing slash is required.
parent_folder = 'clam_data_dir_level1/holdout/'

In [None]:
# Sorted names of the *.png files in the 'stitches/' subfolder.
# Filter on the actual extension: a bare `'png' in name` substring test would
# also accept names that merely contain "png" (e.g. "x.png.bak").
stitches = np.array(sorted(
    name
    for name in os.listdir(parent_folder + 'stitches/')
    if name.endswith('.png')
))
stitches.shape

In [None]:
# Sorted names of the *.png files in the 'masks/' subfolder.
# Filter on the actual extension: a bare `'png' in name` substring test would
# also accept names that merely contain "png" (e.g. "x.png.bak").
masks = np.array(sorted(
    name
    for name in os.listdir(parent_folder + 'masks/')
    if name.endswith('.png')
))
masks.shape

In [None]:
# Sorted names of the *.h5 files in the 'patches/' subfolder, rewritten to a
# '.png' extension so they can be compared against the stitch file names.
# Only the extension is swapped (os.path.splitext); a plain
# str.replace('h5', 'png') would also rewrite "h5" inside the base name.
patches = np.array(sorted(
    os.path.splitext(name)[0] + '.png'
    for name in os.listdir(parent_folder + 'patches/')
    if name.endswith('.h5')
))
patches.shape

In [None]:
# Mask files that have no stitch file of the same name.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for 1-D
# inputs the two return the same boolean mask.
to_delete = masks[~np.isin(masks, stitches)]
to_delete.shape

In [None]:
# Write one `rm` command per orphaned mask into a shell script (the script is
# only generated here, not executed).
# The target directory is derived from parent_folder so the script always
# points at the same tree that was listed, instead of repeating the path as a
# separate literal.
# NOTE: `to_delete` is rebound by a later cell; run this right after the
# mask/stitch comparison.
np.savetxt(
    'remove_corrupted_masks.sh',
    [f'rm {parent_folder}masks/{fname}' for fname in to_delete],
    fmt='%s',
)

In [None]:
# Patch files (compared by their '.png'-renamed form) that have no stitch
# file of the same name.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
to_delete = patches[~np.isin(patches, stitches)]
# Map back to the real on-disk '.h5' names; swap only the extension rather
# than every "png" substring.
to_delete = np.array([os.path.splitext(name)[0] + '.h5' for name in to_delete])
to_delete.shape

In [None]:
# Write one `rm` command per orphaned patch file into a shell script (the
# script is only generated here, not executed).
# The target directory is derived from parent_folder so the script always
# points at the same tree that was listed, instead of repeating the path as a
# separate literal.
np.savetxt(
    'remove_corrupted_patches.sh',
    [f'rm {parent_folder}patches/{fname}' for fname in to_delete],
    fmt='%s',
)

### Clean corrupted holdout files

In [None]:
# Sorted names of the *.ndpi files in the holdout slide directory.
# Filter on the actual extension: a bare `'ndpi' in name` substring test would
# also accept names that merely contain "ndpi".
ndpi_holdout = np.array(sorted(
    name
    for name in os.listdir('/home/ngsci/datasets/brca-psj-path/ndpi-holdout/')
    if name.endswith('.ndpi')
))
ndpi_holdout.shape

In [None]:
# Display the full array of holdout slide file names for visual inspection.
ndpi_holdout

In [None]:
# Load the tab-separated, header-less two-column listing into a DataFrame:
# first column -> 'du', second column -> 'files'.
# presumably this is saved `du` output (size, file name) — TODO confirm.
holdout_df = pd.read_csv(
    '/home/ngsci/20230220_corupted_holdout_files.txt',
    sep='\t',
    header=None,
    names=['du', 'files'],
)
holdout_df.head()

In [None]:
# File names of the rows whose 'du' value is exactly "512".
# presumably a reported size of 512 marks a truncated/placeholder slide —
# TODO confirm.
# Compare on the string form: if every 'du' value is numeric, pandas infers an
# integer column and `du == '512'` would silently select nothing.
corrupted_files = holdout_df.loc[holdout_df['du'].astype(str) == '512', 'files'].values
corrupted_files.shape, corrupted_files[:5]

In [None]:
# Count how many entries of corrupted_files equal this one slide name
# (non-zero means the slide is on the corrupted list).
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
np.isin(corrupted_files, '0093f0ae-bdbb-4a46-904c-14855c668857.ndpi').sum()

In [None]:
# Spot-check: take the third entry of the corrupted-file list and print its
# on-disk size with `du` (IPython expands $to_check inside the shell command).
to_check = corrupted_files[2]
# Disabled alternative: dump the slide properties of the same file.
#!openslide-show-properties  /home/ngsci/datasets/brca-psj-path/ndpi-holdout/$to_check
!du -sh /home/ngsci/datasets/brca-psj-path/ndpi-holdout/$to_check

In [None]:
# Spot-check: take the fourth entry of the corrupted-file list and print its
# on-disk size with `du` (IPython expands $to_check inside the shell command).
to_check = corrupted_files[3]
# Disabled alternative: dump the slide properties of the same file.
#!openslide-show-properties  /home/ngsci/datasets/brca-psj-path/ndpi-holdout/$to_check
!du -sh /home/ngsci/datasets/brca-psj-path/ndpi-holdout/$to_check

In [None]:
# Run openslide-show-properties on the file currently held in `to_check`
# (whatever the most recently executed cell assigned to it).
!openslide-show-properties  /home/ngsci/datasets/brca-psj-path/ndpi-holdout/$to_check

In [None]:
# Mask file names ('.png') corresponding to the holdout slides.
# Only the extension is swapped (os.path.splitext); a plain
# str.replace('ndpi', 'png') would also rewrite "ndpi" inside the base name.
# NOTE(review): despite the variable name, this is built from ndpi_holdout
# (every holdout slide), not from corrupted_files — confirm that removing the
# outputs of ALL holdout slides is intended.
corrupted_files_masks = np.array([os.path.splitext(slide)[0] + '.png' for slide in ndpi_holdout])
corrupted_files_masks

In [None]:
# Patch file names ('.h5') corresponding to the holdout slides.
# Only the extension is swapped (os.path.splitext); a plain
# str.replace('ndpi', 'h5') would also rewrite "ndpi" inside the base name.
# NOTE(review): despite the variable name, this is built from ndpi_holdout
# (every holdout slide), not from corrupted_files — confirm that removing the
# outputs of ALL holdout slides is intended.
corrupted_files_patches = np.array([os.path.splitext(slide)[0] + '.h5' for slide in ndpi_holdout])
corrupted_files_patches

In [None]:
# Stitch file names ('.png') corresponding to the holdout slides.
# Only the extension is swapped (os.path.splitext); a plain
# str.replace('ndpi', 'png') would also rewrite "ndpi" inside the base name.
# NOTE(review): despite the variable name, this is built from ndpi_holdout
# (every holdout slide), not from corrupted_files — confirm that removing the
# outputs of ALL holdout slides is intended.
corrupted_files_stitches = np.array([os.path.splitext(slide)[0] + '.png' for slide in ndpi_holdout])
corrupted_files_stitches

In [None]:
# Generate (not execute) a shell script with one `rm` line per entry of
# corrupted_files_masks.
# NOTE: this reuses the output name 'remove_corrupted_masks.sh' that an
# earlier cell of this notebook also writes, so running it overwrites that
# script.
# NOTE(review): the target is 'clam_data_dir_level1/masks/', without the
# 'holdout/' component used by parent_folder — confirm this is the intended
# directory.
np.savetxt(
    'remove_corrupted_masks.sh',
    [f'rm clam_data_dir_level1/masks/{fname}' for fname in corrupted_files_masks],
    fmt='%s',
)

In [None]:
# Generate (not execute) a shell script with one `rm` line per entry of
# corrupted_files_patches.
# NOTE: this reuses the output name 'remove_corrupted_patches.sh' that an
# earlier cell of this notebook also writes, so running it overwrites that
# script.
# NOTE(review): the target is 'clam_data_dir_level1/patches/', without the
# 'holdout/' component used by parent_folder — confirm this is the intended
# directory.
np.savetxt(
    'remove_corrupted_patches.sh',
    [f'rm clam_data_dir_level1/patches/{fname}' for fname in corrupted_files_patches],
    fmt='%s',
)

In [None]:
# Generate (not execute) a shell script with one `rm` line per entry of
# corrupted_files_stitches.
# NOTE(review): the target is 'clam_data_dir_level1/stitches/', without the
# 'holdout/' component used by parent_folder — confirm this is the intended
# directory.
np.savetxt(
    'remove_corrupted_stitches.sh',
    [f'rm clam_data_dir_level1/stitches/{fname}' for fname in corrupted_files_stitches],
    fmt='%s',
)

In [None]:
# List the entries of the home directory whose name contains "cor".
!ls ~ | grep cor