Breast cancer stage prediction from pathological whole slide images with hierarchical image pyramid transformers.
Project developed under the "High Risk Breast Cancer Prediction Contest Phase 2" 
by Nightingale, Association for Health Learning & Inference (AHLI)
and Providence St. Joseph Health

Copyright (C) 2023 Zsolt Bedohazi, Andras Biricz, Istvan Csabai

In [None]:
import numpy as np
#from geojson import GeoJSON
import json
import os
import glob
#import shapely
#from rtree import index
#from shapely.ops import cascaded_union, unary_union
from collections import Counter
import matplotlib.pyplot as plt
import h5py
from tqdm import tqdm
from PIL import Image

import sys

#from hipt_4k import HIPT_4K
#from hipt_model_utils import get_vit256, get_vit4k, eval_transforms
#from hipt_heatmap_utils import *

### Locate data

In [None]:
# Input directory; it is globbed for *.h5 files in the next cell.
# The commented-out assignments are alternative input locations.
#parent_folder = '../../clam_dev/clam_data_dir_level1/train/'
#parent_folder = '/home/ngsci/train/'
#source = '/home/ngsci/datasets/brca-psj-path/contest-phase-2/clam-preprocessing-holdout/resnet50-features/h5_files/'
source = '/home/ngsci/clam_level1_tiles_vit_16-256_finetuned_embeddings_holdout/'

In [None]:
# Collect every .h5 file directly under `source`. The paths are sorted so the
# later split into per-worker chunks is the same on every run.
slide_fp = os.path.join(source, '*.h5')
files = np.array(sorted(glob.iglob(slide_fp)))
(files.shape, files[:3])

In [None]:
# Split `files` into contiguous [start, end) index ranges, one row per worker.
# The chunk length is n_files // 64. It is clamped to at least 1 so np.arange
# is never called with a zero step when fewer than 64 files were found.
# NOTE: when n_files is not a multiple of the chunk length, a shorter trailing
# chunk is appended, so the number of rows can be larger than 64.
n_files = files.shape[0]
chunk_len = max(1, n_files // 64)
idxs = np.append( np.arange(0, n_files, chunk_len), n_files )
# Pair every boundary with its successor: column 0 = start, column 1 = end.
idxs = np.vstack( (idxs[:-1], idxs[1:]) ).T
idxs.shape

In [None]:
# Output directory for the per-worker file lists (files_to_process_thread_<i>).
destination = '/home/ngsci/vitsmall_embeddings_4096region_256times384_level1_file_splits_holdout/'

In [None]:
os.makedirs(destination, exist_ok=True)

In [None]:
!ls $destination | wc -l

In [None]:
# Write one plain-text list of input paths per worker: row i of `idxs` holds
# the [start, end) slice of `files` assigned to worker i.
for thread_id, (start, stop) in enumerate(idxs):
    files_to_process_thread = files[start:stop]
    np.savetxt(f'{destination}files_to_process_thread_{thread_id}', files_to_process_thread, fmt='%s')

In [None]:
!ls $destination | wc -l

In [None]:
# Build one background launch command per row of `idxs`; each command passes
# its own --thread_num and redirects output to its own log file.
n_threads = idxs.shape[0]
scripts = np.array([
    f'nohup python3 generate_256times384_embeddings_vit.py  --thread_num {k}  > logs_level1/log_vit_holdout_thread_{k}.txt &'
    for k in range(n_threads)
])
# Save the commands as plain text, one command per line.
np.savetxt(f'scripts_to_run_generate_256times384_embeddings_vit256-16_level1_on_{n_threads}_threads_holdout', scripts, fmt='%s')

### Correction - run this section only if the running jobs have died and the pre-generated file splits and scripts have been deleted

In [None]:
# Directory whose *.npy files are treated as outputs that already exist.
# NOTE(review): this is a resnet50 directory without a holdout suffix, while
# `source` points at a vit holdout directory - confirm it is the intended one.
#source_done = '/home/ngsci/project/resnet50_embeddings_4096region_256times1024_level0/'
source_done = '/home/ngsci/resnet50_embeddings_4096region_256times1024_level1/'
slide_fp_done = os.path.join(source_done, '*.npy')
files_done = np.array(sorted(glob.iglob(slide_fp_done)))
(files_done.shape, files_done[:3])

In [None]:
# Reduce each finished output path to its bare name: drop the directory part,
# then everything from the first '.npy' onwards.
files_done_splitted = np.array([
    done_path.rpartition('/')[2].partition('.npy')[0]
    for done_path in files_done
])
(files_done_splitted.shape, files_done_splitted[:3])

In [None]:
# Reduce each input path to its bare name: drop the directory part, then
# everything from the first '.h5' onwards.
files_splitted = np.array([
    input_path.rpartition('/')[2].partition('.h5')[0]
    for input_path in files
])
(files_splitted.shape, files_splitted[:3])

In [None]:
# Map each bare input name to its position in `files_splitted`.
files_to_num = {
    slide_name: position
    for slide_name, position in zip(files_splitted, np.arange(files_splitted.shape[0]))
}

In [None]:
# Translate every finished name into its position in the input listing.
# Names that have no entry in `files_to_num` are skipped instead of raising
# KeyError, so unrelated outputs in `source_done` cannot abort the restart.
files_idx_already_running = np.array(
    [ files_to_num[p] for p in files_done_splitted if p in files_to_num ],
    dtype=int,  # keeps an integer dtype even when nothing matched
)
# Report how many finished names were ignored so a wrong directory is noticed.
n_unmatched = len(files_done_splitted) - files_idx_already_running.shape[0]
if n_unmatched:
    print(f'{n_unmatched} finished file(s) have no matching input and were ignored')
files_idx_already_running

In [None]:
files_idx_already_running.shape

In [None]:
# Keep only the inputs whose position is not among the finished ones.
# np.isin is used because np.in1d is deprecated in NumPy 2.0.
already_done = np.isin( np.arange(files_splitted.shape[0]), files_idx_already_running )
files_to_process = files[ ~already_done ]
files_to_process.shape

In [None]:
# Split the remaining inputs into contiguous [start, end) ranges, one row per
# worker. The chunk length is n_remaining // 64. It is clamped to at least 1
# because a restart can leave fewer than 64 files, which would otherwise hand
# np.arange a zero step.
# NOTE: when n_remaining is not a multiple of the chunk length, a shorter
# trailing chunk is appended, so the number of rows can be larger than 64.
n_remaining = files_to_process.shape[0]
restart_chunk_len = max(1, n_remaining // 64)
idxs = np.append( np.arange(0, n_remaining, restart_chunk_len), n_remaining )
# Pair every boundary with its successor: column 0 = start, column 1 = end.
idxs = np.vstack( (idxs[:-1], idxs[1:]) ).T
idxs.shape

In [None]:
idxs.flatten().max(), files_to_process.shape[0]

In [None]:
# Output directory for the per-worker file lists of the restart run.
destination = '/home/ngsci/resnet50_embeddings_for_4k_regions_file_splits_restart_holdout/'

In [None]:
os.makedirs(destination, exist_ok=True)

In [None]:
# Write one plain-text list of remaining input paths per worker: row i of
# `idxs` holds the [start, end) slice of `files_to_process` for worker i.
for thread_id, (start, stop) in enumerate(idxs):
    files_to_process_thread = files_to_process[start:stop]
    np.savetxt(f'{destination}files_to_process_thread_{thread_id}', files_to_process_thread, fmt='%s')

In [None]:
!ls $destination

### Running scripts

#### original

In [None]:
# Build one background launch command per row of `idxs`; each command passes
# its own --thread_num and redirects output to its own log file.
n_threads = idxs.shape[0]
scripts = np.array([
    f'nohup python3 generate_256times1024_embeddings_resnet50_level0.py  --thread_num {k}  > logs_holdout/log_thread_{k}.txt &'
    for k in range(n_threads)
])
# Save the commands as plain text, one command per line.
np.savetxt(f'scripts_to_run_generate_256times1024_embeddings_resnet50_level0_on_{n_threads}_threads_holdout', scripts, fmt='%s')

#### corr

### Restart where it left off

In [None]:
# Build one background launch command per row of `idxs` for the restart run;
# logs go to logs_restart_holdout/.
n_threads = idxs.shape[0]
scripts = np.array([
    f'nohup python3 generate_256times1024_embeddings_resnet50_level0.py  --thread_num {k}  > logs_restart_holdout/log_thread_{k}.txt &'
    for k in range(n_threads)
])
# NOTE(review): unlike the other command lists in this notebook, this output
# name has no '_holdout' suffix - confirm it cannot overwrite another list.
np.savetxt(f'scripts_to_run_generate_256times1024_embeddings_resnet50_level0_on_{n_threads}_threads', scripts, fmt='%s')

### EXTRA

In [None]:
# Split `files` into contiguous [start, end) ranges using a chunk length of
# n_all_files // 16. It is clamped to at least 1 so np.arange is never called
# with a zero step when fewer than 16 files exist.
# NOTE: when n_all_files is not a multiple of the chunk length, a shorter
# trailing chunk is appended, so the number of rows can be larger than 16.
n_all_files = files.shape[0]
extra_chunk_len = max(1, n_all_files // 16)
idxs = np.append( np.arange(0, n_all_files, extra_chunk_len), n_all_files )
# Pair every boundary with its successor: column 0 = start, column 1 = end.
idxs = np.vstack( (idxs[:-1], idxs[1:]) ).T
idxs.shape, #idxs[:5]

In [None]:
# Build one background launch command per row of `idxs`; here every command is
# given its [start, end) index range directly instead of a thread number.
# NOTE(review): unlike the other launch commands in this notebook, this one
# has no 'python3' before the .py file - confirm the script is executable and
# can be found by nohup.
scripts = np.array([
    f'nohup clam_vit256_embedder.py   --source   /home/ngsci/clam_level1_tiles_holdout/  --dest_dir   /home/ngsci/clam_level1_tiles_vit_16-256_finetuned_embeddings_holdout/  --start_idx {start}  --end_idx  {stop}  > logs_vit_training_holdout/log_thread_{k}.txt &'
    for k, (start, stop) in enumerate(idxs)
])
# Save the commands as plain text, one command per line.
n_threads = idxs.shape[0]
np.savetxt(f'scripts_to_run_generate_256times384_embeddings_vit_finetuned_level1_on_{n_threads}_threads_holdout', scripts, fmt='%s')

### SOME CHECKS

In [None]:
# Directory inspected by the checks below; the commented-out line is an
# alternative directory to inspect.
folder = '/home/ngsci/vitsmall_embeddings_4096region_256times384_level1/'
#folder = '/home/ngsci/resnet50_embeddings_4096region_256times1024_level1/'

In [None]:
os.listdir(folder)[:5]

In [None]:
def load_h5_file(filename):
    """Read the 'coords' and 'features_4k' datasets of an HDF5 file.

    Both datasets are read completely into memory and the file is closed
    again before returning.

    Args:
        filename: Path of the HDF5 file, passed to ``h5py.File``.

    Returns:
        A ``(coords, features)`` tuple with the contents of the two datasets.
    """
    with h5py.File(filename, "r") as h5_handle:
        # Indexing with () reads the whole dataset, not a lazy handle.
        return h5_handle['coords'][()], h5_handle['features_4k'][()]

In [None]:
# Load one example file from `folder` and show the shapes of both arrays.
coords, features = load_h5_file(f'{folder}0000459a-838d-4865-8bbf-ea66f2e5ee4d.h5')
(coords.shape, features.shape)

In [None]:
features.mean(), features.min(), features.max()

In [None]:
plt.pcolormesh( features[0] )

In [None]:
plt.pcolormesh( features[10] )

In [None]:
features.min(), features.max(), features.mean()