# Pre-processing images as center-surround
### a.k.a. retinal ganglion cell processing

The processing pipeline involves converting to grayscale, rescaling, CLAHE filtering, and finally whitening.

First, define the functions for the preprocessing pipeline.

## Preparing the CovidNet dataset
Now import the CovidNet data and pre-process it by:
* converting to grayscale (some CovidNet images are RGB)
* resizing to a standard size
* applying a small-window CLAHE
* whitening (min-max rescaling to the range [0, 1])

In [6]:
def img2grayscale(img):
    """Return a single-channel version of ``img``.

    Arrays with at most two dimensions are treated as already grayscale and
    are returned unchanged (same object, same dtype). Arrays with more
    dimensions are converted with ``skimage.color.rgb2gray``; a 4-channel
    (RGBA) array is first blended to RGB with ``skimage.color.rgba2rgb``,
    because recent scikit-image releases make ``rgb2gray`` reject anything
    but 3 channels.

    Args:
        img: image array exposing ``.shape``.

    Returns:
        The input itself for 2-D data, otherwise the result of ``rgb2gray``.
    """
    if len(img.shape) <= 2:
        return img

    # Imported lazily so plain grayscale input does not need scikit-image.
    from skimage.color import rgb2gray, rgba2rgb

    if img.shape[-1] == 4:
        # Drop the alpha channel before the luminance conversion.
        img = rgba2rgb(img)
    return rgb2gray(img)

def whiten_img(img):
    """Min-max rescale ``img`` so that its values span the range [0, 1].

    The minimum is subtracted and the result is divided by the value range
    (max - min). A constant image has a zero range; it is returned as all
    zeros instead of being divided by zero, which would fill it with NaNs.

    Args:
        img: numeric array (anything ``np.min`` / ``np.max`` accept).

    Returns:
        Array with the shape of ``img``; the true division makes the result
        floating point.
    """
    import numpy as np

    lowest = np.min(img)
    value_range = np.max(img) - lowest
    shifted = img - lowest
    if value_range == 0:
        # Constant image: divide by 1 so the output stays finite (all zeros)
        # and keeps the same dtype as the regular path.
        value_range = 1
    return shifted / value_range

def resize_img(img, sz=128):
    """Resize ``img`` to ``sz`` x ``sz``, normalise it and convert it with ``img_as_uint``.

    Steps, in order: ``skimage.transform.resize`` to a square of side ``sz``,
    ``whiten_img`` on the resized data, then ``skimage.util.img_as_uint``.

    Args:
        img: image array to resize.
        sz: side length of the square output (default 128).

    Returns:
        The array produced by ``img_as_uint``.
    """
    from skimage.transform import resize
    from skimage.util import img_as_uint

    squared = resize(img, (sz, sz))
    normalised = whiten_img(squared)
    return img_as_uint(normalised)

import cv2

# Shared CLAHE operator used by filter_img: clip limit 2.0 on an 8x8 tile grid.
clahe = cv2.createCLAHE(
    clipLimit=2.0,
    tileGridSize=(8, 8),
)

def filter_img(img):
    """Apply the module-level ``clahe`` operator to ``img`` and return the result."""
    return clahe.apply(img)

We will use standard code to show thumbnails as we progress.

In [7]:
import show_original_decoded

Define helpers that keep track of the original and processed images in dictionaries keyed by the original file stem.

In [8]:
# Side length (in pixels) used for the resized images and the thumbnails.
sz: int = 128

# Images seen so far, both keyed by the stem of the source file name.
original_imgs: dict = {}
processed_imgs: dict = {}

def show_thumbnail_progress():
    """Print and display the ten most recently added images.

    Takes the last ten keys of ``original_imgs`` (insertion order), prints
    them, and hands two parallel lists to ``show_original_decoded``: the
    originals reduced to grayscale ``sz`` x ``sz`` thumbnails, and the
    matching entries of ``processed_imgs``.

    The previous slice ``[-11:-1]`` skipped the newest entry, so the image
    that was just added never appeared in the preview.
    """
    recent_keys = list(original_imgs)[-10:]
    print(recent_keys)

    # NOTE(review): ``import show_original_decoded`` binds a module, and a
    # module is not callable. Presumably it exposes a function of the same
    # name - confirm. Falling back to the bound name keeps this working if
    # the name was already rebound to a function elsewhere.
    show = getattr(show_original_decoded, 'show_original_decoded',
                   show_original_decoded)
    show(
        [resize_img(img2grayscale(original_imgs[key]), sz) for key in recent_keys],
        [processed_imgs[key] for key in recent_keys],
        sz,
    )

def add_processed_img(name, original_img, processed_img, all_count):
    """Record one original/processed image pair and report progress.

    Both images are stored under ``name`` in the module-level dictionaries.
    A ``"<done> of <all_count>"`` counter is printed on a carriage-returned
    line. At every 100th stored image the notebook output is cleared and,
    once more than ten images are stored, the thumbnail preview is shown.

    Args:
        name: key for the pair (the file stem in this notebook).
        original_img: image as read from disk.
        processed_img: image after the pre-processing pipeline.
        all_count: total number used in the progress message.
    """
    original_imgs[name] = original_img
    processed_imgs[name] = processed_img

    done = len(processed_imgs)
    print(f'{done} of {all_count}', end = '\r')

    # Only refresh the display on every 100th image.
    if done % 100 != 0:
        return

    from IPython.display import clear_output
    clear_output(True)
    if done > 10:
        show_thumbnail_progress()

Scan the data in its original location and compare it against the temp directory to see how many images are left to process.

In [9]:
import os
from pathlib import Path
# The 'chest-nihcc' data directory lives under the path given by $DATA_ALL.
chest_root = Path(os.environ['DATA_ALL']).joinpath('chest-nihcc')

# Materialise the list of PNG files in the "no_finding" class folder.
nofindings_png_filenames = [*chest_root.glob('by_class/no_finding/*.png')]
print(f"{len(nofindings_png_filenames)} original files")

60361 original files


In [10]:
import imageio
import numpy as np

# Directory holding the already pre-processed images, one .npy file per stem.
clahe_temp = Path(os.environ['DATA_TEMP']) / 'anat0mixer_temp' / 'clahe_processed'
# Create it up front; otherwise np.save below fails on a fresh machine only
# after the first image has already been processed.
clahe_temp.mkdir(parents=True, exist_ok=True)

existing_clahe = [fn.stem for fn in clahe_temp.glob("*.npy")]
# A set gives O(1) membership tests; scanning the list for each of the
# source files made the skip check quadratic.
existing_stems = set(existing_clahe)

# Count only source files without a saved result, so unrelated .npy files in
# the temp directory cannot skew (or turn negative) the number reported.
remaining = sum(1 for fn in nofindings_png_filenames
                if fn.stem not in existing_stems)
print(f"pre-processing {remaining} of {len(nofindings_png_filenames)}")

# iterate over png files, reading and processing
skipped = 0
for png_filename in nofindings_png_filenames:
    if png_filename.stem in existing_stems:
        skipped += 1
        print(f"skipping {skipped}...", end = '\r')
        continue

    # read the png image
    original_img = imageio.imread(png_filename)

    # process the image: grayscale -> resize -> CLAHE -> min-max normalise
    img = img2grayscale(original_img)
    img = resize_img(img, sz)
    img = filter_img(img)
    img = whiten_img(img)
    add_processed_img(png_filename.stem, original_img, img, remaining)

    # and save the npy file (np.save appends the .npy extension)
    np.save(clahe_temp / png_filename.stem, img)

print('done')

pre-processing 53 of 60361
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[    0     0     0 ... 14775   146     0]
 [    0     0     0 ...  8704   146     0]
 [    0     0     0 ...  1463     0     0]
 ...
 [    0     0     0 ...   731     0     0]
 [    0     0     0 ...   366     0     0]
 [    0     0     0 ...     0     0     0]]
[[192 176 170 ... 247 251 251]
 [179 170 168 ... 143 234 250]
 [172 168 162 ...  25 100 168]
 ...
 [201 201 200 ... 198 198 195]
 [201 201 200 ... 202 201 199]
 [201 200 199 ... 205 203 203]]
[[38768 19081  5529 ...  2158  2495  8630]
 [37622 18002  5057 ...  1618  1753  1955]
 [37015 17597  4989 ...  1618  1618  1618]
 ...
 [50972 50432 50297 ... 49354 44634 40184]
 [53264 53129 53129 ... 53264 50095 46522]
 [53669 53399 53399 ... 56905 54882 52455]]
[[ 14  14  14 ...  19  19  19]
 [ 15  16  16 ...  21  21  21]
 [ 16  15  15 ...  20  20  20]
 ...
 [156 157 159 ...  44  43  43]
 [15

 [135 131 125 ...  16  11   0]]
[[53595 38605 25338 ... 56713 57774 58106]
 [30711 12736  4378 ... 44773 46299 46962]
 [13996  3449   796 ... 11077 11873 12603]
 ...
 [27063 21558 17511 ...  3449  3449  3184]
 [28522 23216 19236 ...  3449  3449  3184]
 [30645 26134 22088 ...  3449  3449  3184]]
[[43 38 34 ... 59 67 39]
 [32 29 27 ... 50 54 31]
 [26 23 22 ... 43 47 25]
 ...
 [12 12 12 ... 14  8  0]
 [12 12 12 ... 15  8  0]
 [12 12 12 ... 15  9  0]]
[[4254 3456 3323 ... 2924 3523 6647]
 [2924 2924 2924 ... 2924 2924 4187]
 [2924 2924 2924 ... 2924 2924 3921]
 ...
 [2924 2659 2260 ...  266  798 1063]
 [2924 2659 2260 ...  266  798 1063]
 [2924 2659 2260 ...  598  864 1795]]
[[ 0  0  0 ...  7  7  7]
 [ 0  0  0 ... 11 11 11]
 [ 0  0  0 ... 10 11 11]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
[[    0     0     0 ...  2212  2489  2489]
 [    0     0     0 ...  1797  1936  2489]
 [    0     0     0 ...  2074  1936  2489]
 ...
 [    0     0  1175 ... 51986 

[[225 227 217 ... 226 237 150]
 [221 219 210 ... 206 216 138]
 [215 205 200 ... 177 188 120]
 ...
 [  0   5  14 ... 113 123  76]
 [  0   2  13 ... 117 124  76]
 [  0   2  12 ... 117 124  77]]
[[38731 22759 13573 ... 27146 27078 29751]
 [14053  4799  1508 ...  4799  3428  3153]
 [ 3702   274     0 ...   137     0     0]
 ...
 [ 7266  7129  9871 ... 32493 28175 26461]
 [ 6444  7883 11380 ... 33864 29340 27489]
 [ 5690  9597 13984 ... 37292 32219 29751]]
[[221 219 220 ... 235 247 165]
 [222 219 219 ... 228 240 161]
 [217 214 216 ... 228 240 161]
 ...
 [ 12  12  12 ... 213 228 153]
 [ 12  12  12 ... 209 227 152]
 [ 12  12  12 ... 209 226 157]]
[[49801 37077 29210 ... 57463 58899 59310]
 [23122  9851  4310 ... 48707 50348 51306]
 [ 6430  1368   547 ... 18607 19838 22233]
 ...
 [ 3010  3010  2873 ... 44397 46654 52127]
 [ 3010  3010  2873 ... 45081 47544 52948]
 [ 3010  3010  2873 ... 45149 48365 53906]]
[[  2   3   2 ...   2   2   1]
 [  3   5   4 ...   4   4   2]
 [  3   5   4 ...   4   4 