In [1]:
import json
import torch
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import ImageFolder
import numpy as np
from sklearn import model_selection
from pandas import read_excel
import os
import glob
from PIL import Image
from tqdm import tqdm

# Both cutoffs are file sizes in bytes; only images strictly between them are kept.
# Upper bound: used to filter out large images (not of single cells).
IMAGE_SIZE_CUTOFF_UPPER = 800000
# Lower bound: drops files of 100 bytes or less (presumably empty/corrupt files -- TODO confirm).
IMAGE_SIZE_CUTOFF_LOWER = 100

In [2]:
def get_patient_orders(exclude_orders=None, base_path='/hddraid5/data/colin/covid-data/'):
    """Collect per-order image paths, split by COVID test result.

    Every spreadsheet matching ``*Covid*.xlsx`` directly under ``base_path`` is
    read for its ``'Order #'`` and ``'Covid Test result'`` columns. For each row
    whose order is integer-like and whose result is 'positive' or 'negative'
    (case-insensitive), ``.jpg`` files are searched recursively under
    ``<base_path>/COVID Research Images/**/<order>/**``. Only files whose size
    in bytes lies strictly between ``IMAGE_SIZE_CUTOFF_LOWER`` and
    ``IMAGE_SIZE_CUTOFF_UPPER`` are kept; orders with no remaining image are
    dropped.

    Args:
        exclude_orders: optional iterable of order numbers to skip. Entries are
            compared by their ``str()`` form, so ints and strings both work.
            ``None`` (default) excludes nothing.
        base_path: root directory holding the spreadsheets and the
            ``'COVID Research Images'`` folder.

    Returns:
        Tuple ``(negative_images, positive_images, all_images)``. Each is a
        dict mapping ``str(order)`` to a list of image paths, with keys sorted
        as strings. ``all_images`` holds the negatives followed by the
        positives; if an order appears in both, the positive entry wins.
    """
    excluded = set() if exclude_orders is None else {str(o) for o in exclude_orders}

    label_files = glob.glob(os.path.join(base_path, '*Covid*.xlsx'))
    orders = []
    test_results = []
    for label_file in label_files:
        table = read_excel(label_file)
        orders.extend(table['Order #'])
        test_results.extend(table['Covid Test result'])

    positive_images = {}
    negative_images = {}
    for order, test_result in tqdm(zip(orders, test_results), desc='reading excel files', total=len(orders)):
        try:
            # Validation only: rows whose order is not integer-like (NaN, header
            # text, ...) are skipped. Builtin int replaces np.int, which was
            # removed in NumPy 1.24; the resulting AttributeError used to be
            # swallowed below, silently dropping every row.
            int(order)
            # Non-string results (e.g. NaN for an empty cell) have no .lower()
            # and raise AttributeError, which also skips the row.
            result = test_result.lower()
        except (TypeError, AttributeError, ValueError):
            continue
        if result == 'positive':
            label = True
        elif result == 'negative':
            label = False
        else:
            # Any other result string (pending, inconclusive, ...) is ignored.
            continue

        order_key = str(order)
        if order_key in excluded:
            continue

        all_image_paths = glob.glob(os.path.join(base_path, 'COVID Research Images', '**', order_key, '**', '*.jpg'),
                                    recursive=True)
        image_paths = []
        for image_path in all_image_paths:
            # Stat each file once instead of once per bound.
            size = os.path.getsize(image_path)
            if IMAGE_SIZE_CUTOFF_LOWER < size < IMAGE_SIZE_CUTOFF_UPPER:
                image_paths.append(image_path)
        if len(image_paths) == 0:
            continue
        if label:
            positive_images[order_key] = image_paths
        else:
            negative_images[order_key] = image_paths

    # Sort by order key. Keys are strings, so the order is lexicographic rather
    # than numeric. Dicts preserve insertion order from Python 3.7 on.
    negative_images = dict(sorted(negative_images.items()))
    positive_images = dict(sorted(positive_images.items()))
    all_images = dict(negative_images, **positive_images)
    return negative_images, positive_images, all_images

In [3]:
# Scan the label spreadsheets and image folders once. Each result is a dict
# mapping an order number (as a string) to that order's list of image paths.
negative_image_paths, positive_image_paths, all_image_paths = get_patient_orders()


reading excel files: 100%|██████████| 1850/1850 [00:44<00:00, 41.52it/s]


In [12]:
# Number of retained images for each patient order.
cell_counts = [len(paths) for paths in all_image_paths.values()]

In [15]:
# Total number of retained images across all orders.
np.sum(cell_counts)

21516

In [13]:
# Standard deviation of the number of images per order.
np.std(cell_counts)

71.69808618555444

In [14]:
# Mean number of images per order.
np.mean(cell_counts)

133.63975155279502

In [5]:
# These count patient orders (dict keys), not images: the number of orders
# with at least one retained image, by test result.
total_neg = len(negative_image_paths)
total_pos = len(positive_image_paths)

In [6]:
# Number of negative-test orders.
total_neg

67

In [7]:
# Number of positive-test orders.
total_pos

94

In [9]:
# Total number of orders with a usable label and at least one retained image.
(total_neg + total_pos)

161