In [2]:

from data_pipeline.ukb_data_extractor import UkbDataExtractor
import data_pipeline.data_processing_utils as dpu
from data_pipeline.odir_5k_data_extractor import ODIR5KDataExtractor
from data_pipeline.rfmid_data_extractor import RFMiDDataExtractor
from data_pipeline.rfmid2_data_extractor import RFMiD2DataExtractor
from data_pipeline.ukb_data_extractor import UkbDataExtractor
from data_pipeline.rips_data_extractor import RIPSDataExtractor
from data_pipeline.ses_data_extractor import SESDataExtractor
from data_pipeline.one_thousand_images_data_extractor import OneThousandImagesDataExtractor
from data_pipeline.data_processing_utils import standardize_labels
import data_pipeline.data_processing_utils as dpu

import numpy as np
from uuid import uuid4
import json
from datetime import datetime
import os
import pandas as pd


In [3]:

# Timestamp-based name (YYYY-MM-DD_HH-MM-SS) used to build the dataset folder path.
dataset_name = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
dataset_path = 'datasets/' + dataset_name

In [4]:
# Target label vocabulary, one entry per class, in the order the classes are encoded.
# NOTE(review): "cataract" and "glaucoma" are lower-case while the other names are
# capitalized - confirm this matches the spelling of the standardized labels.
labels_to_encode = np.array([
    "Age-related Macular Degeneration",
    "Best Disease",
    "Bietti crystalline dystrophy",
    "cataract",
    "Cone Dystrophie or Cone-rod Dystrophie",
    "Diabetic Retinopathy",
    "glaucoma",
    "Maculopathy",
    "Myopia",
    "Normal",
    "Retinitis Pigmentosa",
    "Stargardt Disease",
])

In [5]:
# --- UKB: the label spreadsheet and the DICOM export live in separate locations ---
ukb_database_path = 'databases/ird_dataset/IRD-Dataset-Complete-03-anonymized.xlsx'
ukb_data_path ='databases/ird_dataset/export_heyex_original_dataset_03/DICOM'
ukb_extractor = UkbDataExtractor(
    database_path=ukb_data_path,
    label_path=ukb_database_path,
)

# --- ODIR-5K ---
odir5k_data_extractor = ODIR5KDataExtractor(
    database_path='databases/ODIR-5K/full_df.csv',
    database_test_images_path='databases/ODIR-5K/Testing Images',
    database_train_images_path='databases/ODIR-5K/Training Images',
)

# --- RFMiD: one extractor per published subset (training / validation / test) ---
rfmid_train_data_extractor = RFMiDDataExtractor(
    database_path='databases/RFMiD/Training_Set/RFMiD_Training_Labels.csv',
    data_path='databases/RFMiD/Training_Set/Training',
    file_format='png',
)

# NOTE: the identifier below is misspelled ("datae_xtractor"); it is kept unchanged
# because the cell that builds the extractor list refers to it by this name.
rfmid_validation_datae_xtractor = RFMiDDataExtractor(
    database_path='databases/RFMiD/Evaluation_Set/RFMiD_Validation_Labels.csv',
    data_path='databases/RFMiD/Evaluation_Set/Validation',
    file_format='png',
)

rfmid_test_data_extractor = RFMiDDataExtractor(
    database_path='databases/RFMiD/Test_Set/RFMiD_Testing_Labels.csv',
    data_path='databases/RFMiD/Test_Set/Test',
    file_format='png',
)

# --- RFMiD 2.0: one extractor per published subset (training / validation / test) ---
rfmid2_train_data_extractor = RFMiD2DataExtractor(
    database_path='databases/RFMiD2_0/Training_set/RFMiD_2_Training_labels.csv',
    data_path='databases/RFMiD2_0/Training_set',
)
rfmid2_validation_data_extractor = RFMiD2DataExtractor(
    database_path='databases/RFMiD2_0/Validation_set/RFMiD_2_Validation_labels.csv',
    data_path='databases/RFMiD2_0/Validation_set',
)

rfmid2_test_data_extractor = RFMiD2DataExtractor(
    database_path='databases/RFMiD2_0/Test_set/RFMiD_2_Testing_labels.csv',
    data_path='databases/RFMiD2_0/Test_set',
)


# --- Datasets that are configured with a single root folder ---
one_thousand_images_data_extractor = OneThousandImagesDataExtractor(
    database_path='databases/1000images/',
)

rips_data_extractor = RIPSDataExtractor(
    database_path='databases/RIPS/Original',
)

ses_data_extractor = SESDataExtractor(
    database_path='databases/SES/',
)


In [6]:
# Build the list of extractors to run. The UKB extractor is kept in its own
# "dicom" list and appended after all the other ("default") extractors.
default_data_extractors = [
    odir5k_data_extractor,
    rfmid_train_data_extractor,
    rfmid_validation_datae_xtractor,
    rfmid_test_data_extractor,
    rfmid2_train_data_extractor,
    rfmid2_validation_data_extractor,
    rfmid2_test_data_extractor,
    one_thousand_images_data_extractor,
    rips_data_extractor,
    ses_data_extractor,
]

dicom_data_extractors = [ukb_extractor]

data_extractors = [*default_data_extractors, *dicom_data_extractors]


In [7]:
# Run the extraction of every configured extractor.
for data_extractor in data_extractors:
    data_extractor.extract()

# Keep the labels per extractor (same order as `data_extractors`) ...
datasets_labels = [data_extractor.get_labels() for data_extractor in data_extractors]

# ... and additionally build one flat array containing every label of every dataset.
label_rows = [row for dataset_labels in datasets_labels for row in dataset_labels]
labels = np.concatenate(label_rows)

# Drop the None entries. The comparison must stay `!= None` (not `is not None`)
# because it has to be evaluated element-wise by numpy.
labels = labels[labels != None]  # noqa: E711

In [11]:
# The values of the RFMiD2 abbreviation map are handed to `standardize_labels`
# as `not_summarize_set`.
not_summarize_set = {*RFMiD2DataExtractor.abbreviation_map.values()}
label_standertizer = standardize_labels(labels=labels, not_summarize_set=not_summarize_set)

# Manual UKB-specific translations, applied on top of the generated mapping.
# 'Morbus Stargardt ' (with a trailing space) is listed as a separate key on purpose.
ukb_label_mapping_dict = {
    'Morbus Best': 'Best Disease',
    'Morbus Stargardt': 'Stargardt Disease',
    'Retinitis pigmentosa': 'Retinitis Pigmentosa',
    'Morbus Stargardt ': 'Stargardt Disease',
}
label_standertizer.update(ukb_label_mapping_dict)

In [13]:
# Print the full raw-label -> standardized-label mapping for manual inspection.
print(label_standertizer)

{'CSNB ': 'CSNB', 'drusen': 'Drusen', 'Drusen ': 'Drusen', 'Drusens': 'Drusen', 'myopia retinopathy': 'Myopia', 'Pathological myopia': 'Myopia', 'pathological myopia': 'Myopia', 'normal fundus': 'Normal', 'Optic atrophy': 'atrophy', 'optic nerve atrophy': 'atrophy', 'chorioretinal atrophy': 'atrophy', 'peripapillary atrophy': 'atrophy', 'diffuse retinal atrophy': 'atrophy', 'oval yellow-white atrophy': 'atrophy', 'diffuse chorioretinal atrophy': 'atrophy', 'Chorioretinal atrophy-coloboma': 'Coloboma', 'retinal pigment epithelium atrophy': 'atrophy', 'chorioretinal atrophy with pigmentation proliferation': 'atrophy', 'macular coloboma': 'Coloboma', 'retinochoroidal coloboma': 'Coloboma', 'congenital choroidal coloboma': 'Coloboma', 'Retinitis pigmentosa': 'Retinitis Pigmentosa', 'retinitis pigmentosa': 'Retinitis Pigmentosa', 'Laser Spots': 'laser spot', 'maculopathy': 'Maculopathy', 'myopic maculopathy': 'Maculopathy', 'Optic Disc Pit Maculopathy': 'Maculopathy', 'low image quality,mac

In [14]:
# List the raw labels (mapping keys) that still contain a comma.
keys_with_comma = [raw_label for raw_label in label_standertizer if ',' in raw_label]
print(keys_with_comma)

['low image quality,maculopathy', 'hypertensive retinopathy,diabetic retinopathy']


In [9]:
# Balance the datasets by dropping samples of over-represented labels.
#
# The per-class limit is derived from the label frequencies over all datasets:
# a quarter of the gap between the most frequent label and the median label count.
label_counts = pd.Series(labels).value_counts()
median_label_count = label_counts.median()
label_instance_limit = int((label_counts.max() - median_label_count) // 4)

# `datasets_labels` and `data_extractors` are index-aligned (one label collection per extractor).
# The loop variable is deliberately NOT called `labels`: that name is the notebook-wide
# flat label array read at the top of this cell and by the label-standardization cell,
# and rebinding it here made those cells silently use a single dataset's labels on re-run.
for extractor_labels, extractor in zip(datasets_labels, data_extractors):
    # Indices of the samples that exceed the per-class limit
    # (the two remaining return values are not needed here).
    over_represented_labels_idxs, _, _ = dpu.find_over_represented_samples(
        file_paths=extractor.get_file_paths(),
        labels=extractor_labels,
        max_samples_per_class=label_instance_limit,
    )
    # Convert the index list into a boolean mask and keep everything that is not flagged.
    over_represented_mask = np.isin(np.arange(len(extractor_labels)), over_represented_labels_idxs)
    extractor.extracted_data = extractor.extracted_data[~over_represented_mask]

In [10]:
#split the data stratified by the labels
train_portion = 0.7
val_portion = 0.1
test_portion = 0.2
split_portions = [train_portion, val_portion, test_portion]

In [11]:
# Split every extractor's data (stratified) and collect all resulting splits in one
# flat list, extractor after extractor.
# NOTE(review): no random seed is passed here - confirm whether the split is reproducible.
splits = []
for data_extractor in data_extractors:
    extractor_splits = data_extractor.split_extracted_data(
        split_portions=split_portions,
        stratify=True,
    )
    splits.extend(extractor_splits)

In [12]:
# Helper that strips surrounding whitespace from strings and passes everything else
# through unchanged. It is defined but currently not applied to the labels.
lambda_strip = lambda x: x.strip() if isinstance(x, str) else x
# Look a label up in the standardization mapping; labels without an entry stay as they are.
label_translation = lambda x: label_standertizer.get(x, x)

# NOTE(review): without `otypes`, np.vectorize derives the output dtype from the result
# for the first element - confirm that None entries and empty label arrays behave as
# intended in the following cells.
translate_labels = np.vectorize(label_translation)
for split in splits:
    split.labels = translate_labels(split.labels)


In [13]:
# Gather the label rows of every split and flatten them into a single array.
label_rows = [row for split in splits for row in split.labels]
all_labels = np.concatenate(label_rows)
# Remove the None entries (element-wise comparison, hence `!= None`) and reduce the
# remainder to the distinct labels.
all_labels = np.unique(all_labels[all_labels != None])  # noqa: E711

In [14]:
# Build the one-hot encoder over the fixed target vocabulary `labels_to_encode`.
label_encoder = dpu.create_one_hot_encoder(
    unique_labels=labels_to_encode,
)

In [15]:
# Replace the string labels of every split by their encoding from `label_encoder`.
for split in splits:
    encoded_labels = dpu.encode_multistring_labels(split.labels, label_encoder)
    split.labels = encoded_labels

In [16]:
# Number of label rows in the first split (sanity check before filtering).
print(len(splits[0].get_labels()))

2977


In [17]:
# Drop every sample whose encoded label row is all zeros, i.e. that carries none of
# the classes in `labels_to_encode`, and report how many samples were removed per split.
for split in splits:
    print(split.data_source_name)
    # Use a dedicated name: `labels` is the notebook-wide flat label array that earlier
    # cells read, so rebinding it here would break those cells when they are re-run.
    split_labels = split.get_labels()
    # True for rows that have at least one class set.
    has_label_mask = np.sum(split_labels, axis=1) != 0
    print('n filtered', len(has_label_mask) - np.sum(has_label_mask), 'out of', len(has_label_mask))
    # Apply the same mask to labels, data and ids so the three stay aligned.
    split.labels = split_labels[has_label_mask]
    split.data = split.data[has_label_mask]
    split.instance_ids = split.instance_ids[has_label_mask]


ODIR-5K
n filtered 1641 out of 2977
ODIR-5K
n filtered 232 out of 425
ODIR-5K
n filtered 467 out of 850
RFMiD
n filtered 671 out of 1341
RFMiD
n filtered 103 out of 200
RFMiD
n filtered 189 out of 379
RFMiD
n filtered 215 out of 444
RFMiD
n filtered 34 out of 66
RFMiD
n filtered 64 out of 130
RFMiD
n filtered 225 out of 448
RFMiD
n filtered 33 out of 64
RFMiD
n filtered 63 out of 128
RFMiD2
n filtered 178 out of 310
RFMiD2
n filtered 30 out of 50
RFMiD2
n filtered 57 out of 95
RFMiD2
n filtered 59 out of 106
RFMiD2
n filtered 9 out of 15
RFMiD2
n filtered 21 out of 35
RFMiD2
n filtered 63 out of 104
RFMiD2
n filtered 10 out of 16
RFMiD2
n filtered 16 out of 29
1000images
n filtered 561 out of 699
1000images
n filtered 81 out of 100
1000images
n filtered 162 out of 201
RIPS
n filtered 0 out of 60
RIPS
n filtered 0 out of 30
RIPS
n filtered 0 out of 30
SES
n filtered 2 out of 87
SES
n filtered 0 out of 12
SES
n filtered 0 out of 26
UKB
n filtered 745 out of 1427
UKB
n filtered 90 out of 

In [19]:
# Number of label rows in the first element returned by `get_current_split()` of the
# first default extractor (displayed as the cell result).
len(default_data_extractors[0].get_current_split()[0].get_labels())

1336

In [23]:
#reshape the splits list so that always 3 splits are in a row
buffer = []
for i in range(0, len(splits), 3):
    inner_list = splits[i:i + 3]
    buffer.append(inner_list)
splits = buffer

In [36]:
# Save the dataset: one timestamped folder containing a train/val/test sub-folder each,
# a JSON file per data source and split, and the configuration used to build the dataset.
# (`datetime`, `os`, `json`, `np` and `uuid4` come from the import cell at the top.)
save_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
dataset_path = f'datasets/{save_timestamp}'
train_datas_save_path = f'{dataset_path}/train'
val_datas_save_path = f'{dataset_path}/val'
test_datas_save_path = f'{dataset_path}/test'
# Order matters: it has to match the order of the splits inside each row of `splits`.
save_path_list = [train_datas_save_path, val_datas_save_path, test_datas_save_path]
for save_path in save_path_list:
    os.makedirs(save_path, exist_ok=True)

# Create the dataset configuration.
# `labels_to_encode` is a numpy array, which `json.dump` cannot serialize
# (TypeError), so it is converted to a plain list of strings first.
dataset_config = {
    'labels_to_encode': np.asarray(labels_to_encode).tolist(),
    'label_standertizer': label_standertizer,
}
with open(f'{dataset_path}/dataset_config.json', 'w') as f:
    json.dump(dataset_config, f)

# Every row of `splits` holds the splits of one extractor, paired here with the
# matching target folder.
for split in splits:
    for package, path in zip(split, save_path_list):
        total_path = f'{path}/{package.data_source_name}.json'
        # Several extractors share a data source name, so an earlier package may already
        # have used this file name. Append a short random suffix, and retry until the
        # path is really unused so that no previously written file is overwritten.
        while os.path.exists(total_path):
            total_path = f'{path}/{package.data_source_name}_{str(uuid4())[:4]}.json'
        package.save(total_path)
    
