In [1]:
import pandas as pd
import pydicom as dicom
from PIL import Image
import numpy as np
import os
import cv2
import shutil
from tqdm import tqdm
import matplotlib.pyplot as plt

# Helpful functions


In [2]:
def check_wether_patient_has_records(path_to_patient_folder, get_names_of_records=False):
    
    """
    Args:
        get_names_of_records (bool): wether return names of record files
        
    Returns:
        bool: Retuns value. True if patient folder contains the records and False otherwise. 
        or
        tuple (bool, list): returns bool and names of the record files for the patient.  
    """
    names_of_the_records = [x for x in os.listdir(path_to_patient_folder) if 'doc' in x or 'xlsx' in x]
    if get_names_of_records:
        return len(names_of_the_records) >=1, names_of_the_records
    else:
        return len(names_of_the_records) >=1

def check_wether_patient_has_image_data(path_to_patient_folder):
    """
    Returns:
        bool : Returns True if patient folder contatin image data and False otherwise
    """
    names_of_the_records = [x for x in os.listdir(path_to_patient_folder) if 'DICOMOBJ' in x]
    return len(names_of_the_records) >= 1

def get_structure_of_the_dataset(path_to_dataset):
    """
    
    Returns:
        dict :  keys - patient names(str): values (list of strings) - paths to the images and records
                                                    
    """
    patients_database = {}
    
    reports_folders = [x for x in os.listdir(path_to_dataset) if not any(i in x for i in
                            ['System', 'BIN', '$.BIN', 'Autorun.inf','Seagate', 'SeagateExpansion.ico', 
                             'Start_Here_Mac.app', 'Start_Here_Win.exe', 'Warranty.pdf'])]
                       #'System' not in x and 'BIN' not in x]
    for report_folder in tqdm(reports_folders):
        patients_per_folder = os.listdir(os.path.join(path_to_dataset, report_folder))
        
        for patient in patients_per_folder:
            
            files_in_patient_folder = os.listdir(os.path.join(path_to_dataset, report_folder, patient))
            
            if check_wether_patient_has_image_data(os.path.join(path_to_dataset, report_folder, patient)):
                patient_images = os.listdir(os.path.join(path_to_dataset, report_folder, patient, 'DICOMOBJ'))
                patient_images_paths = [os.path.join(path_to_dataset, report_folder, patient, 'DICOMOBJ', x) 
                                 for x in patient_images]
            else:
                patient_images = []
                patient_images_paths = []
            _, patient_records = check_wether_patient_has_records(
                                      os.path.join(path_to_dataset, report_folder, patient), 
                                      get_names_of_records=True)
            patient_records_paths = [os.path.join(path_to_dataset, report_folder, patient, x) for x in patient_records]
            patients_database[patient] = []
            patients_database[patient] += patient_records_paths
            patients_database[patient] += patient_images_paths
    
    return patients_database

# Dataset statistics

In [3]:
PATH_TO_THE_WHOLE_DATASET = 'E:\\'

In [4]:
patients_database = get_structure_of_the_dataset(PATH_TO_THE_WHOLE_DATASET)

100%|██████████████████████████████████████████████████████████████████████████████████| 55/55 [00:09<00:00,  6.35it/s]


### Patients without images or even Empty folders

In [55]:
for i in patients_database:
    if len(patients_database[i]) <=1:
        print(i, patients_database[i])

18 CTCAHAS15091955 ['E:\\CTCA NO RECONS EXCEL REPORTS 1-30\\18 CTCAHAS15091955\\REPORT CTCAHAS15091955.xlsx']
19 CTCAYOM10121943 ['E:\\CTCA NO RECONS EXCEL REPORTS 1-30\\19 CTCAYOM10121943\\REPORT CTCAYOM10121943.xlsx']
119 CTCADIJ08101969 ['E:\\CTCA NO RECONS EXCEL REPORTS 111-130\\119 CTCADIJ08101969\\REPORT CTCADIJ08101969.xlsx']
127 CTCALUT11021950 ['E:\\CTCA NO RECONS EXCEL REPORTS 111-130\\127 CTCALUT11021950\\REPORT CTCALUT11021950.xlsx']
129 CTCABOE18111946 ['E:\\CTCA NO RECONS EXCEL REPORTS 111-130\\129 CTCABOE18111946\\REPORT CTCABOE18111946.xlsx']
275 CTCAMCM21081951 ['E:\\CTCA NO RECONS EXCEL REPORTS 271-290\\275 CTCAMCM21081951\\REPORT CTCAMCM21081951.xlsx']
278 CTCATRJ09081960 ['E:\\CTCA NO RECONS EXCEL REPORTS 271-290\\278 CTCATRJ09081960\\REPORT CTCATRJ09081960.xlsx']
29 CTCAGIS19031946 []
32 CTCAPAS14011976 []
41 CTCADUA20091962 ['E:\\CTCA NO RECONS EXCEL REPORTS 31-50\\41 CTCADUA20091962\\REPORT CTCADUA20091962.xlsx']
42 CTCANAB01011955 ['E:\\CTCA NO RECONS EXCEL REPO

### Patient with a very few images

In [56]:
for i in patients_database:
    if len(patients_database[i]) <=20 and len(patients_database[i]) >1:
        print(i, patients_database[i])

24 CTCAAHG14111965 ['E:\\CTCA NO RECONS EXCEL REPORTS 1-30\\24 CTCAAHG14111965\\REPORT CTCAAHG14111965.xlsx', 'E:\\CTCA NO RECONS EXCEL REPORTS 1-30\\24 CTCAAHG14111965\\DICOMOBJ\\00000001']
140 CTCACHP07101955 ['E:\\CTCA NO RECONS EXCEL REPORTS 131-150\\140 CTCACHP07101955\\REPORT CTCACHP07101955.xlsx', 'E:\\CTCA NO RECONS EXCEL REPORTS 131-150\\140 CTCACHP07101955\\DICOMOBJ\\00000001']
798 CTCASAC09091967 ['E:\\CTCA WITH RECONS EXCEL REPORTS 791-810\\798 CTCASAC09091967\\REPORT CTCASAC09091967.xlsx', 'E:\\CTCA WITH RECONS EXCEL REPORTS 791-810\\798 CTCASAC09091967\\DICOMOBJ\\00000001']


### Patients without Records

In [57]:
for i in patients_database:
    if len(patients_database[i])>=1:
        
        if not patients_database[i][0].endswith('docx') and \
            not patients_database[i][0].endswith('xlsx'):
            print(i)

1 CTCACOB24121949
349 CTCANGM17081945
428 CTCASIN11111959
629 CTCATAA17041965
952 CTCASTV03081936


### Number of DOCX and XLSX

In [58]:
word = 0
excel = 0
for i in patients_database:
    if len(patients_database[i])>=1:
        if patients_database[i][0].endswith('docx'):
            word+=1
        elif patients_database[i][0].endswith('xlsx'):
            excel += 1

print("Num of docs: ", word)
print("Num of excels: ", excel)

Num of docs:  20
Num of excels:  1044


# Copy only images and reports

In [59]:
def copy_dataset(patients_database, path_to_copy):
    """
    Copy only image data and records without DICOM viewer program
    Args:
        patients_database (dict): dictionary with patients and corresponding 
                                  images and records
        path_to_copy (str): destination folder, where all dataset will
                            be located
        
    Returns:
        None
    """
    # Create folder to the dataset
    if not os.path.exists(path_to_copy):
        os.mkdir(path_to_copy)
    
    for patient in tqdm(patients_database):
        # Check wether patient's folder contains images
        if len(patients_database[patient]) <=2:
            continue
            
        # Check wether patient contains the records
        path_to_the_patient = patients_database[patient][0]
        path_to_the_patient = '\\'.join(path_to_the_patient.split('\\')[:2])
        if not check_wether_patient_has_records(path_to_the_patient):
            continue
         
        group_folder_name = patients_database[patient][0].split('\\')[0][2:]
        group_folder_name = '_'.join([x.lower() for x in group_folder_name.split()])
        patient_folder_name = patients_database[patient][0].split('\\')[1]
        patient_folder_name = '_'.join([x for x in patient_folder_name.split()])
        
        # Create directories
        if not os.path.exists(os.path.join(path_to_copy, group_folder_name)):
            os.mkdir(os.path.join(path_to_copy, group_folder_name))
        if not os.path.exists(os.path.join(path_to_copy, group_folder_name, patient_folder_name)):
            os.mkdir(os.path.join(path_to_copy, group_folder_name, patient_folder_name))
        
        # Copy Records
        shutil.copy(patients_database[patient][0], os.path.join(
                path_to_copy, group_folder_name, patient_folder_name, patients_database[patient][0].split('\\')[-1]))
        
        # Create folder patients's for images
        if not os.path.exists(os.path.join( path_to_copy, group_folder_name, patient_folder_name, 'images')):
            os.mkdir(os.path.join( path_to_copy, group_folder_name, patient_folder_name, 'images'))
            
        # Copy images
        for i in range(1, len(patients_database[patient])):
            shutil.copy(patients_database[patient][i], os.path.join(
                path_to_copy, group_folder_name, patient_folder_name, 'images', patients_database[patient][i].split('\\')[-1]))

copy_dataset(patients_database, 'D:\coronaryProject\dataset')

100%|███████████████████████████████████████████████████████████████████████████| 1071/1071 [00:00<00:00, 11400.34it/s]


In [7]:
for key in patients_database:
    print(key)
    break

1 CTCACOB24121949
