In [88]:
import pandas as pd
import pydicom as dicom
from PIL import Image
import numpy as np
import os
import cv2
import shutil
from tqdm import tqdm
import matplotlib.pyplot as plt

# Helper functions


In [11]:
def check_wether_patient_has_records(path_to_patient_folder, get_names_of_records=False):
    """
    Check whether a patient folder contains report files (Word or Excel).

    An entry counts as a record when its file extension is ``.doc``,
    ``.docx`` or ``.xlsx`` (compared case-insensitively). Matching on the
    extension rather than on a substring of the name keeps unrelated entries
    such as ``doctor_notes.txt`` or a ``docs`` sub-folder out of the result.

    Args:
        path_to_patient_folder (str): path to the folder of a single patient.
        get_names_of_records (bool): whether to also return the names of
            the record files.

    Returns:
        bool: True if the patient folder contains at least one record and
            False otherwise.
        or
        tuple (bool, list): the same flag plus the sorted names (not full
            paths) of the record files, when ``get_names_of_records`` is True.

    Raises:
        OSError: propagated from ``os.listdir`` when the folder cannot be listed.
    """
    record_extensions = ('.doc', '.docx', '.xlsx')
    # os.listdir returns entries in arbitrary order; sort for reproducible results.
    names_of_the_records = sorted(
        x for x in os.listdir(path_to_patient_folder)
        if os.path.splitext(x)[1].lower() in record_extensions
    )
    has_records = len(names_of_the_records) >= 1
    if get_names_of_records:
        return has_records, names_of_the_records
    return has_records

def check_wether_patient_has_image_data(path_to_patient_folder):
    """
    Check whether a patient folder contains the ``DICOMOBJ`` image sub-folder.

    Only a directory named exactly ``DICOMOBJ`` counts. Look-alike entries
    such as ``DICOMOBJ.zip`` or ``DICOMOBJ_old`` are ignored, because callers
    go on to list ``<patient folder>/DICOMOBJ`` and would fail on them.

    Args:
        path_to_patient_folder (str): path to the folder of a single patient.

    Returns:
        bool : True if the patient folder contains a ``DICOMOBJ`` directory
            and False otherwise (including when the patient folder itself
            does not exist).
    """
    return os.path.isdir(os.path.join(path_to_patient_folder, 'DICOMOBJ'))

def get_structure_of_the_dataset(path_to_dataset):
    """
    Walk the dataset and collect the record and image paths of every patient.

    Layout read by this function:
        <path_to_dataset>/<report folder>/<patient folder>/<record files>
        <path_to_dataset>/<report folder>/<patient folder>/DICOMOBJ/<image files>

    Args:
        path_to_dataset (str): root folder of the dataset.

    Returns:
        dict :  keys - patient names(str): values (list of strings) - paths to
                the records first, followed by paths to the images. If the same
                patient folder name occurs in several report folders, the one
                visited last replaces the earlier entry.
    """
    patients_database = {}

    # Top-level entries whose name contains 'System' or 'BIN' are ignored
    # (presumably Windows service folders on the drive - confirm).
    reports_folders = [x for x in os.listdir(path_to_dataset) if 'System' not in x and 'BIN' not in x]

    for report_folder in reports_folders:
        path_to_report_folder = os.path.join(path_to_dataset, report_folder)
        # A stray file next to the report folders must not abort the whole scan.
        if not os.path.isdir(path_to_report_folder):
            continue

        for patient in os.listdir(path_to_report_folder):
            path_to_patient = os.path.join(path_to_report_folder, patient)
            if not os.path.isdir(path_to_patient):
                continue

            if check_wether_patient_has_image_data(path_to_patient):
                path_to_images = os.path.join(path_to_patient, 'DICOMOBJ')
                patient_images_paths = [os.path.join(path_to_images, x)
                                        for x in os.listdir(path_to_images)]
            else:
                patient_images_paths = []

            _, patient_records = check_wether_patient_has_records(
                                      path_to_patient,
                                      get_names_of_records=True)
            patient_records_paths = [os.path.join(path_to_patient, x) for x in patient_records]

            # Records go first: later cells inspect element 0 to find the report.
            patients_database[patient] = patient_records_paths + patient_images_paths

    return patients_database

# Dataset statistics

In [29]:
# Root of the dataset. There is no trailing separator, so the joined paths come
# out drive-relative, e.g. 'E:CTCA NO RECONS ...' in the outputs of the cells below.
PATH_TO_THE_WHOLE_DATASET = 'E:'

In [30]:
# Scan the dataset once: maps each patient folder name to the list of its
# record and image paths.
patients_database = get_structure_of_the_dataset(PATH_TO_THE_WHOLE_DATASET)

### Patients without images, or with completely empty folders

In [40]:
# Show every patient for whom at most one path was collected
for patient_name, patient_paths in patients_database.items():
    if len(patient_paths) > 1:
        continue
    print(patient_name, patient_paths)

18 CTCAHAS15091955 ['E:CTCA NO RECONS EXCEL REPORTS 1-30\\18 CTCAHAS15091955\\REPORT CTCAHAS15091955.xlsx']
19 CTCAYOM10121943 ['E:CTCA NO RECONS EXCEL REPORTS 1-30\\19 CTCAYOM10121943\\REPORT CTCAYOM10121943.xlsx']
32 CTCAPAS14011976 []
41 CTCADUA20091962 ['E:CTCA NO RECONS EXCEL REPORTS 31-50\\41 CTCADUA20091962\\REPORT CTCADUA20091962.xlsx']
42 CTCANAB01011955 ['E:CTCA NO RECONS EXCEL REPORTS 31-50\\42 CTCANAB01011955\\REPORT CTCANAB01011955.xlsx']
76 CTCARER11081939 ['E:CTCA NO RECONS EXCEL REPORTS 71-90\\76 CTCARER11081939\\REPORT CTCARER11081939.xlsx']
119 CTCADIJ08101969 ['E:CTCA NO RECONS EXCEL REPORTS 111-130\\119 CTCADIJ08101969\\REPORT CTCADIJ08101969.xlsx']
127 CTCALUT11021950 ['E:CTCA NO RECONS EXCEL REPORTS 111-130\\127 CTCALUT11021950\\REPORT CTCALUT11021950.xlsx']
129 CTCABOE18111946 ['E:CTCA NO RECONS EXCEL REPORTS 111-130\\129 CTCABOE18111946\\REPORT CTCABOE18111946.xlsx']
275 CTCAMCM21081951 ['E:CTCA NO RECONS EXCEL REPORTS 271-290\\275 CTCAMCM21081951\\REPORT CTCAMC

### Patients with very few images

In [43]:
# Show every patient for whom between 2 and 20 paths were collected
for patient_name, patient_paths in patients_database.items():
    if 1 < len(patient_paths) <= 20:
        print(patient_name, patient_paths)

24 CTCAAHG14111965 ['E:CTCA NO RECONS EXCEL REPORTS 1-30\\24 CTCAAHG14111965\\REPORT CTCAAHG14111965.xlsx', 'E:CTCA NO RECONS EXCEL REPORTS 1-30\\24 CTCAAHG14111965\\DICOMOBJ\\00000001']
140 CTCACHP07101955 ['E:CTCA NO RECONS EXCEL REPORTS 131-150\\140 CTCACHP07101955\\REPORT CTCACHP07101955.xlsx', 'E:CTCA NO RECONS EXCEL REPORTS 131-150\\140 CTCACHP07101955\\DICOMOBJ\\00000001']


### Patients without Records

In [32]:
for patient_name, patient_paths in patients_database.items():
    # Completely empty patients are not reported here
    if not patient_paths:
        continue
    # Record paths are stored first, so a first entry that is neither
    # a docx nor an xlsx file means no report was found for the patient
    if not patient_paths[0].endswith(('docx', 'xlsx')):
        print(patient_name)

1 CTCACOB24121949
349 CTCANGM17081945


### Number of DOCX and XLSX

In [34]:
# Only the first collected path of each non-empty patient is inspected
first_paths = [paths[0] for paths in patients_database.values() if paths]
word = sum(1 for path in first_paths if path.endswith('docx'))
excel = sum(1 for path in first_paths if path.endswith('xlsx'))

print("Num of docs: ", word)
print("Num of excels: ", excel)

Num of docs:  20
Num of excels:  326


# Copy only images and reports

In [108]:
def copy_dataset(patients_database, path_to_copy):
    """
    Copy the records and images of every usable patient into a clean tree:

        <path_to_copy>/<group folder>/<patient folder>/<record files>
        <path_to_copy>/<group folder>/<patient folder>/images/<image files>

    The group folder name is the source report folder name, lower-cased and
    with whitespace replaced by '_'. The patient folder name is the source
    patient folder name with whitespace replaced by '_'.

    Patients with two or fewer collected paths, or without any record file,
    are skipped.

    Args:
        patients_database (dict): dictionary with patients and corresponding
                                  images and records
        path_to_copy (str): destination folder, where all dataset will
                            be located

    Returns:
        None
    """
    # Create folder to the dataset (missing parent folders included)
    os.makedirs(path_to_copy, exist_ok=True)

    for patient in tqdm(patients_database):
        patient_paths = patients_database[patient]

        # Skip patients with (almost) no data: at most two collected paths
        if len(patient_paths) <= 2:
            continue

        # Images live in the 'DICOMOBJ' sub-folder and records directly in the
        # patient folder. Splitting on that keeps additional record files out
        # of the 'images' folder.
        images = [p for p in patient_paths
                  if os.path.basename(os.path.dirname(p)) == 'DICOMOBJ']
        records = [p for p in patient_paths
                   if os.path.basename(os.path.dirname(p)) != 'DICOMOBJ']

        # Skip patients without records
        if not records:
            continue

        # Derive folder names with os.path rather than by splitting on '\\' and
        # slicing off a drive prefix, so any dataset location works.
        source_patient_folder = os.path.dirname(records[0])
        source_group_folder = os.path.dirname(source_patient_folder)
        group_folder_name = '_'.join(
            x.lower() for x in os.path.basename(source_group_folder).split())
        patient_folder_name = '_'.join(os.path.basename(source_patient_folder).split())

        # Create directories (group, patient and images folder in one call)
        destination_patient_folder = os.path.join(
            path_to_copy, group_folder_name, patient_folder_name)
        destination_images_folder = os.path.join(destination_patient_folder, 'images')
        os.makedirs(destination_images_folder, exist_ok=True)

        # Copy records
        for record in records:
            shutil.copy(record, os.path.join(
                destination_patient_folder, os.path.basename(record)))

        # Copy images
        for image in images:
            shutil.copy(image, os.path.join(
                destination_images_folder, os.path.basename(image)))

# Raw string: '\c' and '\d' are not valid escape sequences in a normal literal.
copy_dataset(patients_database, r'D:\coronaryProject\dataset')




  0%|                                                                                          | 0/350 [00:00<?, ?it/s]


  1%|▍                                                                                 | 2/350 [00:03<10:56,  1.89s/it]


  1%|▋                                                                                 | 3/350 [00:09<17:56,  3.10s/it]


  1%|▉                                                                                 | 4/350 [00:15<22:22,  3.88s/it]


  1%|█▏                                                                                | 5/350 [00:22<26:59,  4.69s/it]


  2%|█▍                                                                                | 6/350 [00:33<37:59,  6.63s/it]


  2%|█▋                                                                                | 7/350 [00:42<42:33,  7.44s/it]


  2%|█▊                                                                                | 8/350 [00:47<37:47,  6.63s/it]


  3%|██              

 42%|█████████████████████████████████▎                                              | 146/350 [18:44<27:29,  8.08s/it]


 42%|█████████████████████████████████▌                                              | 147/350 [18:49<24:37,  7.28s/it]


 42%|█████████████████████████████████▊                                              | 148/350 [18:54<22:29,  6.68s/it]


 43%|██████████████████████████████████                                              | 149/350 [19:00<21:14,  6.34s/it]


 43%|██████████████████████████████████▎                                             | 150/350 [19:08<23:12,  6.96s/it]


 43%|██████████████████████████████████▌                                             | 151/350 [19:18<25:50,  7.79s/it]


 43%|██████████████████████████████████▋                                             | 152/350 [19:24<23:27,  7.11s/it]


 44%|██████████████████████████████████▉                                             | 153/350 [19:29<21:52,  6.66s/it]


 44%|███████████████████

 81%|████████████████████████████████████████████████████████████████▍               | 282/350 [36:34<06:52,  6.06s/it]


 81%|████████████████████████████████████████████████████████████████▋               | 283/350 [36:39<06:34,  5.88s/it]


 81%|████████████████████████████████████████████████████████████████▉               | 284/350 [36:46<06:40,  6.07s/it]


 81%|█████████████████████████████████████████████████████████████████▏              | 285/350 [36:55<07:41,  7.11s/it]


 82%|█████████████████████████████████████████████████████████████████▎              | 286/350 [37:02<07:34,  7.10s/it]


 82%|█████████████████████████████████████████████████████████████████▌              | 287/350 [37:14<08:57,  8.53s/it]


 82%|█████████████████████████████████████████████████████████████████▊              | 288/350 [37:20<07:58,  7.72s/it]


 83%|██████████████████████████████████████████████████████████████████▎             | 290/350 [37:34<07:31,  7.52s/it]


 83%|███████████████████