In [None]:
import pandas as pd
import pydicom
import pydicom as dicom
from PIL import Image
import numpy as np
import os
import cv2
import shutil
from tqdm import tqdm
from shutil import copyfile
from numpy import nan as Nan
import cv2
import re

import matplotlib.pyplot as plt

# 1. Copy data without DICOM viewer

In [2]:
def check_wether_patient_has_records(path_to_patient_folder, get_names_of_records=False):
    """
    Tell whether a patient folder contains record files.

    An entry counts as a record when its name contains 'doc' or 'xlsx'.

    Args:
        path_to_patient_folder (str): folder of a single patient.
        get_names_of_records (bool): whether to also return the names of the record files.

    Returns:
        bool: True if the patient folder contains at least one record, False otherwise.
        or
        tuple (bool, list): the same flag together with the names of the record files
                            (only when get_names_of_records is True).
    """
    record_names = []
    for entry in os.listdir(path_to_patient_folder):
        if 'doc' in entry or 'xlsx' in entry:
            record_names.append(entry)

    has_records = len(record_names) >= 1
    if get_names_of_records:
        return has_records, record_names
    return has_records

def check_wether_patient_has_image_data(path_to_patient_folder):
    """
    Tell whether a patient folder contains image data.

    Image data is detected by an entry whose name contains 'DICOMOBJ'.

    Returns:
        bool : True if the patient folder contains image data, False otherwise
    """
    return any('DICOMOBJ' in entry for entry in os.listdir(path_to_patient_folder))

def get_structure_of_the_dataset(path_to_dataset):
    """
    Index the records and images of every patient in the raw dataset.

    The dataset is scanned as ``path_to_dataset/<report folder>/<patient>/``.
    Images are the entries of the patient's ``DICOMOBJ`` sub-folder; records
    are the files reported by ``check_wether_patient_has_records``.

    Args:
        path_to_dataset (str): root folder of the raw dataset.

    Returns:
        dict :  keys - patient names(str): values (list of strings) - paths to
                the patient's records followed by paths to the patient's images.
                When the same patient name occurs in several report folders,
                the folder visited last overwrites the earlier entry.
    """
    # Root entries whose name contains any of these substrings are not report folders.
    names_to_skip = ['System', 'BIN', '$.BIN', 'Autorun.inf','Seagate', 'SeagateExpansion.ico',
                     'Start_Here_Mac.app', 'Start_Here_Win.exe', 'Warranty.pdf']
    patients_database = {}

    reports_folders = [x for x in os.listdir(path_to_dataset)
                       if not any(i in x for i in names_to_skip)]
    for report_folder in tqdm(reports_folders):
        path_to_report_folder = os.path.join(path_to_dataset, report_folder)
        # A stray file next to the report folders cannot hold patients.
        if not os.path.isdir(path_to_report_folder):
            continue

        for patient in os.listdir(path_to_report_folder):
            path_to_patient = os.path.join(path_to_report_folder, patient)
            # Same for stray files next to the patient folders.
            if not os.path.isdir(path_to_patient):
                continue

            if check_wether_patient_has_image_data(path_to_patient):
                path_to_images = os.path.join(path_to_patient, 'DICOMOBJ')
                patient_images_paths = [os.path.join(path_to_images, x)
                                        for x in os.listdir(path_to_images)]
            else:
                patient_images_paths = []

            _, patient_records = check_wether_patient_has_records(
                                      path_to_patient,
                                      get_names_of_records=True)
            patient_records_paths = [os.path.join(path_to_patient, x) for x in patient_records]

            # Records first, images after: copy_dataset relies on this order.
            patients_database[patient] = patient_records_paths + patient_images_paths

    return patients_database

def copy_dataset(patients_database, path_to_copy):
    """
    Copy only image data and records without DICOM viewer program.

    For every patient that passes the checks below, the first indexed file is
    copied as the record into ``<path_to_copy>/<group>/<patient>/`` and every
    remaining indexed file is copied into the ``images`` folder next to it.

    Args:
        patients_database (dict): dictionary with patients and corresponding
                                  images and records; each value lists the
                                  record paths first and the image paths after
        path_to_copy (str): destination folder, where all dataset will
                            be located

    Returns:
        None
    """
    # Create the destination folder (and any missing parents).
    os.makedirs(path_to_copy, exist_ok=True)

    for patient in tqdm(patients_database):
        patient_files = patients_database[patient]

        # Skip patients with at most two indexed files.
        # NOTE(review): this also skips a patient with one record and one image - confirm the threshold.
        if len(patient_files) <= 2:
            continue

        # The patient folder is rebuilt from the first four backslash-separated
        # components of the first indexed path.
        # NOTE(review): this assumes the dataset root sits directly under the drive - confirm.
        path_to_the_patient = '\\'.join(patient_files[0].split('\\')[:4])
        if not check_wether_patient_has_records(path_to_the_patient):
            continue

        # First path component without its first two characters, lower-cased and
        # with whitespace replaced by underscores.
        # NOTE(review): for an absolute path such as 'D:\\...' this is an empty
        # string, so the group level collapses into path_to_copy - confirm intent.
        group_folder_name = patient_files[0].split('\\')[0][2:]
        group_folder_name = '_'.join([x.lower() for x in group_folder_name.split()])

        patient_folder_name = patient

        # Create <group>/<patient>/images in one go; the parents are created with it.
        path_to_patient_copy = os.path.join(path_to_copy, group_folder_name, patient_folder_name)
        path_to_images_copy = os.path.join(path_to_patient_copy, 'images')
        os.makedirs(path_to_images_copy, exist_ok=True)

        # Copy the record (only the first indexed file is treated as the record).
        shutil.copy(patient_files[0],
                    os.path.join(path_to_patient_copy, patient_files[0].split('\\')[-1]))

        # Copy images into the folder created above. The destination previously
        # omitted group_folder_name and therefore pointed at a folder that was
        # never created whenever the group name was not empty.
        for path_to_image in patient_files[1:]:
            shutil.copy(path_to_image,
                        os.path.join(path_to_images_copy, path_to_image.split('\\')[-1]))



In [None]:
# Root of the raw dataset. Raw string: '\d' is not a valid escape sequence in a normal literal.
PATH_TO_THE_WHOLE_DATASET = r'D:\dataset'

In [None]:
# Index the raw dataset, then copy its records and images to the working folder.
patients_database = get_structure_of_the_dataset(PATH_TO_THE_WHOLE_DATASET)
# Raw string: '\d' is not a valid escape sequence in a normal literal.
copy_dataset(patients_database, r'D:\data')


# 2. Take only MPRs

'AW electronic film', 'Scout', 'Correction Halted 80% - Series 304 - adjust vessel tracking'

'73','81', '71'

'45%','42%','80%',

'CTCA HD STD', 'CTCA','CTCA 73%','CTCA SMARTPHASE','CTCA 79%'

'CALCIUM SCORE', 'CALCIUM SCORE C2'

'SS-Freeze 39% - Original Series 306', 'Auto State - series 405 - 256 images','Auto State - series 301 - 256 images','Auto State - series 3 - 440 images', 'Auto State - series 3 - 464 images', 'SS-Freeze 80% - Original Series 305','SS-Freeze 75% - Original Series 304','SS-Freeze 42% - Original Series 304','SS-Freeze 77% - Original Series 304','SS-Freeze 75% - Original Series 305','SS-Freeze 75% - Original Series 307','Auto State - series 405 - 256 images','SS-Freeze 39% - Original Series 306','SS-Freeze 39% - Original Series 306','SS-Freeze 77% - Original Series 305','SS-Freeze 81% - Original Series 305'

 '3D Saved State -  FINAL 76%'
 
 'SVG-PDA graft'
 
 
 'CALCIUM SCORE', 'CTCA', 'LAD *', 'LAD-D1 *', 'LAD-D2 *', 'OM *', 'PDA *', 'PLB *', 'PLB2 *', 'AW electronic film', 'Scout', 'LCX *', 'LCX-OM *', 'LAD-D3 *', 'LCX-OM2 *', 'PDA2 *', '73', 'RIB *', 'AM *', 'Correction Halted 80% - Series 304 - adjust vessel tracking', 'RCA *', 'RCA-AM *', 'PLB*', 'CTCA HD STD', '45%', 'RCA -PDA*', 'RCA -PLB*', 'RCA -AM*', 'OM2 *', 'RCA-PDA *', 'SS-Freeze 75% - Original Series 304', 'AM*', 'RIB-Branch of RIB *', 'LCX-OM1 *', 'CT ARTERIES', 'CTCA 73%', 'PLB1 *', '42%', 'SS-Freeze 42% - Original Series 304', 'SS-Freeze 77% - Original Series 304', 'LCX TO PDA *', 'LCX TO PDA2 *', 'LCX TO PLB *', 'AM1 *', 'AM2 *', '80%', 'LAD - D4 *', 'SS-Freeze 75% - Original Series 305', 'SS-Freeze 81% - Original Series 305', 'LCX-OM*', 'LCX-OM3 *', 'CALCIUM SCORE C2', '81', 'LCX - OM *', 'LCX - OM2 *', 'SS-Freeze 77% - Original Series 305', 'LAD', 'LAD-D1', 'LAD-D2', 'OM', 'OM2', 'RCA', 'PDA', 'SS-Freeze 75% - Original Series 307', 'RCA-PDA', 'RCA-PLB', 'OM1', 'OM3', 'RIB', 'CTCA SMARTPHASE', 'LCX', 'LCX-OM', 'PLB', 'LAD*', 'LAD-D1*', 'LCX TO PDA*', '71', 'OM3 *', 'PDA1 *', 'Branch 2 of PLB *', 'Branch of PLB *', 'LCX-OM2*', 'LCX-OM3*', 'RCA*', 'RCA-AM*', 'RCA-PDA*', 'RCA-PLB*', 'LAD-D2*', 'LCX*', 'LAD - D1', 'LCX-OM2', 'SS-Freeze 39% - Original Series 306', 'RCA-AM', 'RCA-PDA2', 'RCA-PLB2', 'RCA-PDA2*', 'Auto State - series 405 - 256 images', 'LCX-PLB', 'LCX-PDA', 'LAD-D3', 'LCX-OM1', 'LCX-OM3', 'LCX-PLB1', 'PDA2', '33', 'PLB2', 'PDA*', 'Auto State - series 3 - 256 images', 'CTCA 70%', 'Auto State - series 306 - 256 images', 'Auto State - series 301 - 256 images', '3D Saved State -  FINAL 76%', 'Auto State - series 304 - 256 images', 'LIMA-LAD', 'Aorta-D1', 'Aorta-D2', 'LAD original', 'LAD-D1 original', 'LCX original', 'LCX-OM original', 'PDA original', 'PDA2 original', 'PLB original', 'Aorta-LCX', 'Auto State - series 3 - 440 images', 'CTCA 79%', 'Auto State - series 305 - 256 images', 'Auto State - series 407 - 256 images', 'SS-Freeze 38% - 
Original Series 307', 'PLB1', 'LAD-D2 original', 'LAD-D3 original', 'OM original', 'RCA original', 'RCA-PDA original', 'RCA-PLB1 original', 'RCA-PLB2 original', 'LIMA-LAD graft', 'Aorta-LCX graft', 'Aorta-PDA graft', 'Auto State - series 3 - 464 images', 'Auto State - series 301 - 224 images', 'SS-Freeze 80% - Original Series 305', 'LAD-Branch of D1', 'LCX-Branch of OM', '59', 'Auto State - series 405 - 224 images', 'LCX-PDA2', 'RCA-AM1', 'RCA-AM2', 'OM4', 'LCX-PLB2', 'Auto State - series 304 - 224 images', 'LAD-D4', 'SS-Freeze 52% - Original Series 305', '70%', 'Auto State - series 304 - 376 images', 'SS-Freeze 78% - Original Series 305', 'RCA-PDA1', 'SS-Freeze 79% - Original Series 305', 'SS-Freeze 80% - Original Series 304', 'RCA-PLB1', 'Auto State - series 404 - 224 images', 'SS-Freeze 70% - Original Series 305', 'SS-Freeze 75% - Original Series 306', 'Auto State - series 406 - 224 images', 'Auto State - series 3 - 432 images', 'Auto State - series 403 - 256 images', 'SS-Freeze 75% - Original Series 303', 'LCX-OM4', 'RIB-Branch of RIB', 'Auto State - series 3 - 224 images', 'PDA1', 'OM-Branch of OM', 'SS-Freeze 74% - Original Series 305', 'LOCALISER', 'LAD Original', 'Auto State - series 3 - 456 images', 'LAD-D1 Original', 'Aorta-LAD graft', 'Aorta-D1 graft', 'SVG-PDA graft', 'SS-Freeze 45% - Original Series 306', 'Auto State - series 406 - 256 images', '80', 'SS-Freeze 45% - Original Series 310', 'Auto State - series 410 - 256 images', 'PDLB', 'SS-Freeze 33% - Original Series 305', 'SS-Freeze 50% - Original Series 305', '40', 'LCX-OM1*', 'SS-Freeze 51% - Original Series 308', 'SS-Freeze 72% - Original Series 309', 'RIB*', 'SS-Freeze 62% - Original Series 308', 'OM*', 'LAD-D3*', 'PDA2*', 'CALCIUM SCORE/70', 'SS-Freeze 70% - Original Series 306', 'LCX-PDA *', 'PLB2*', 'SS-Freeze 45% - Original Series 305', 'SS-Freeze 72% - Original Series 305', 'D1*', 'LCX-OM4 *', 'LCX-PLB *', 'RCA-PLB *', 'RCA-PDA1 *', 'RCA-PDA2 *', 'SS-Freeze 77% - Original Series 306', 
'PLB3*', 'OM-Branch of OM *', '85', 'RCA-AM1 *', 'RCA-AM2 *', 'SS-Freeze 45% - Original Series 304', '43', 'LAD-Branch of D2 *', 'SS-Freeze 80% - Original Series 306', '75%', 'LAD TO PDA *', 'PDA TO LAD *', 'PDA3 *', 'RCA-PLB2 *', '28', 'AXIAL LUNG 2.5MM', '43%'
 



 
 
 'LCX-OM*', 'RCA-AM*', 'RCA-PDA*', 'RCA-PLB*', 'AW electronic film', 'Scout', 'CALCIUM SCORE', 'CTCA SMARTPHASE', 'LAD*', 'LAD-D1*', 'LAD-D2*', 'LCX*', 'LAD', 'LAD - D1', 'LCX', 'LCX-OM', 'LCX-OM2', 'RCA-PDA', 'RCA-PLB', 'LAD-D1', 'RIB', 'SS-Freeze 39% - Original Series 306', 'RCA-AM', 'LAD-D2', 'RCA-PDA2', 'RCA-PLB2', 'SS-Freeze 75% - Original Series 305', 'RCA-PDA2*', 'CTCA', 'Auto State - series 405 - 256 images', 'LCX-PLB', 'RCA', 'OM', 'LCX-PDA', 'LAD-D3', 'LCX-OM1', 'LCX-OM3', 'LCX-PLB1', 'PDA', 'PLB', 'PDA2', '33', 'PLB2', 'LCX-OM2*', 'LCX-OM3*', 'PDA*', 'PLB*', 'Auto State - series 3 - 256 images', 'CTCA 70%', 'Auto State - series 306 - 256 images', 'OM2', 'Auto State - series 301 - 256 images', '3D Saved State -  FINAL 76%'

In [9]:

def get_unique_mpr_names(path_to_dataset):
    """
    Prints all unique mpr section names.

    Reads the SeriesDescription of every DICOM file found directly inside
    ``<path_to_dataset>/<patient>/images`` and prints the distinct values in
    order of first appearance.

    Args:
        path_to_dataset (str): folder with one sub-folder per patient.

    Returns:
        None
    """
    unique_modalities = []
    # One pass over the patients is enough; the former outer loop repeated the
    # same scan once per folder without using its loop variable.
    for patient_name in tqdm(os.listdir(path_to_dataset)):
        path_to_images = os.path.join(path_to_dataset, patient_name, 'images')
        # Only the files directly inside `images`; a missing folder yields no files.
        images = next(os.walk(path_to_images), (None, [], []))[2]

        for image_name in images:
            # Only a header tag is needed, so the pixel data is not parsed.
            dicom_obj = pydicom.dcmread(os.path.join(path_to_images, image_name),
                                        stop_before_pixels=True)
            if dicom_obj.SeriesDescription not in unique_modalities:
                unique_modalities.append(dicom_obj.SeriesDescription)
    print('       FINAL RESULTS:      ')
    print(unique_modalities)
    
def copy_mpr_images_per_patient(path_to_dataset, path_to_save):
    """
    Creates in path_to_save folder for each patient, where all MPR DICOM files are located.

    A file is treated as an MPR when its SeriesDescription contains none of
    the substrings listed in ``raw_images``.

    Args:
        path_to_dataset (str): folder with one sub-folder per patient, each
                               holding its DICOM files in ``images``.
        path_to_save (str): destination folder; created if it is missing.

    Returns:
        None
    """
    # Substrings of SeriesDescription values that mark non-MPR series.
    raw_images = ['CTCA', 'CALCIUM SCORE', 'Scout', 'AW electronic film', '40', '81', '85','Freeze','Auto State','33','3D']
    for patient_name in tqdm(os.listdir(path_to_dataset)):
        path_to_images = os.path.join(path_to_dataset, patient_name, 'images')
        # Only the files directly inside `images`; a missing folder yields no files.
        images = next(os.walk(path_to_images), (None, [], []))[2]

        # makedirs also creates path_to_save itself when it does not exist yet.
        path_to_patient_copy = os.path.join(path_to_save, patient_name)
        os.makedirs(path_to_patient_copy, exist_ok=True)

        for image_name in images:
            path_to_image = os.path.join(path_to_images, image_name)
            # Only a header tag is needed, so the pixel data is not parsed.
            dicom_obj = pydicom.dcmread(path_to_image, stop_before_pixels=True)
            if not any(sub_string in dicom_obj.SeriesDescription for sub_string in raw_images):
                copyfile(path_to_image, os.path.join(path_to_patient_copy, image_name))
# copy_mpr_images_per_patient(PATH_TO_DATA, 'lol')

In [10]:
# Input: dataset copy without the DICOM viewer. Output: MPR-only DICOM files per patient.
# Raw strings: '\w' is not a valid escape sequence in a normal literal.
PATH_TO_DATA = r'D:\test\without_viewer'
PATH_TO_SAVE = r'D:\test\only_mprs'

In [12]:
# Keep only the MPR series of every patient.
copy_mpr_images_per_patient(path_to_dataset=PATH_TO_DATA, path_to_save=PATH_TO_SAVE)

# 3. Extract PNG images from the MPRs

In [4]:
def split_mpr_name(mpr_name):
    """
    Normalise an MPR series name.

    All whitespace is dropped first; then the markers '*', 'original',
    'LIMA-', 'Branchof', 'TOPDA' and 'PDATO' are removed, in that order.

    Returns:
        str: the normalised name
    """
    compact_name = "".join(mpr_name.split())
    for marker in ('*', 'original', 'LIMA-', 'Branchof', 'TOPDA', 'PDATO'):
        compact_name = compact_name.replace(marker, '')
    return compact_name

def get_patient_dictionary(path_to_patient_folder):
    """
    Group the DICOM files of a patient folder by series.

    Every entry of the folder is read as a DICOM file and grouped by its
    SeriesDescription.

    Returns:
        dict: key - SeriesDescription of the images; value - list of DICOM
              objects sorted in ascending order of their InstanceNumber.
    """
    series_by_description = {}

    for file_name in os.listdir(path_to_patient_folder):
        dicom_obj = dicom.dcmread(os.path.join(path_to_patient_folder, file_name))
        series_by_description.setdefault(dicom_obj.SeriesDescription, []).append(dicom_obj)

    # order the slices of every series by their instance number
    for slices in series_by_description.values():
        slices.sort(key=lambda dicom_obj: dicom_obj.InstanceNumber)

    return series_by_description

def get_pixels_hu(list_of_imgs):
    """
    Convert a stack of images into Hounsfield units (HU).

    The rescale slope and intercept are taken from the first image only and
    applied to the whole stack.

    Returns:
        numpy array (int16): stacked pixel data in HU, one slice per image
    """
    # Stack the slices and work in int16.
    volume = np.stack([s.pixel_array for s in list_of_imgs]).astype(np.int16)

    # Pixels marked with -2000 are set to 0.
    volume[volume == -2000] = 0

    reference_slice = list_of_imgs[0]
    intercept = reference_slice.RescaleIntercept
    slope = reference_slice.RescaleSlope

    # Scale in float64, then truncate back to int16.
    if slope != 1:
        volume = (slope * volume.astype(np.float64)).astype(np.int16)

    volume += np.int16(intercept)

    return np.array(volume, dtype=np.int16)

In [5]:
# path_to_data = r'D:\coronaryProject\dataset\binary_classification_MPR\images'
# path_to_new_data = r'E:\ONLY_LAD\\'
# Input: one folder per patient with the MPR DICOM files (the output folder of section 2).
path_to_data = r'D:\test\only_mprs'
# Output root for the PNG images written by the export loop below.
path_to_new_data = r'D:\test\lad_png_images'
# Names of the patient folders found in the input folder.
list_of_patients = os.listdir(path_to_data)
# patient_dictionary = get_patient_dictionary(path_to_data + '\\'+ list_of_patients[0])

In [16]:
def _to_report_label(series_description):
    """Map a SeriesDescription to the label used in the reports, e.g. 'LAD-D1 *' -> 'D-1'."""
    return split_mpr_name(series_description).replace('LAD-', '').replace('D','D-').replace('AD-', 'AD')


# Export every LAD-related series of every patient as PNG images:
#   <path_to_new_data>/<patient>/<report label>/<patient>_<InstanceNumber>.png
for patient_name in tqdm(list_of_patients):
    patient_dictionary = get_patient_dictionary(os.path.join(path_to_data, patient_name))

    # Report labels of the series whose normalised name mentions the LAD; only these are exported.
    lad_labels = {_to_report_label(x) for x in patient_dictionary if 'LAD' in split_mpr_name(x)}

    # Re-key the series by report label in a fresh dict. Renaming in place deleted
    # every series whose label equals its original name (e.g. 'LAD'), because the
    # entry was assigned to itself and then removed.
    # If several series map to the same label, the one visited last is kept.
    series_by_label = {}
    for series_description, dicom_files in patient_dictionary.items():
        series_by_label[_to_report_label(series_description)] = dicom_files

    path_to_patient_images = os.path.join(path_to_new_data, patient_name)
    os.makedirs(path_to_patient_images, exist_ok=True)

    for label, dicom_files in series_by_label.items():
        if label not in lad_labels:
            continue

        path_to_label_images = os.path.join(path_to_patient_images, label)
        os.makedirs(path_to_label_images, exist_ok=True)

        for dicom_file in dicom_files:
            # Pixel values are min-max normalised to the range [0, 255] before writing.
            cv2.imwrite(os.path.join(path_to_label_images,
                                     patient_name+'_'+str(dicom_file.InstanceNumber)+'.png'),
                        cv2.normalize(dicom_file.pixel_array, None, alpha = 0,
                                      beta = 255, norm_type = cv2.NORM_MINMAX, dtype = cv2.CV_32F)
                       )

    




  0%|                                                                                           | 0/27 [00:00<?, ?it/s]


  4%|███                                                                                | 1/27 [00:06<02:47,  6.45s/it]


  7%|██████▏                                                                            | 2/27 [00:19<03:32,  8.52s/it]


 11%|█████████▏                                                                         | 3/27 [00:52<06:19, 15.80s/it]


 15%|████████████▎                                                                      | 4/27 [01:18<07:12, 18.78s/it]


 19%|███████████████▎                                                                   | 5/27 [01:49<08:13, 22.41s/it]


 22%|██████████████████▍                                                                | 6/27 [02:27<09:32, 27.24s/it]


 26%|█████████████████████▌                                                             | 7/27 [02:55<09:07, 27.38s/it]


 30%|████████████████

# 4. Copy reports

In [18]:
def copy_mpr_records(path_to_dataset, path_to_save):
    """
    Copy all records from the dataset to path_to_save folder.

    For every patient folder in path_to_dataset, the first entry whose name
    contains 'xlsx' or 'doc' is copied into path_to_save under its own file
    name. Patients without such an entry are skipped.

    Args:
        path_to_dataset (str): folder with one sub-folder per patient.
        path_to_save (str): destination folder; created (with any missing
                            parents) if it does not exist.

    Returns:
        None
    """
    os.makedirs(path_to_save, exist_ok=True)

    for patient_name in os.listdir(path_to_dataset):
        path_to_patient = os.path.join(path_to_dataset, patient_name)
        # Stray files next to the patient folders cannot contain records.
        if not os.path.isdir(path_to_patient):
            continue

        records = [x for x in os.listdir(path_to_patient) if ('xlsx' in x) or ('doc' in x)]
        if records:
            # Only the first record found is copied.
            copyfile(os.path.join(path_to_patient, records[0]),
                     os.path.join(path_to_save, records[0]))


In [24]:
# Rebinds the constants of section 2: the records are read from the dataset copy
# without the viewer and collected in a single folder.
PATH_TO_DATA = r'd:\test\without_viewer'
PATH_TO_SAVE = r'd:\test\records'

In [25]:
# Collect one record per patient in the records folder.
copy_mpr_records(path_to_dataset=PATH_TO_DATA, path_to_save=PATH_TO_SAVE)

# 5. Merge Reports

In [26]:
def strip_mpr_lad_name(mpr_name):
    """
    Strip the MPR name of the LAD artery. This step is needed because the name
    in the doctor's report is not equal to the name in the MPR.

    All whitespace, every '*' and every 'original' are removed.

    Returns:
        - str: stripped string
    """
    without_spaces = "".join(mpr_name.split())
    without_star = without_spaces.replace('*', '')
    return without_star.replace('original', '')

def read_and_strip_record(path_to_record):
    '''
    Read record file and remove empty rows and rows with all NaNs.

    The sheet is read without a header; columns 0 and 1 are renamed to 'a'
    and 'b'. Remaining NaNs and empty strings become two spaces, and every
    row whose 'a' cell consists of whitespace only is dropped.

    Args:
        path_to_record (str): path to the Excel record.

    Returns:
        - Pandas DataFrame: the cleaned record; the original row index is kept
    '''
    excel_file = pd.read_excel(path_to_record,index_col=None, header=None)
    # dropna returns a new frame; the result used to be discarded.
    excel_file = excel_file.dropna(how='all')
    excel_file = excel_file.rename(columns={0: 'a', 1: 'b'})
    excel_file = excel_file.fillna('  ')
    excel_file = excel_file.replace('', '  ', regex=True)
    excel_file = excel_file.drop(excel_file[excel_file['a'].str.isspace()].index)
    return excel_file

def get_lad_info_from_report(striped_record, artery_type):
    """
    Takes a stripped record (without any empty lines and NaNs) and returns the
    info about one artery type only.

    The first element of the result is the 'b' cell of the first row. The
    lines of column 'a' that follow the row containing artery_type are then
    collected until the next line written entirely in upper case.

    Returns:
        - list: the leading 'b' cell followed by the strings with info about
                the given artery type
    """
    collected = [striped_record.iloc[0]['b']]
    inside_section = False

    for line in striped_record['a']:
        if inside_section:
            # an upper-case line ends the section
            if line.isupper():
                break
            collected.append(line)

        if artery_type in line:
            inside_section = True

    return collected

def get_level_of_stenosis_from_string(artery_info):
    """
    Find the stenosis percentages mentioned in a line of the report.

    Returns:
        - list of str: each element is a string with a percentage of stenosis,
                       stripped of surrounding whitespace.
    """
    matches = re.findall(r'.\d{1,3}.?\d{1,3}\%', artery_info)
    return list(map(str.strip, matches))

In [27]:
# Folder with the collected record files (the output folder of section 4).
path_to_records = r'd:\test\records'
# File names of all records to merge.
list_of_files = os.listdir(path_to_records)

In [30]:
# One row per record: the patient id plus the stenosis score of every LAD section.
# Sections that are not mentioned in a record keep the placeholder '-'.
lad_columns = ['PATIENT_ID','PROXIMAL', 'MID', 'DISTAL', 'D-1', 'D-2', 'D-3', 'D-4']

# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
extracted_rows = []

for file_name in tqdm(list_of_files):
    cur_file = read_and_strip_record(os.path.join(path_to_records, file_name))
    cur_patient_lad_info = get_lad_info_from_report(cur_file, 'LEFT ANTERIOR')

    new_row = dict.fromkeys(lad_columns, '-')
    # The first element is the patient id; the rest are the lines of the LAD section.
    new_row['PATIENT_ID'] = cur_patient_lad_info[0]

    for line_info in cur_patient_lad_info[1:]:
        # First column whose name occurs in the line (upper, lower or title case).
        artery_area_name = [x for x in lad_columns
                            if x in line_info or x.lower() in line_info or x.title() in line_info]
        if not artery_area_name:
            continue
        stenosis_score = get_level_of_stenosis_from_string(line_info)
        # A line without a percentage is recorded as 'NORMAL'.
        new_row[artery_area_name[0]] = stenosis_score[0] if stenosis_score else 'NORMAL'

    extracted_rows.append(new_row)

extracted_lad_df = pd.DataFrame(extracted_rows, columns=lad_columns)




  0%|                                                                                           | 0/27 [00:00<?, ?it/s]


 11%|█████████▏                                                                         | 3/27 [00:00<00:00, 25.22it/s]


 22%|██████████████████▍                                                                | 6/27 [00:00<00:00, 24.07it/s]


 33%|███████████████████████████▋                                                       | 9/27 [00:00<00:00, 24.35it/s]


 44%|████████████████████████████████████▍                                             | 12/27 [00:00<00:00, 24.91it/s]


 56%|█████████████████████████████████████████████▌                                    | 15/27 [00:00<00:00, 24.69it/s]


 67%|██████████████████████████████████████████████████████▋                           | 18/27 [00:00<00:00, 25.04it/s]


 78%|███████████████████████████████████████████████████████████████▊                  | 21/27 [00:00<00:00, 23.95it/s]


 89%|████████████████

In [33]:
# Preview the first rows of the extracted table.
extracted_lad_df.head()

Unnamed: 0,PATIENT_ID,PROXIMAL,MID,DISTAL,D-1,D-2,D-3,D-4
0,COV10031966,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL,-,-
1,CTCAALC04021959,NORMAL,<25%,NORMAL,NORMAL,NORMAL,-,-
2,CTCAALS24081961,NORMAL,NORMAL,NORMAL,NORMAL,-,-,-
3,CTCAARD10111950,NORMAL,<25%,NORMAL,NORMAL,NORMAL,-,-
4,CTCABIB16101983,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL,-,-


In [34]:
# Save the extracted table to an Excel file in the working directory, without the index column.
extracted_lad_df.to_excel('lad_reports.xlsx', index=False)

# 6. Create binary dataset

In [None]:
def get_label_for_patient(lad_segment, reports, patient_id):
    """
    Derive the binary label of one LAD segment of a patient from the reports.

    A score of '<25%', '25%', 'NORMAL' or '-' gives label 0; any other score
    gives label 1. For the segment 'LAD' the columns MID, PROXIMAL and DISTAL
    are checked together and the label is 1 if any of them gives label 1.

    Args:
        lad_segment (str): 'LAD' or the name of a column of reports
        reports (DataFrame): table with a PATIENT_ID column and one column per section
        patient_id (str): value to look up in the PATIENT_ID column

    Returns:
        tuple (int, str): the label and the score it was derived from; for
                          'LAD' the three scores are joined with '___'.

    Example:
        get_label_for_patient('LAD', reports, 'DDJ261Z' )

    """
    label_zero_scores = ['<25%','25%', 'NORMAL', '-']
    patient_rows = reports.loc[reports['PATIENT_ID'] == patient_id]

    if lad_segment != 'LAD':
        segment_score = patient_rows[lad_segment].iloc[0]
        segment_label = 0 if segment_score in label_zero_scores else 1
        return segment_label, segment_score

    main_scores = patient_rows[['MID', 'PROXIMAL', 'DISTAL']].iloc[0]
    label = int(any(score not in label_zero_scores for score in main_scores))
    return label, '___'.join(main_scores.values)

def get_labels(path_to_patient, reports):
    """
    Label every image type found in a patient folder.

    The patient id is the last component of path_to_patient; the entries of
    the folder are used as the segment names.

    Args:
        path_to_patient (str): folder of one patient; a trailing separator is allowed
        reports (DataFrame): reports table passed on to get_label_for_patient

    Returns:
        - dict: key(str) - type of the artery, value - the result of
                get_label_for_patient for that type, i.e. a tuple of the
                label (0 or 1) and the score it was derived from
    """
    types_of_images = os.listdir(path_to_patient)
    # Take the last path component with the platform's own path rules, so that
    # backslash-separated paths (used in the rest of the notebook) work as well
    # as '/'-separated ones.
    patient_name = os.path.basename(os.path.normpath(path_to_patient))

    return {image_type: get_label_for_patient(image_type, reports, patient_name)
            for image_type in types_of_images}

def get_imgs_names_to_the_labels(path_to_patient, labels_dict):
    """
    Group the image names of a patient by the label entry of their branch.

    Only file names containing '_text_deleted' are collected. Branches whose
    score (second element of the label entry) is '-' or '-___-___-' are skipped.

    Args:
        - path_to_patient (str): folder with one sub-folder per branch
        - labels_dict (dict): key - branch of artery(str), value - label entry
                              whose second element is the score string
    Returns:
        - dict: key - the label entry exactly as stored in labels_dict (the
                whole entry, not only the 0/1 label), value(list) - names of
                the images of all branches sharing that entry
    """
    grouped_images = {}

    for branch, label_entry in labels_dict.items():
        # The folder is listed before the skip check, so it has to exist for every branch.
        branch_files = os.listdir(os.path.join(path_to_patient, branch))
        if label_entry[1] in ('-', '-___-___-'):
            continue
        cleaned_images = [name for name in branch_files if '_text_deleted' in name]
        grouped_images.setdefault(label_entry, []).extend(cleaned_images)

    return grouped_images
