In [1]:
import pandas as pd
import pydicom
import pydicom as dicom
from PIL import Image
import numpy as np
import os
import cv2
import shutil
# from __future__ import print_function
from tqdm import tqdm_notebook as tqdm
from shutil import copyfile
from numpy import nan as Nan
from ast import literal_eval
import h5py
import cv2
import re

import matplotlib.pyplot as plt

# 1. Copy data without DICOM viewer

In [94]:
def check_wether_patient_has_records(path_to_patient_folder, get_names_of_records=False):
    
    """
    Args:
        get_names_of_records (bool): wether return names of record files
        
    Returns:
        bool: Retuns value. True if patient folder contains the records and False otherwise. 
        or
        tuple (bool, list): returns bool and names of the record files for the patient.  
    """
    names_of_the_records = [x for x in os.listdir(path_to_patient_folder) if 'doc' in x or 'xlsx' in x]
    if get_names_of_records:
        return len(names_of_the_records) >=1, names_of_the_records
    else:
        return len(names_of_the_records) >=1

def check_wether_patient_has_image_data(path_to_patient_folder):
    """
    Returns:
        bool : Returns True if patient folder contatin image data and False otherwise
    """
    names_of_the_records = [x for x in os.listdir(path_to_patient_folder) if 'DICOMOBJ' in x]
    return len(names_of_the_records) >= 1

def get_structure_of_the_dataset(path_to_dataset):
    """
    
    Returns:
        dict :  keys - patient names(str): values (list of strings) - paths to the images and records
                                                    
    """
    patients_database = {}
    
    reports_folders = [x for x in os.listdir(path_to_dataset) if not any(i in x for i in
                            ['System', 'BIN', '$.BIN', 'Autorun.inf','Seagate', 'SeagateExpansion.ico', 
                             'Start_Here_Mac.app', 'Start_Here_Win.exe', 'Warranty.pdf'])]
                       #'System' not in x and 'BIN' not in x]
    for report_folder in tqdm(reports_folders):
        patients_per_folder = os.listdir(os.path.join(path_to_dataset, report_folder))
        
        for patient in patients_per_folder:
            
            files_in_patient_folder = os.listdir(os.path.join(path_to_dataset, report_folder, patient))
            
            if check_wether_patient_has_image_data(os.path.join(path_to_dataset, report_folder, patient)):
                patient_images = os.listdir(os.path.join(path_to_dataset, report_folder, patient, 'DICOMOBJ'))
                patient_images_paths = [os.path.join(path_to_dataset, report_folder, patient, 'DICOMOBJ', x) 
                                 for x in patient_images]
            else:
                patient_images = []
                patient_images_paths = []
            _, patient_records = check_wether_patient_has_records(
                                      os.path.join(path_to_dataset, report_folder, patient), 
                                      get_names_of_records=True)
            patient_records_paths = [os.path.join(path_to_dataset, report_folder, patient, x) for x in patient_records]
            patients_database[patient] = []
            patients_database[patient] += patient_records_paths
            patients_database[patient] += patient_images_paths
    
    return patients_database

def copy_dataset(patients_database, path_to_copy):
    """
    Copy only image data and records without DICOM viewer program
    Args:
        patients_database (dict): dictionary with patients and corresponding 
                                  images and records
        path_to_copy (str): destination folder, where all dataset will
                            be located
        
    Returns:
        None
    """
    # Create folder to the dataset
    if not os.path.exists(path_to_copy):
        os.mkdir(path_to_copy)
    
    for patient in tqdm(patients_database):
        # Check wether patient's folder contains images
        if len(patients_database[patient]) <=2:
            continue
        # Check wether patient contains the records
        path_to_the_patient = patients_database[patient][0]

        path_to_the_patient = '\\'.join(path_to_the_patient.split('\\')[:4])
        if not check_wether_patient_has_records(path_to_the_patient):
            continue

        group_folder_name = patients_database[patient][0].split('\\')[0][2:]
        group_folder_name = '_'.join([x.lower() for x in group_folder_name.split()])
        
#         patient_folder_name = patients_database[patient][0].split('\\')[1]
        patient_folder_name = patient #'_'.join([x for x in patient_folder_name.split()])

        # Create directories
        if not os.path.exists(os.path.join(path_to_copy, group_folder_name)):
            os.mkdir(os.path.join(path_to_copy, group_folder_name))
        if not os.path.exists(os.path.join(path_to_copy, group_folder_name, patient_folder_name)):
            os.mkdir(os.path.join(path_to_copy, group_folder_name, patient_folder_name))
        
        # Copy Records
        shutil.copy(patients_database[patient][0], os.path.join(
                path_to_copy, group_folder_name, patient_folder_name, patients_database[patient][0].split('\\')[-1]))
        
        # Create folder patients's for images
        if not os.path.exists(os.path.join( path_to_copy, group_folder_name, patient_folder_name, 'images')):
            os.mkdir(os.path.join( path_to_copy, group_folder_name, patient_folder_name, 'images'))
            

        # Copy images
        for i in range(1, len(patients_database[patient])):
            shutil.copy(patients_database[patient][i], os.path.join(
                path_to_copy, patient_folder_name, 'images', patients_database[patient][i].split('\\')[-1]))



In [95]:
PATH_TO_THE_WHOLE_DATASET = 'D:\dataset'

In [None]:
patients_database = get_structure_of_the_dataset(PATH_TO_THE_WHOLE_DATASET)
copy_dataset(patients_database, 'D:\data')


# 2. Take only MPRs

'AW electronic film', 'Scout', 'Correction Halted 80% - Series 304 - adjust vessel tracking'

'73','81', '71'

'45%','42%','80%',

'CTCA HD STD', 'CTCA','CTCA 73%','CTCA SMARTPHASE','CTCA 79%'

'CALCIUM SCORE', 'CALCIUM SCORE C2'

'SS-Freeze 39% - Original Series 306', 'Auto State - series 405 - 256 images','Auto State - series 301 - 256 images','Auto State - series 3 - 440 images', ,'Auto State - series 3 - 464 images', 'SS-Freeze 80% - Original Series 305','SS-Freeze 75% - Original Series 304','SS-Freeze 42% - Original Series 304','SS-Freeze 77% - Original Series 304','SS-Freeze 75% - Original Series 305','SS-Freeze 75% - Original Series 307','Auto State - series 405 - 256 images','SS-Freeze 39% - Original Series 306','SS-Freeze 39% - Original Series 306','SS-Freeze 77% - Original Series 305','SS-Freeze 81% - Original Series 305'

 '3D Saved State -  FINAL 76%'
 
 'SVG-PDA graft'
 
 
 'CALCIUM SCORE', 'CTCA', 'LAD *', 'LAD-D1 *', 'LAD-D2 *', 'OM *', 'PDA *', 'PLB *', 'PLB2 *', 'AW electronic film', 'Scout', 'LCX *', 'LCX-OM *', 'LAD-D3 *', 'LCX-OM2 *', 'PDA2 *', '73', 'RIB *', 'AM *', 'Correction Halted 80% - Series 304 - adjust vessel tracking', 'RCA *', 'RCA-AM *', 'PLB*', 'CTCA HD STD', '45%', 'RCA -PDA*', 'RCA -PLB*', 'RCA -AM*', 'OM2 *', 'RCA-PDA *', 'SS-Freeze 75% - Original Series 304', 'AM*', 'RIB-Branch of RIB *', 'LCX-OM1 *', 'CT ARTERIES', 'CTCA 73%', 'PLB1 *', '42%', 'SS-Freeze 42% - Original Series 304', 'SS-Freeze 77% - Original Series 304', 'LCX TO PDA *', 'LCX TO PDA2 *', 'LCX TO PLB *', 'AM1 *', 'AM2 *', '80%', 'LAD - D4 *', 'SS-Freeze 75% - Original Series 305', 'SS-Freeze 81% - Original Series 305', 'LCX-OM*', 'LCX-OM3 *', 'CALCIUM SCORE C2', '81', 'LCX - OM *', 'LCX - OM2 *', 'SS-Freeze 77% - Original Series 305', 'LAD', 'LAD-D1', 'LAD-D2', 'OM', 'OM2', 'RCA', 'PDA', 'SS-Freeze 75% - Original Series 307', 'RCA-PDA', 'RCA-PLB', 'OM1', 'OM3', 'RIB', 'CTCA SMARTPHASE', 'LCX', 'LCX-OM', 'PLB', 'LAD*', 'LAD-D1*', 'LCX TO PDA*', '71', 'OM3 *', 'PDA1 *', 'Branch 2 of PLB *', 'Branch of PLB *', 'LCX-OM2*', 'LCX-OM3*', 'RCA*', 'RCA-AM*', 'RCA-PDA*', 'RCA-PLB*', 'LAD-D2*', 'LCX*', 'LAD - D1', 'LCX-OM2', 'SS-Freeze 39% - Original Series 306', 'RCA-AM', 'RCA-PDA2', 'RCA-PLB2', 'RCA-PDA2*', 'Auto State - series 405 - 256 images', 'LCX-PLB', 'LCX-PDA', 'LAD-D3', 'LCX-OM1', 'LCX-OM3', 'LCX-PLB1', 'PDA2', '33', 'PLB2', 'PDA*', 'Auto State - series 3 - 256 images', 'CTCA 70%', 'Auto State - series 306 - 256 images', 'Auto State - series 301 - 256 images', '3D Saved State -  FINAL 76%', 'Auto State - series 304 - 256 images', 'LIMA-LAD', 'Aorta-D1', 'Aorta-D2', 'LAD original', 'LAD-D1 original', 'LCX original', 'LCX-OM original', 'PDA original', 'PDA2 original', 'PLB original', 'Aorta-LCX', 'Auto State - series 3 - 440 images', 'CTCA 79%', 'Auto State - series 305 - 256 images', 'Auto State - series 407 - 256 images', 'SS-Freeze 38% - Original Series 307', 'PLB1', 'LAD-D2 original', 'LAD-D3 original', 'OM original', 'RCA original', 'RCA-PDA original', 'RCA-PLB1 original', 'RCA-PLB2 original', 'LIMA-LAD graft', 'Aorta-LCX graft', 'Aorta-PDA graft', 'Auto State - series 3 - 464 images', 'Auto State - series 301 - 224 images', 'SS-Freeze 80% - Original Series 305', 'LAD-Branch of D1', 'LCX-Branch of OM', '59', 'Auto State - series 405 - 224 images', 'LCX-PDA2', 'RCA-AM1', 'RCA-AM2', 'OM4', 'LCX-PLB2', 'Auto State - series 304 - 224 images', 'LAD-D4', 'SS-Freeze 52% - Original Series 305', '70%', 'Auto State - series 304 - 376 images', 'SS-Freeze 78% - Original Series 305', 'RCA-PDA1', 'SS-Freeze 79% - Original Series 305', 'SS-Freeze 80% - Original Series 304', 'RCA-PLB1', 'Auto State - series 404 - 224 images', 'SS-Freeze 70% - Original Series 305', 'SS-Freeze 75% - Original Series 306', 'Auto State - series 406 - 224 images', 'Auto State - series 3 - 432 images', 'Auto State - series 403 - 256 images', 'SS-Freeze 75% - Original Series 303', 'LCX-OM4', 'RIB-Branch of RIB', 'Auto State - series 3 - 224 images', 'PDA1', 'OM-Branch of OM', 'SS-Freeze 74% - Original Series 305', 'LOCALISER', 'LAD Original', 'Auto State - series 3 - 456 images', 'LAD-D1 Original', 'Aorta-LAD graft', 'Aorta-D1 graft', 'SVG-PDA graft', 'SS-Freeze 45% - Original Series 306', 'Auto State - series 406 - 256 images', '80', 'SS-Freeze 45% - Original Series 310', 'Auto State - series 410 - 256 images', 'PDLB', 'SS-Freeze 33% - Original Series 305', 'SS-Freeze 50% - Original Series 305', '40', 'LCX-OM1*', 'SS-Freeze 51% - Original Series 308', 'SS-Freeze 72% - Original Series 309', 'RIB*', 'SS-Freeze 62% - Original Series 308', 'OM*', 'LAD-D3*', 'PDA2*', 'CALCIUM SCORE/70', 'SS-Freeze 70% - Original Series 306', 'LCX-PDA *', 'PLB2*', 'SS-Freeze 45% - Original Series 305', 'SS-Freeze 72% - Original Series 305', 'D1*', 'LCX-OM4 *', 'LCX-PLB *', 'RCA-PLB *', 'RCA-PDA1 *', 'RCA-PDA2 *', 'SS-Freeze 77% - Original Series 306', 'PLB3*', 'OM-Branch of OM *', '85', 'RCA-AM1 *', 'RCA-AM2 *', 'SS-Freeze 45% - Original Series 304', '43', 'LAD-Branch of D2 *', 'SS-Freeze 80% - Original Series 306', '75%', 'LAD TO PDA *', 'PDA TO LAD *', 'PDA3 *', 'RCA-PLB2 *', '28', 'AXIAL LUNG 2.5MM', '43%'
 



 
 
 'LCX-OM*', 'RCA-AM*', 'RCA-PDA*', 'RCA-PLB*', 'AW electronic film', 'Scout', 'CALCIUM SCORE', 'CTCA SMARTPHASE', 'LAD*', 'LAD-D1*', 'LAD-D2*', 'LCX*', 'LAD', 'LAD - D1', 'LCX', 'LCX-OM', 'LCX-OM2', 'RCA-PDA', 'RCA-PLB', 'LAD-D1', 'RIB', 'SS-Freeze 39% - Original Series 306', 'RCA-AM', 'LAD-D2', 'RCA-PDA2', 'RCA-PLB2', 'SS-Freeze 75% - Original Series 305', 'RCA-PDA2*', 'CTCA', 'Auto State - series 405 - 256 images', 'LCX-PLB', 'RCA', 'OM', 'LCX-PDA', 'LAD-D3', 'LCX-OM1', 'LCX-OM3', 'LCX-PLB1', 'PDA', 'PLB', 'PDA2', '33', 'PLB2', 'LCX-OM2*', 'LCX-OM3*', 'PDA*', 'PLB*', 'Auto State - series 3 - 256 images', 'CTCA 70%', 'Auto State - series 306 - 256 images', 'OM2', 'Auto State - series 301 - 256 images', '3D Saved State -  FINAL 76%'

In [40]:
def get_unique_mpr_names(path_to_dataset,reports):
    """
    Prints all unique mpr section names.
    """
    folders = os.listdir(path_to_dataset)
    
    lol = ['PDA ', 'PDA original', 'PDA', 'PDA', 'PDA  *']
    raw_pda = 0
    lol2 = ['PLB2 *', 'PLB2*', 'PLB2']
    plb2=0
    lol3 = ['PLB3*', 'PLB3',]
    plb1=0
    unique_modalities = []
    for patient_name in tqdm(os.listdir(os.path.join(path_to_dataset))):
        if patient_name.split(' ')[1] not in ['CTCA14101949',
 '1063 CTCA1979',
 'CTCABLG30101947',
 'CTCABAT27011976',
 'CTCALAR04081987',
 'CTCAREA01101963',
 'CTCAPHN24041956',
 'CTCALOG09081952',
 'CTCALEK24101973',
 'CTCAKAG03101958',
 'CTCAZDV13081958',
 'CTCASMR20051961',
 'CTCAJOJ30031955',
 'CTCALID04021953',
 '1040 CTCA1962']:
            continue
        
        print('hi')
        images = list(os.walk(os.path.join(path_to_dataset, patient_name)))[0][2]
    
        patient_info = reports[reports['PATIENT_ID'] == patient_name.split(' ')[1]]
        patient_info.reset_index(inplace=True)
        print(patient_info)
        print(patient_name)
        res_to_print= []
        for image_name in images:
            dicom_obj = pydicom.dcmread(os.path.join(path_to_dataset,patient_name, image_name))
            
            if dicom_obj.SeriesDescription in lol:
                raw_pda+=1
            elif dicom_obj.SeriesDescription in lol2:
                plb2+=1
            elif dicom_obj.SeriesDescription in lol3:
                plb1+=1
            
            if dicom_obj.SeriesDescription not in unique_modalities:
                unique_modalities.append(dicom_obj.SeriesDescription)
            
            if ('PLB' in dicom_obj.SeriesDescription or 'PDA' in dicom_obj.SeriesDescription) and \
                dicom_obj.SeriesDescription not in res_to_print:
                res_to_print.append(dicom_obj.SeriesDescription)
        print(res_to_print)
    return raw_pda, plb2, plb1
#                     print(unique_modalities)     
#     print('       FINAL RESULTS:      ')
#     print(unique_modalities)
    
def copy_mpr_images_per_patient(path_to_dataset, path_to_save):
    """
    Creates in path_to_save folder for each patient, where all MPR DICOM files are located.
    """
    folders = os.listdir(path_to_dataset)
    raw_images = ['CTCA', 'CALCIUM SCORE', 'Scout', 'AW electronic film', '40', '81', '85','Freeze','Auto State','33','3D']
    for patient_name in tqdm(os.listdir(os.path.join(path_to_dataset))):
        images = list(os.walk(os.path.join(path_to_dataset, patient_name, 'images')))[0][2]
        if not os.path.exists(os.path.join(path_to_save,patient_name)):
            os.mkdir(os.path.join(path_to_save,patient_name))
        for image_name in images:
            dicom_obj = pydicom.dcmread(os.path.join(path_to_dataset,patient_name, 'images', image_name))
            if not any(sub_string in dicom_obj.SeriesDescription for sub_string in raw_images):
                copyfile(os.path.join(path_to_dataset,patient_name, 'images', image_name),
                         os.path.join(path_to_save,patient_name,image_name))
# copy_mpr_images_per_patient(PATH_TO_DATA, 'lol')

In [41]:
PATH_TO_DATA = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/extended_test_set_without_program'
PATH_TO_SAVE = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/test_only_MPR'

df2 = pd.read_excel('ExtractLabels/lad_rca_lcx.xlsx')

df2.drop(['Unnamed: 0', 'LAD', 'D-1', 'D-2', 'D-3','RCA', 'LCX', 'OM', 'OM-1', 'OM-2', 'OM-3'],
         axis=1, inplace=True)

In [42]:
df2[(df2.PLV_RCA != '-') &(df2.PLV_LCX !='-')]

Unnamed: 0,REPORT_ID,PATIENT_ID,PDA_RCA,PLV_RCA,PDA_LCX,PLV_LCX
483,CTCACHN27011976,CTCACHN27011976,NORMAL,NORMAL,-,NORMAL
540,CTCADIJ01051959,CTCADIJ01051959,NORMAL,NORMAL,-,NORMAL


In [43]:
df2[(df2.PDA_RCA != '-') &(df2.PDA_LCX !='-')]

Unnamed: 0,REPORT_ID,PATIENT_ID,PDA_RCA,PLV_RCA,PDA_LCX,PLV_LCX
76,CTCACHY14101949,CTCA14101949,NORMAL,NORMAL,NORMAL,-
113,1063 CTCA1979,1063 CTCA1979,NORMAL,NORMAL,NORMAL,-
192,CTCABLG30101947,CTCABLG30101947,NORMAL,-,NORMAL,70-90%
221,CTCABAT27011976,CTCABAT27011976,NORMAL,NORMAL,NORMAL,-
262,CTCALAR04081987,CTCALAR04081987,NORMAL,NORMAL,NORMAL,-
379,CTCAREA01101963,CTCAREA01101963,NORMAL,NORMAL,NORMAL,-
382,CTCAPHN24041956,CTCAPHN24041956,NORMAL,NORMAL,NORMAL,-
403,CTCALOG09081952,CTCALOG09081952,NORMAL,-,NORMAL,NORMAL
430,CTCALEK24101973,CTCALEK24101973,NORMAL,NORMAL,NORMAL,-
494,CTCAKAG03101958,CTCAKAG03101958,NORMAL,NORMAL,NORMAL,-


In [382]:
raw_pda, plb2, plb1 = get_unique_mpr_names(PATH_TO_DATA, df2)



















  0%|          | 0/743 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    510  CTCAZDV13081958  CTCAZDV13081958  NORMAL  NORMAL  NORMAL       -
901 CTCAZDV13081958




















 10%|▉         | 74/743 [00:00<00:03, 192.23it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PLB *', 'PDA *']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    262  CTCALAR04081987  CTCALAR04081987  NORMAL  NORMAL  NORMAL       -
945 CTCALAR04081987




















 15%|█▍        | 111/743 [00:00<00:04, 154.23it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PDA *', 'PLB *']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    579  CTCAJOJ30031955  CTCAJOJ30031955  NORMAL  NORMAL  NORMAL       -
803 CTCAJOJ30031955




















 20%|█▉        | 148/743 [00:01<00:04, 132.34it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PDA *', 'PLB *']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    382  CTCAPHN24041956  CTCAPHN24041956  NORMAL  NORMAL  NORMAL       -
596 CTCAPHN24041956




















 21%|██        | 157/743 [00:01<00:09, 60.21it/s] [A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PDA *', 'PLB *']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    192  CTCABLG30101947  CTCABLG30101947  NORMAL       -  NORMAL  70-90%
658 CTCABLG30101947




















 29%|██▉       | 218/743 [00:02<00:07, 67.40it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PDA *', 'PLB *']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    430  CTCALEK24101973  CTCALEK24101973  NORMAL  NORMAL  NORMAL       -
910 CTCALEK24101973




















 35%|███▌      | 261/743 [00:02<00:06, 77.56it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PLB *', 'PDA *', 'PLB2 *']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    572  CTCASMR20051961  CTCASMR20051961  NORMAL       -  NORMAL       -
458 CTCASMR20051961




















 47%|████▋     | 351/743 [00:02<00:04, 95.89it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PDA', 'LCX-PLB']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    403  CTCALOG09081952  CTCALOG09081952  NORMAL       -  NORMAL  NORMAL
649 CTCALOG09081952




















 63%|██████▎   | 470/743 [00:03<00:02, 120.92it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PDA *', 'LCX-PLB *']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    379  CTCAREA01101963  CTCAREA01101963  NORMAL  NORMAL  NORMAL       -
951 CTCAREA01101963




















 79%|███████▉  | 589/743 [00:03<00:00, 155.28it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PDA *', 'PLB *']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    494  CTCAKAG03101958  CTCAKAG03101958  NORMAL  NORMAL  NORMAL       -
567 CTCAKAG03101958




















 90%|████████▉ | 668/743 [00:03<00:00, 174.00it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PDA2 *', 'PDA1 *', 'PLB *']
hi
   index        REPORT_ID       PATIENT_ID PDA_RCA PLV_RCA PDA_LCX PLV_LCX
0    221  CTCABAT27011976  CTCABAT27011976  NORMAL  NORMAL  NORMAL       -
906 CTCABAT27011976




















 93%|█████████▎| 694/743 [00:04<00:00, 130.89it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















100%|██████████| 743/743 [00:04<00:00, 178.41it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

['PDA *', 'PLB *']


In [310]:
raw_pda, plb2, plb3

(4550, 4780, 100)

In [44]:
copy_mpr_images_per_patient(PATH_TO_DATA, PATH_TO_SAVE)

HBox(children=(IntProgress(value=0), HTML(value='')))

In [None]:
'RCA-PDA','RCA -PDA', 'RCA-PDA*', 'RCA-PDA *' 
'RCA-PDA1',                                       # 5148
'RCA-PDA2', 'RCA-PDA2 *','RCA-PDA2', 'RCA-PDA2*'

'RCA-PLB', 'RCA-PLB ', 'RCA-PLB', 'RCA -PLB*'
'RCA-PLB1 *','RCA-PLB1', 'RCA-PLB1 *',            # 3450
'RCA-PLB2', 'RCA-PLB2 *',

'LCX-PLB', 'LCX-PLB *', 'LCX-PLB1', 'LCX-PLB2',  'LCX-PLB2 *' # 1650

'LCX-PDA *', 'LCX-PDA', 'LCX-PDA2', 'LCX-PDA2 *', # 1250
#-----------------------------------------------------------------------------------------------------------------------





'PLB  *', 'PLB *', 'PLB original', 'PLB','PLB*' # 24312

'PLB1 *', 'PLB1*', 'PLB1', # 1250

'PLB2 *', 'PLB2*', 'PLB2', # 4780  

'PDA ', 'PDA original', 'PDA', 'PDA', 'PDA  *' # 4550

'PDA1 *',                                  
'PDA2', 'PDA 2', 'PDA2 original', 'PDA2 *', 'PDA2 *', 


# THRASH
'PLB3*', 'PLB3', # 100
'LCX  TO PDA *', 'LCX TO PDA',  'LCX TO PDA *', 'LCX TO PLB', 'LCX TO PDA2 *', 'LCX TO PDA*' # 1600


### All unique values
'LAD-D2',  'LAD-D1', 'LAD', 'LAD *', 'LAD-D1 *', 'LAD-D2 *', , 'LAD-D3 *', 'LIMA-LAD', 'LAD-D1 original', 'LAD original', 'LAD*', 'LAD-D1*', 'LAD-D3', 'LAD-D2*', 'LAD-D3*', 'LAD - D4 *', 'LAD-D4', 'LAD TO PDA *', '2LAD-D2', 'LAD -D1*', 'LAD -D2*', 'LAD-D3 original', 'LAD-D2 original',  'LAD -D1', 'LAD -D2', 'LAD-D1 Original', 'LAD Original', 'LAD-D4 *', 'D1*', 'LAD - D1'
'LAD *','LAD*'


'RCA', 'RCA-PDA', 'PDA *', 'PDA2 original', 'PDA original', 'PDA2 *','RCA-AM *', 'RCA *', 'RCA-AM1', 'PDA', 'PDA1 *','PDA*','PDA3 *', 'PDA2','RCA*', 'PDA2*',  'PDA1*', 'PDA1', 'PDA 2*','PDA-AM *', 'RCA -PDA*'

'RCA-AM',

'RCA-PLB', 'RCA-PLB *', 'RCA-PLB*', 'RCA -PLB*', 'RCA-PLB1', 'RCA-PLB1 *', 'RCA-PLB2', 'RCA-PLB2 *',

'RCA-PDA*', 'RCA-PDA *', 'RCA-PDA1', 'RCA-PLB1 *', 'RCA-PDA2', 'RCA-PDA2 *'

'RCA -AM*', 'RCA-AM*', 'RCA-AM2 *', 'RCA-AM1 *', 'PDA TO LAD *', 'PDA2  *', 

'RCA-PLB1 original', 'RCA-PLB2 original', 'RCA original', 'RCA-PDA original'



'LCX', 'LCX-OM', 'LCX-OM *', 'LCX *', 'LCX-OM1 *', 'OM *', 'LCX-OM3 *', 'LCX-OM2 *', 'LCX-OM original', 'LCX original', 'OM2 *', 'LCX  *', 'LCX-OM  *', 'OM', 'OM1 *', 'LCX -OM3 *', 'LCX-OM*', 'LCX*', 'LCX-OM2', 'OM2', 'LCX2 *', 'LCX - OM2 *', 'LCX - OM *', 'OM3', 'OM1', 'LCX-OM3*', 'LCX-OM3', 'LCX-OM1', 'LCX-OM2*', 'OM3 *', 'OM3*',  'OM2*', 'OM4 *','LCX -OM2 *', 'LCX -OM1 *', 'OM4','LCX-OM4 *', 'LCX-OM1*', 'OM*', 'OM original', 

'LCX  TO PDA *', 'LCX TO PLB', 'LCX TO PDA', 'LCX TO PDA *', 'LCX TO PDA2 *', 'LCX TO PDA*', 'LCX-OM4'

'LCX-PLB', 'LCX-PLB *', 'LCX-PLB1', 'LCX-PLB2',  'LCX-PLB2 *'

'LCX-PDA *', 'LCX-PDA', 'LCX-PDA2', 'LCX-PDA2 *', 



'PLB  *', 'PLB *', 'PLB2 *', 'PLB original', 'PLB2*', 'PLB1 *', 'PLB', 'PLB*', 'PLB3*', 'PLB2', 'PLB1', 'PLB3', 'PLB1*', 'RCA-PDA2*'

'RIB *', 'RIB1 *', 'RIB2 *','RIB', 'RIB*','RIB2'

'AM *', 'AM1 *', 'AM2 *', 'RCA-AM2', 'AM*', 'AM1', 'AM2', 'LB *',  'AM'


### 500 images
'Aorta-D1', 'Aorta-LCX', 'Aorta-D2',  'Aorta-OM graft*', 'Aorta-PDA graft *', 'Aorta-D1 graft *', 'Aorta-OM graft *','Aorta-LCX graft', 'Aorta-PDA graft', 'Aorta-D1 graft'

-----------------------------------------------------------------------------------------------------------------------------

### 4000 images

'78SSF', 'LOCALISER', 'Lt Septal Branch *', 'CT ARTERIES', 'Conus Artery *', 'AXIAL LUNG 2.5MM', 'LIMA-LAD graft', 'Aorta-LAD graft', 'RT Anomaly *', 'AXIAL MEDIASTINUM 2.5MM', 'Localiser', 'LIMA-LAD graft*', 'SScoreSerSav',  'PatientReport', 'ECG Report', 'Dose Report', 'ScreenSave', 

------------------------------------------------------------------------------------------------------------------------------

### 1400 images
'OM-Branch of OM *', 'LCX-Branch of OM *', 'RIB-Branch of RIB *', 'Branch of PLB *', 'Branch 2 of PLB *', 'OM-Branch of OM',  'LCX-Branch of OM', 'RIB-Branch of RIB', 'OM-Branch OM *', 'LAD-Branch of D1', 'LCX-Branch of OM1 *', '81SSF', 'PLB3 *', 'LAD-Branch of D2 *', 'Branch of AM *', 'LAD-Branch of D1 *'

------------------------------------------------------------------------------------------------------------------------------
'PDLB'

, 'PDA  *'



# 3. Extract PNG images from the MPRs

In [2]:
def map_mpr_name_to_record_name(mpr_name):
    main_branches_dict = {
        'LAD': ['LAD', 'LAD ', 'LAD Original', 'LAD original', 'LAD *', 'LAD*'],
        'D-1':['LAD-D1 original', 'LAD-D1 Original', 'LAD-D1', 'LAD-D1 *', 'LAD -D1', 'LAD -D1', 'LAD - D1', 'D1'],
        'D-2':['LAD-D2', 'LAD-D2 *', 'LAD-D2', '2LAD-D2', 'LAD -D2', 'LAD-D2 original', 'LAD -D2'],
        'D-3': ['LAD-D3', 'LAD-D3 *', 'LAD-D3', 'LAD-D3 original', ],
        'D-4': [ 'LAD - D4 *', 'LAD-D4', 'LAD-D4 *'],
        'RCA': ['RCA', 'RCA *', 'RCA*', 'RCA original'],
        'OM':['OM*', 'LCX-OM  *', 'OM *', 'OM', 'LCX-OM*', 'LCX - OM *', 'LCX-OM original', 'LCX-OM *', 'LCX-OM', 'OM original'],
        'OM-1': ['LCX-OM1 *', 'OM1 *', 'OM1', 'LCX-OM1', 'LCX -OM1 *', 'LCX-OM1*'],
        'OM-2': ['LCX-OM2 *', 'OM2 *', 'LCX-OM2', 'LCX - OM2 *', 'LCX -OM2 *', 'OM2*', 'LCX-OM2*'],
        'OM-3': ['LCX-OM3 *', 'LCX -OM3 *', 'OM3',  'LCX-OM3*', 'LCX-OM3', 'OM3 *', 'OM3*'],
        'OM-4': ['OM4 *', 'OM4', 'LCX-OM4 *'],
        'LCX': ['LCX', 'LCX *', 'LCX original', 'LCX  *', 'LCX*'],
        'PDA_RCA': ['RCA-PDA','RCA -PDA', 'RCA-PDA*', 'RCA-PDA *', 'RCA-PDA1','RCA-PDA2', 'RCA-PDA2 *','RCA-PDA2', 
                    'RCA-PDA2*'],
        'PLV_RCA': ['RCA-PLB', 'RCA-PLB ', 'RCA-PLB', 'RCA -PLB*', 'RCA-PLB1 *','RCA-PLB1', 'RCA-PLB1 *','RCA-PLB2', 
                    'RCA-PLB2 *'],
        'PDA_LCX': ['LCX-PDA *', 'LCX-PDA', 'LCX-PDA2', 'LCX-PDA2 *'],
        'PLV_LCX': ['LCX-PLB', 'LCX-PLB *', 'LCX-PLB1', 'LCX-PLB2',  'LCX-PLB2 *'],
        'THRASH': ['PLB  *', 'PLB *', 'PLB original', 'PLB','PLB*','PLB1 *', 'PLB1*', 'PLB1','PLB2 *', 'PLB2*', 'PLB2']
    }
    
    for key in main_branches_dict:
        if mpr_name in main_branches_dict[key]:
            return key

def split_mpr_name(mpr_name):
    return \
        "".join(mpr_name.split()).replace('*', '').replace('original', '') \
        .replace('LIMA-', '').replace('Branchof','').replace('TOPDA', '').replace('PDATO', '')

def get_patient_dictionary(path_to_patient_folder):
    """
    
    Returns dict of different types of images in the folder of patient. 
    
    Returns:
        dict: key - type of images; value - list of DICOM files, which sorted in the ascending order with restepct to the
                    depth of the image slice.
    """
    patient_dict = {}
    
    dicom_file_names = os.listdir(path_to_patient_folder)
    
    for i in range(len(dicom_file_names)):
        cur_dicom_obj = dicom.dcmread(os.path.join(path_to_patient_folder, dicom_file_names[i]))
        
        if cur_dicom_obj.SeriesDescription not in patient_dict.keys():
            patient_dict[cur_dicom_obj.SeriesDescription] = []
        patient_dict[cur_dicom_obj.SeriesDescription].append(cur_dicom_obj)
        
    # sort each type of images with respect to their depth in ascending order
    for i in patient_dict:
#         if len(patient_dict[i]) < 50:
#             print(patient_dict[i])
        patient_dict[i].sort(key=lambda x: x.InstanceNumber)
    
    return patient_dict

def get_pixels_hu(list_of_imgs):
    """
    Convert stack of the images into Houndsfeld units
    """
    image = np.stack([s.pixel_array for s in list_of_imgs])
    # Convert to int16 (from sometimes int16), 
    # should be possible as values should always be low enough (<32k)
    image = image.astype(np.int16)

    # Set outside-of-scan pixels to 1
    # The intercept is usually -1024, so air is approximately 0
    image[image == -2000] = 0
    
    # Convert to Hounsfield units (HU)
    intercept = list_of_imgs[0].RescaleIntercept
    slope = list_of_imgs[0].RescaleSlope
    
    if slope != 1:
        image = slope * image.astype(np.float64)
        image = image.astype(np.int16)
        
    image += np.int16(intercept)
    
    return np.array(image, dtype=np.int16)

def remove_text(img):
    mask = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY)[1][:,:,0]
    dilated_mask = cv2.dilate(mask, cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3)))
    dst = cv2.inpaint(img, dilated_mask, 5, cv2.INPAINT_NS)
    return dst

def save_img(img, img_name):
    with h5py.File(img_name, 'w') as hf: 
        Xset = hf.create_dataset \
                (
                    name='X',
                    data=img,
                    shape=(img.shape[0], img.shape[1]),
                    maxshape=(img.shape[0], img.shape[1]),
                    compression="lzf",
                )
        

In [3]:
# path_to_data = r'D:\coronaryProject\dataset\binary_classification_MPR\images'
# path_to_new_data = r'E:\ONLY_LAD\\'
path_to_data = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/test_only_MPR'
path_to_new_data = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/extended_test_h5'
list_of_patients = os.listdir(path_to_data)
# patient_dictionary = get_patient_dictionary(path_to_data + '\\'+ list_of_patients[0])

In [4]:
df2 = pd.read_csv('ExtractLabels/lad_rca_lcx_all.csv')

In [47]:
list_of_patients[0].split(' ')[1] in list(df2['PATIENT_ID'])

False

In [7]:
[x for x in list_of_patients 
 if 
     x.split(' ')[1] not in list(df2['REPORT_ID']) and
     x.split(' ')[1] not in list(df2['PATIENT_ID']) and 
     x not in list(df2['REPORT_ID']) and
     x not in list(df2['PATIENT_ID'])
]
 

['9 CTCA1956', '100 CTCA1958']

In [12]:
a = os.listdir('/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv')
for i in range(len(a)):
    b = os.listdir('/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv/{}'.format(a[i]))
    if len(b) == 0:
        print(a[i])

In [10]:
for i in tqdm(range(len(list_of_patients))):
    if list_of_patients[i] == '89 CTCA1958':
        continue
    patient_dictionary = get_patient_dictionary(path_to_data + '/'+ list_of_patients[i])
    splited_mpr_names_filtered = [map_mpr_name_to_record_name(x) for x in patient_dictionary.keys()]
    dict_keys = list(patient_dictionary.keys())
    
    # change keys in the dict to the corresponding labels in the reports
    for k in range(len(dict_keys)):
        if splited_mpr_names_filtered[k]:
            if dict_keys[k] in splited_mpr_names_filtered:
                pass
            else:
                patient_dictionary[splited_mpr_names_filtered[k]] = patient_dictionary[dict_keys[k]]
                del patient_dictionary[dict_keys[k]]
        else:
            del patient_dictionary[dict_keys[k]]
#     if i > 5:
#         break
    if not os.path.exists(os.path.join(path_to_new_data, list_of_patients[i])):
        os.mkdir(os.path.join(path_to_new_data, list_of_patients[i]))
        

    if list_of_patients[i] in [x for x in list_of_patients if 
             x.split(' ')[1] not in list(df2['REPORT_ID']) and
             x.split(' ')[1] not in list(df2['PATIENT_ID']) and 
             x not in list(df2['REPORT_ID']) and
             x not in list(df2['PATIENT_ID'])
            ]:
        continue

    if list_of_patients[i].split(' ')[1] in list(df2['REPORT_ID']):
        patient_row = df2[df2['REPORT_ID'] == list_of_patients[i].split(' ')[1]]
    elif  list_of_patients[i].split(' ')[1] in list(df2['PATIENT_ID']):
        patient_row = df2[df2['REPORT_ID'] == list_of_patients[i].split(' ')[1]]
    elif list_of_patients[i] in df2['REPORT_ID']:
        patient_row = df2[df2['REPORT_ID'] == list_of_patients[i]]
    else:# list_of_patients[i] in df2['PATIENT_ID']:
        patient_row = df2[df2['PATIENT_ID'] == list_of_patients[i]]
    
    if not patient_row.empty:
        if 'THRASH' in patient_dictionary.keys():
            if patient_row['PLV_RCA'].iloc[0] != '-':
                patient_dictionary['PLV_RCA'] = patient_dictionary['THRASH']
            elif patient_row['PLV_LCX'].iloc[0] != '-':
                patient_dictionary['PLV_LCX'] = patient_dictionary['THRASH']
    if 'THRASH' in patient_dictionary.keys():
        del patient_dictionary['THRASH']


    for key in patient_dictionary.keys():
        for dicom_file in patient_dictionary[key]:
            if not os.path.exists(os.path.join(path_to_new_data, list_of_patients[i])):
                os.mkdir(os.path.join(path_to_new_data, list_of_patients[i]))
            
            if not os.path.exists(os.path.join(path_to_new_data, list_of_patients[i], key)):
                os.mkdir(os.path.join(path_to_new_data, list_of_patients[i], key))
                
            cur_img = dicom_file.pixel_array
            cur_img[cur_img == -2000] = 0
            intercept = dicom_file.RescaleIntercept
            slope = dicom_file.RescaleSlope
            if slope != 1:
                cur_img = slope * cur_img.astype(np.float64)
                cur_img = cur_img.astype(np.int16)

            cur_img += np.int16(intercept)

            final_result = np.array(cur_img, dtype=np.int16)
            save_img(final_result, 
                     os.path.join(
                                    path_to_new_data, 
                                    list_of_patients[i], 
                                    key,
                                    list_of_patients[i]+'_'+str(dicom_file.InstanceNumber)+'.h5'
                                  )
                     )

#             cv2.imwrite(os.path.join(path_to_new_data, 
#                                             list_of_patients[i], 
#                                             key,
#                                             list_of_patients[i]+'_'+str(dicom_file.InstanceNumber)+'.png'
#                                            ),
#                         cv2.normalize(dicom_file.pixel_array, None, alpha = 0, 
#                                       beta = 255, norm_type = cv2.NORM_MINMAX, dtype = cv2.CV_32F)
#                        )

HBox(children=(IntProgress(value=0), HTML(value=u'')))




In [390]:
for i in tqdm(range(len(list_of_patients))):
    patient_dictionary = get_patient_dictionary(path_to_data + '/'+ list_of_patients[i])
    splited_mpr_names_filtered = [map_mpr_name_to_record_name(x) for x in patient_dictionary.keys()]
#     splited_mpr_names = [split_mpr_name(x) for x in patient_dictionary.keys()]
#     splited_mpr_names_filtered = [split_mpr_name(x).replace('LAD-', '').replace('D','D-').replace('AD-', 'AD') for x in patient_dictionary.keys() 
#                                   if 'LAD' in split_mpr_name(x)]
    dict_keys = list(patient_dictionary.keys())
    
    # change keys in the dict to the corresponding labels in the reports
    for k in range(len(dict_keys)):
#         patient_dictionary[split_mpr_name(key_element).replace('LAD-', '').replace('D','D-').replace('AD-', 'AD')] = \
#             patient_dictionary[key_element]
        if splited_mpr_names_filtered[k]:
            if dict_keys[k] in splited_mpr_names_filtered:
                pass
            else:
                patient_dictionary[splited_mpr_names_filtered[k]] = patient_dictionary[dict_keys[k]]
                del patient_dictionary[dict_keys[k]]
        else:
            del patient_dictionary[dict_keys[k]]

    if not os.path.exists(os.path.join(path_to_new_data, list_of_patients[i])):
        os.mkdir(os.path.join(path_to_new_data, list_of_patients[i]))

    for key in patient_dictionary.keys():
        
        for dicom_file in patient_dictionary[key]:
            if not os.path.exists(os.path.join(path_to_new_data, list_of_patients[i])):
                os.mkdir(os.path.join(path_to_new_data, list_of_patients[i]))
            
            if not os.path.exists(os.path.join(path_to_new_data, list_of_patients[i], key)):
                os.mkdir(os.path.join(path_to_new_data, list_of_patients[i], key))
# #             dicom_file.save_as(os.path.join(path_to_new_data, 
# #                                             list_of_patients[i], 
# # #                                             key,
# # #                                             list_of_patients[i]+'_'+str(dicom_file.InstanceNumber)
# # #                                            )
# # #                               )
            cv2.imwrite(os.path.join(path_to_new_data, 
                                            list_of_patients[i], 
                                            key,
                                            list_of_patients[i]+'_'+str(dicom_file.InstanceNumber)+'.png'
                                           ),
                        cv2.normalize(dicom_file.pixel_array, None, alpha = 0, 
                                      beta = 255, norm_type = cv2.NORM_MINMAX, dtype = cv2.CV_32F)
                       )




    



















  0%|          | 0/743 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

FileNotFoundError: [Errno 2] No such file or directory: '/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches/multibranch_dataset/346 CTCASHA02091971'

In [None]:
def copy_mpr_records(path_to_dataset, path_to_save):
    """
    Copy all records from the dataset to path_to_save folder.
    """
    if not os.path.exists(path_to_save):
        os.mkdir(path_to_save)
    folders = [x for x in os.listdir(path_to_dataset) if 'WITH RECONS ' in x]

    for patient_name in os.listdir(os.path.join(path_to_dataset)):
        files = os.listdir(os.path.join(path_to_dataset, patient_name))
        files = [x for x in files if ('xlsx' in x) or ('doc' in x)]
        files = files[0] if len(files)>0 else None
        if files:
            copyfile(os.path.join(path_to_dataset, patient_name, files), os.path.join(path_to_save, files))


In [None]:
PATH_TO_DATA = r'd:\test\without_viewer'
PATH_TO_SAVE = r'd:\test\records'

In [None]:
copy_mpr_records(PATH_TO_DATA, PATH_TO_SAVE)

# 5. Merge Reports

In [None]:
def strip_mpr_lad_name(mpr_name):
    """
    Strip MPR name of the LAD artery. We do this step because the name in the doctor's report 
    is not equal to the name in the MPR. 
    
    Returns:
        - str: striped string
    """
    return "".join(mpr_name.split()).replace('*', '').replace('original', '')

def read_and_strip_record(path_to_record):
    '''
    Read record file and remove empty rows and rows with all NaNs.
    
    Returns:
        - Pandas DataFrame: 
    '''
    excel_file = pd.read_excel(path_to_record,index_col=None, header=None)
    excel_file.dropna(how='all')
    excel_file.rename(columns={0: 'a', 1: 'b'}, inplace=True)
    excel_file = excel_file.fillna('  ')
    excel_file = excel_file.replace('', '  ', regex=True)
    excel_file = excel_file.drop(excel_file[excel_file['a'].str.isspace()].index)
    return excel_file

def get_lad_info_from_report(striped_record, artery_type):
    """
    Takes striped(without any empty lines and NaNs) and returns info only about the certain artery type. 
    
    Returns:
        - list: each element is the string with some info about certain artery type
    """
    lad_info = []
    wether_add = False
    lad_info.append(striped_record.iloc[0]['b'])
    for ind, row_value in striped_record.iterrows():
        
        if wether_add and row_value['a'].isupper():
            break
        if wether_add:
            lad_info.append(row_value['a'])
        
        if artery_type in row_value['a']:
            wether_add = True
    return lad_info

def get_level_of_stenosis_from_string(artery_info):
    """
    Returns:
        - list of str: each element is the string with percentage of stenosis. 
    """
    return [x.strip() for x in re.findall(r'.\d{1,3}.?\d{1,3}\%', artery_info)]

In [None]:
path_to_records = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/reports'
list_of_files = os.listdir(path_to_records)

In [None]:
extracted_lad_df = pd.DataFrame(columns=['PATIENT_ID','PROXIMAL', 'MID', 'DISTAL', 'D-1', 'D-2', 'D-3', 'D-4'])

for i in tqdm(range(len(list_of_files))):
    cur_file = read_and_strip_record(os.path.join(path_to_records, list_of_files[i]))
    cur_patient_lad_info = get_lad_info_from_report(cur_file, 'LEFT ANTERIOR')
    
    new_row = pd.Series(#[Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan],
                        ['-','-','-','-','-','-','-','-'],
                        index=extracted_lad_df.columns)
    new_row['PATIENT_ID'] = cur_patient_lad_info[0]
    cur_patient_lad_info.pop(0)
    list_of_lda_branches = list(extracted_lad_df.columns)
    
    for line_info in cur_patient_lad_info:
        
        artery_area_name = [x for x in list_of_lda_branches 
                            if x in line_info or x.lower() in line_info or x.title() in line_info]
        if len(artery_area_name) >=1:
            artery_area_name = artery_area_name[0]
        else:
            continue
        stenosis_score = get_level_of_stenosis_from_string(line_info)
        stenosis_score =  stenosis_score[0] if stenosis_score else 'NORMAL'
        new_row.loc[artery_area_name] = stenosis_score
    extracted_lad_df = extracted_lad_df.append(new_row, ignore_index=True)

In [None]:
extracted_lad_df.head()

In [None]:
extracted_lad_df.to_excel('lad_reports.xlsx', index=False)

# 6. Remove text from the images

In [17]:
def remove_text(img):
    mask = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY)[1][:,:,0]
    dilated_mask = cv2.dilate(mask, cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3)))
    dst = cv2.inpaint(img, dilated_mask, 5, cv2.INPAINT_NS)
    return dst


In [None]:
PATH_TO_DATA = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv'
patients_names = os.listdir(PATH_TO_DATA)

for patient_name in tqdm(patients_names):
    images_types = os.listdir(os.path.join(PATH_TO_DATA, patient_name))
    
    for img_type in images_types:
        images_names = os.listdir(os.path.join(PATH_TO_DATA, patient_name, img_type))
        filtered_imgs = [x for x in images_names if '_text_deleted' in x]
        if len(filtered_imgs) > 0:
            continue
            
        for img_name in images_names:
            img_path = os.path.join(PATH_TO_DATA, patient_name, img_type, img_name)
            cur_img = cv2.imread(img_path)
            img_without_text = remove_text(cur_img)
            cv2.imwrite(img_path, img_without_text)
#             cv2.imwrite(os.path.join(PATH_TO_DATA, patient_name, img_type, img_name.split('.')[0] + '_text_deleted.png'), img_without_text)





  0%|          | 0/743 [00:00<?, ?it/s][A[A[A


  0%|          | 1/743 [00:11<2:24:32, 11.69s/it][A[A[A


  0%|          | 2/743 [00:21<2:17:06, 11.10s/it][A[A[A


  0%|          | 3/743 [00:29<2:06:32, 10.26s/it][A[A[A


  1%|          | 4/743 [00:40<2:09:15, 10.49s/it][A[A[A


  1%|          | 5/743 [00:45<1:48:33,  8.83s/it][A[A[A


  1%|          | 7/743 [00:55<1:33:17,  7.60s/it][A[A[A


  1%|          | 8/743 [01:03<1:35:49,  7.82s/it][A[A[A


  1%|          | 9/743 [01:10<1:30:49,  7.42s/it][A[A[A


  1%|▏         | 10/743 [01:16<1:27:29,  7.16s/it][A[A[A


  1%|▏         | 11/743 [01:27<1:42:08,  8.37s/it][A[A[A


  2%|▏         | 12/743 [01:34<1:35:34,  7.85s/it][A[A[A


  2%|▏         | 13/743 [01:39<1:24:08,  6.92s/it][A[A[A


  2%|▏         | 14/743 [01:47<1:28:16,  7.26s/it][A[A[A


  2%|▏         | 15/743 [01:59<1:47:36,  8.87s/it][A[A[A


  2%|▏         | 16/743 [02:08<1:44:59,  8.67s/it][A[A[A


  2%|▏         | 17/743 

 36%|███▌      | 265/743 [41:41<1:27:09, 10.94s/it][A[A[A


 36%|███▌      | 266/743 [41:52<1:29:03, 11.20s/it][A[A[A


 36%|███▌      | 267/743 [42:01<1:23:08, 10.48s/it][A[A[A


 36%|███▌      | 268/743 [42:10<1:18:58,  9.98s/it][A[A[A


 36%|███▌      | 269/743 [42:21<1:20:01, 10.13s/it][A[A[A


 36%|███▋      | 270/743 [42:37<1:35:17, 12.09s/it][A[A[A


 36%|███▋      | 271/743 [42:47<1:30:47, 11.54s/it][A[A[A


 37%|███▋      | 272/743 [42:58<1:28:11, 11.23s/it][A[A[A


 37%|███▋      | 273/743 [43:10<1:29:46, 11.46s/it][A[A[A


 37%|███▋      | 274/743 [43:20<1:26:53, 11.12s/it][A[A[A


 37%|███▋      | 275/743 [43:32<1:28:23, 11.33s/it][A[A[A


 37%|███▋      | 276/743 [43:42<1:25:34, 11.00s/it][A[A[A


 37%|███▋      | 277/743 [43:54<1:27:13, 11.23s/it][A[A[A


 37%|███▋      | 278/743 [43:59<1:13:02,  9.42s/it][A[A[A


 38%|███▊      | 279/743 [44:06<1:07:07,  8.68s/it][A[A[A


 38%|███▊      | 280/743 [44:13<1:03:15,  8.20s/it][A

 71%|███████   | 525/743 [1:23:19<26:50,  7.39s/it][A[A[A


 71%|███████   | 526/743 [1:23:30<31:07,  8.60s/it][A[A[A


 71%|███████   | 527/743 [1:23:40<32:11,  8.94s/it][A[A[A


 71%|███████   | 528/743 [1:23:46<29:24,  8.21s/it][A[A[A


 71%|███████   | 529/743 [1:23:53<27:27,  7.70s/it][A[A[A


 71%|███████▏  | 530/743 [1:24:04<30:51,  8.69s/it][A[A[A


 71%|███████▏  | 531/743 [1:24:13<31:52,  9.02s/it][A[A[A


 72%|███████▏  | 532/743 [1:24:23<32:23,  9.21s/it][A[A[A


 72%|███████▏  | 533/743 [1:24:34<34:19,  9.81s/it][A[A[A


 72%|███████▏  | 534/743 [1:24:45<34:41,  9.96s/it][A[A[A


 72%|███████▏  | 535/743 [1:24:55<35:13, 10.16s/it][A[A[A


 72%|███████▏  | 536/743 [1:25:06<36:05, 10.46s/it][A[A[A


 72%|███████▏  | 537/743 [1:25:21<39:59, 11.65s/it][A[A[A


 72%|███████▏  | 538/743 [1:25:31<37:52, 11.09s/it][A[A[A


 73%|███████▎  | 539/743 [1:25:39<34:35, 10.17s/it][A[A[A


 73%|███████▎  | 540/743 [1:25:45<30:34,  9.04s/it][A

# 7. Create binary dataset

In [7]:
def get_label_for_patient_lad(lad_segment, reports, patient_id):
    """
    
    
    Example:
        get_label_for_patient('LAD', reports, 'DDJ261Z' )

    """
    classes_to_positive = ['<25%', 'NORMAL', '-']

    if lad_segment == 'LAD':
        curr_section_label = reports.loc[reports['PATIENT_ID'] == patient_id][['MID', 'PROXIMAL', 'DISTAL']].iloc[0]
        stenosis_score = [x for x in curr_section_label if x not in classes_to_positive]
        label = 1 if len(stenosis_score) > 0 else 0
        return label, '___'.join(curr_section_label.values)
        
    else:
        curr_section_label =  reports.loc[reports['PATIENT_ID'] == patient_id][lad_segment].iloc[0]
        label = 0 if curr_section_label in classes_to_positive else 1    
        return label, curr_section_label

def get_label_for_patient(artery_segment, reports, patient_id):
    """
    
    
    Example:
        get_label_for_patient('LAD', reports, 'DDJ261Z' )

    """    
    classes_to_positive = ['<25%', 'NORMAL', '-']
    curr_section_label = reports.loc[reports['PATIENT_ID'] == patient_id][artery_segment].iloc[0]

    if ']' in curr_section_label:
        curr_section_label = '_'.join(literal_eval(curr_section_label))

    label = 0 if curr_section_label in classes_to_positive else 1    
    return label, curr_section_label


    
def get_labels(path_to_patient, reports):
    """
    Returns:
        - dict: key(str) - type of the artery, value(int) - label(0 or 1)
    """
    print(path_to_patient)
    types_of_images = os.listdir(path_to_patient)
    types_of_images = [x for x in types_of_images if x in list(reports.columns)]
    
    # Check wether patient contains any artery images
    if not types_of_images:
        return None
    
    patient_name = [x for x in path_to_patient.split('/') if len(x) > 0][-1]
    if patient_name not in list(reports['PATIENT_ID']):
        print(patient_name, patient_name in list(reports['REPORT_ID']))
        if patient_name in list(reports['REPORT_ID']):
            patient_name = list(reports['PATIENT_ID'])[list(reports['REPORT_ID']).index(patient_name)]
        elif patient_name.split('_')[1] in list(reports['REPORT_ID']):
            patient_name = list(reports['PATIENT_ID'])[list(reports['REPORT_ID']).index(patient_name.split('_')[1])]

        else:
            patient_name = patient_name.split('_')[1]
    labels_dict = {}
    
    for i in range(len(types_of_images)):
        labels_dict[types_of_images[i]] = get_label_for_patient(types_of_images[i], reports, patient_name)
    
    def filter_function(stenosis_score_list):
        if stenosis_score_list in ['-','-_-','-_-_-']:
            return False
        else:
            return True
            
    return dict(filter(lambda elem: filter_function(elem[1][1]), labels_dict.items()))


def get_imgs_names_to_the_labels(path_to_patient, labels_dict):
    """
    Args:
        - dict: key - branch of artery(str), value - label(int) 0 or 1.   
    Returns:
        - dict: key - label(int) 0 or 1, value(list) list of images, belong to the labelT
    """
    img_labels = {}
    for key in labels_dict.keys():
        list_of_images = os.listdir(os.path.join(path_to_patient, key))
        
        if labels_dict[key] in img_labels:
            new_key = (key, labels_dict[key][1], labels_dict[key][0])
#             img_labels[labels_dict[key]]+= list_of_images
            img_labels[new_key] += list_of_images
        else:
            new_key = (key, labels_dict[key][1], labels_dict[key][0])
            img_labels[new_key] = list_of_images

#             img_labels[labels_dict[key]] = list_of_images
            
    return img_labels


In [225]:
get_labels('/home/petryshak/CoronaryArteryPlaqueIdentification/data/binary_classification_all_branches/train/imgs/CTCASCK30061977',
           reports)

{'D-1': (0, 'NORMAL'), 'LCX': (1, 'NORMAL_NORMAL')}

### A) rename name of the patient's folders

In [7]:
# PATH_TO_MERGED_CSV = '/home/petryshak/CoronaryArteryPlaqueIdentification/notebooks/ExtractLabels/lad_rca_lcx.xlsx'

PATH_TO_RENAME = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/extended_test_h5'


In [8]:
file_names = os.listdir(PATH_TO_RENAME)


In [11]:
old_names = []
new_names = []
new_values = []
exceptions_which_should_be_added = \
    [
     '348 CTCAGRH27071943',
     '349 CTCANGM17081945',
     '350 CTCATRH10061944',
     '351 CTCAGRH27071943',
     '353 CTCANGM17081945',
     '371 CTCATRH10061944'
    ]

# for val and train
# for i in range(len(file_names)):
#     if file_names[i] in exceptions_which_should_be_added:
#         continue
    
#     splited_value = file_names[i].split(' ')
#     if len(splited_value[0]) == 4 or file_names[i] in exceptions_which_should_be_added:
#         new_values.append(file_names[i])
#     else:
#         os.rename(os.path.join(PATH_TO_RENAME, file_names[i]), os.path.join(PATH_TO_RENAME, splited_value[1]))
#         old_names.append(os.path.join(PATH_TO_RENAME, file_names[i]))
#         new_names.append(os.path.join(PATH_TO_RENAME, splited_value[1]))
#         new_values.append(splited_value[1])
        
        
# for test
for i in range(len(file_names)):
    if file_names[i] in exceptions_which_should_be_added:
        continue
    
    splited_value = file_names[i].split(' ')
    if len(splited_value[0]) == 4 or file_names[i] in exceptions_which_should_be_added:
        new_values.append(file_names[i])
    else:
        os.rename(os.path.join(PATH_TO_RENAME, file_names[i]), os.path.join(PATH_TO_RENAME, '_'.join(splited_value)))
        old_names.append(os.path.join(PATH_TO_RENAME, file_names[i]))
        new_names.append(os.path.join(PATH_TO_RENAME, splited_value[1]))
        new_values.append(splited_value[1])

### B) Rename all images per patient

In [12]:
PATH_TO_DATA = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/extended_test_h5'

patients_names = os.listdir(PATH_TO_DATA)

for patient_name in tqdm(patients_names):
    images_types = os.listdir(os.path.join(PATH_TO_DATA, patient_name))
    
    for img_type in images_types:
        images_names = os.listdir(os.path.join(PATH_TO_DATA, patient_name, img_type))
        for img_name in images_names:
            os.rename(os.path.join(PATH_TO_DATA, patient_name, img_type, img_name),
                      os.path.join(PATH_TO_DATA, patient_name, img_type, img_type+'_'+'_'.join(img_name.split(' '))))

HBox(children=(IntProgress(value=0, max=99), HTML(value=u'')))




### C) Train/Val/Split

In [7]:
def move_dataset_part(dataset_part_name, patients_list, path_to_data , path_to_move):
    """
    Takes patient list and move to new subdataset.
    
    Args:
        - dataset_part_name(str): name of the new subpart of the dataset
        - patients_list: list of the patient, which sould be moved
        - path_to_data: dataset, from which we take the patient folders
        - path_to_move: path, where create new dataset
    Returns:
        - None
    """
    if not os.path.exists(os.path.join(path_to_move, dataset_part_name)):
        os.mkdir(os.path.join(path_to_move, dataset_part_name))
    
    for i in range(len(patients_list)):
        shutil.move(os.path.join(path_to_data, patients_list[i]), os.path.join(path_to_move, dataset_part_name, patients_list[i]))

349 CTCANGM17081945 - deleted because it empty

In [8]:
test_part = [
    'CTCAZHX30011957',
    'CTCAYOG08091955',
    'CTCAYOA13121966',
    'CTCATHJ17011957',
    'CTCAPHD16081938',
    'CTCASIP30041975',
    'CTCASTW15121946',
    'CTCATKR01031953',
    'CTCAVAH09071948',
    'CTCAZDV13081958',
]

val_part = [
    '1006 CTCA1961',
    '1007 CTCA1959',
    '1009 CTCA1955',
    '1027 CTCA1965',
    'CTCAAGK05031979',
    'CTCAANM18021961',
    '1001 CTCA1947',
    '1002 CTCA1955',
    '1060 CTCA1959',
    '1038 CTCA1979',
    '1045 CTCA1950',  
    'CTCAWUK05041963',
    'CTCASTR17021954',
    'CTCASTS01111969',
    'CTCATUQ02091955',
]


In [9]:
PATH_TO_DATA = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5'
PATH_TO_MOVE = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5'
DATASET_PART_NAME = 'test'
move_dataset_part(DATASET_PART_NAME, test_part, PATH_TO_DATA, PATH_TO_MOVE)

In [10]:
PATH_TO_DATA = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5'
PATH_TO_MOVE = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5'
DATASET_PART_NAME = 'val'
move_dataset_part(DATASET_PART_NAME, val_part, PATH_TO_DATA, PATH_TO_MOVE)

### D) Create final dataset


In [2]:
train_files = os.listdir('/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/train')
val_files = os.listdir('/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val')
test_files = os.listdir('/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/test')

In [3]:
# reports = pd.read_excel('ExtractLabels/lad_rca_lcx.xlsx')
reports = pd.read_csv('ExtractLabels/lad_rca_lcx_all.csv')

In [4]:
reports.tail()

Unnamed: 0.1,Unnamed: 0,REPORT_ID,PATIENT_ID,LAD,D-1,D-2,D-3,RCA,PDA_RCA,PLV_RCA,LCX,OM,OM-1,OM-2,OM-3,PDA_LCX,PLV_LCX
839,739,CTCALAN04091968,CTCALAN04091968,"['NORMAL', 'NORMAL', 'NORMAL']",-,-,-,"['NORMAL', 'NORMAL', 'NORMAL']",NORMAL,NORMAL,"['NORMAL', 'NORMAL']",-,NORMAL,-,-,-,-
840,740,1058 CTCA1956,1058 CTCA1956,"['25-50%', '50-70%', 'NORMAL']",NORMAL,-,-,"['NORMAL', 'NORMAL', 'NORMAL']",NORMAL,-,"['NORMAL', 'NORMAL']",-,NORMAL,-,-,-,NORMAL
841,741,CTCABOJ29051945,CTCABOJ29051945,"['NORMAL', 'NORMAL', 'NORMAL']",NORMAL,NORMAL,-,"['NORMAL', 'NORMAL', 'NORMAL']",NORMAL,NORMAL,"['NORMAL', 'NORMAL']",-,NORMAL,-,-,-,-
842,742,CTCAGRC24011965,CTCAGRC24011965,"['NORMAL', 'NORMAL', 'NORMAL']",NORMAL,NORMAL,NORMAL,"['NORMAL', 'NORMAL', 'NORMAL']",NORMAL,NORMAL,"['NORMAL', 'NORMAL']",-,NORMAL,-,-,-,-
843,743,CTCAGAR01101953,CTCAGAR01101953,"['25%', 'NORMAL', 'NORMAL']",NORMAL,-,-,"['NORMAL', 'NORMAL', 'NORMAL']",NORMAL,NORMAL,"['NORMAL', 'NORMAL']",-,NORMAL,NORMAL,NORMAL,-,-


In [11]:
PATH_TO_DATA = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs'


In [12]:
len(list_of_patients), reports.shape

(109, (844, 17))

In [13]:
list_of_patients = os.listdir(PATH_TO_DATA)
# labels_csv = pd.DataFrame(columns=['PATIENT_NAME', 'IMG_NAME', 'STENOSIS_SCORE', 'ARTERY_SECTION', 'MPR_VIEWPOINT_INDEX', 'LABEL'])
labels_csv = pd.DataFrame(columns=['IMG_PATH', 'STENOSIS_SCORE', 'ARTERY_SECTION', 'MPR_VIEWPOINT_INDEX', 'LABEL'])

for i in tqdm(range(len(list_of_patients))):

    labels = get_labels(os.path.join(PATH_TO_DATA, list_of_patients[i]), reports)
    
    if not labels:
        continue
    dict_labels_images = get_imgs_names_to_the_labels(os.path.join(PATH_TO_DATA, list_of_patients[i]), labels)
    for key in dict_labels_images:
        each_ind = 0
        for j in range(len(dict_labels_images[key])):
            mpr_veiwpoint_index = dict_labels_images[key][j].strip('.h5').split('_')[-1]
                                        
            new_row = pd.Series(
                                [
                                    os.path.join('imgs/', list_of_patients[i], key[0], dict_labels_images[key][j]),
                                    key[1].split('_'),
                                    key[0],
                                    mpr_veiwpoint_index,
                                    key[2]
                                ],
                                index=labels_csv.columns)
#             new_row = pd.Series(
#                                 [
#                                     list_of_patients[i],
#                                     dict_labels_images[key][j],
#                                     key[1],
#                                     key[0],
#                                     mpr_veiwpoint_index,
#                                     key[2]
#                                 ],
#                                 index=labels_csv.columns)
            labels_csv = labels_csv.append(new_row, ignore_index=True)
#             each_ind+=1            




HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs/1038 CTCA1979
/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs/CTCATUQ02091955
/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs/CTCASTR17021954
/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs/1060 CTCA1959
/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs/1045 CTCA1950
/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs/CTCAANM18021961
/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs/1002 CTCA1955
/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs/1009 CTCA1955
/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv_h5/val/imgs/CTCAWUK05041963
/home/petryshak/Coro

In [113]:
labels_csv.tail()

Unnamed: 0,IMG_PATH,STENOSIS_SCORE,ARTERY_SECTION,MPR_VIEWPOINT_INDEX,LABEL
2495,imgs/CTCAZDV13081958/OM-2/OM-2_901_CTCAZDV1308...,[NORMAL],OM-2,24,0
2496,imgs/CTCAZDV13081958/OM-2/OM-2_901_CTCAZDV1308...,[NORMAL],OM-2,43,0
2497,imgs/CTCAZDV13081958/OM-2/OM-2_901_CTCAZDV1308...,[NORMAL],OM-2,34,0
2498,imgs/CTCAZDV13081958/OM-2/OM-2_901_CTCAZDV1308...,[NORMAL],OM-2,45,0
2499,imgs/CTCAZDV13081958/OM-2/OM-2_901_CTCAZDV1308...,[NORMAL],OM-2,46,0


In [16]:
labels_csv.head()

Unnamed: 0,IMG_PATH,STENOSIS_SCORE,ARTERY_SECTION,MPR_VIEWPOINT_INDEX,LABEL
0,imgs/1038 CTCA1979/LAD/LAD_1038_CTCA1979_12.h5,"[NORMAL, 50-70%, NORMAL]",LAD,12,1
1,imgs/1038 CTCA1979/LAD/LAD_1038_CTCA1979_32.h5,"[NORMAL, 50-70%, NORMAL]",LAD,32,1
2,imgs/1038 CTCA1979/LAD/LAD_1038_CTCA1979_14.h5,"[NORMAL, 50-70%, NORMAL]",LAD,14,1
3,imgs/1038 CTCA1979/LAD/LAD_1038_CTCA1979_22.h5,"[NORMAL, 50-70%, NORMAL]",LAD,22,1
4,imgs/1038 CTCA1979/LAD/LAD_1038_CTCA1979_18.h5,"[NORMAL, 50-70%, NORMAL]",LAD,18,1


In [14]:
labels_csv['MPR_VIEWPOINT_INDEX'] = labels_csv['MPR_VIEWPOINT_INDEX'].apply(lambda x: x.strip('.h5'))
labels_csv.to_csv('labels.csv', index=False)

In [356]:
from ast import literal_eval
df1 = pd.read_excel('val_labels_new.xlsx')['STENOSIS_SCORE'].apply(literal_eval)

FileNotFoundError: [Errno 2] No such file or directory: 'val_labels_new.xlsx'

In [None]:
labels_csv['STENOSIS_SCORE'].shape[0] / 3

In [None]:
pd.value_counts(labels_csv['STENOSIS_SCORE'])

In [None]:
labels_csv.head()

In [58]:
fif.append('CTCAANM20091965')

In [288]:
path_to_data = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/binary_classification_all_branches/train/imgs'
list_of_patients = os.listdir(path_to_data)
fif = []
for i in range(len(list_of_patients)):
    if list_of_patients[i] not in list(reports['PATIENT_ID']):
        fif.append(list_of_patients[i])
        print(list_of_patients[i])

349 CTCANGM17081945
CTCASAA13011957
CTCABAM18061976
CTCASIN11111959
CTCAPUP1208191941
CTCADOP23111947
CTCAKOG14041971
CTCACOV10031966
CTCACHY14101949
CTCALAT23091942
1003 CTCA1952
371 CTCATRH10061944
1000 CTCANEJ23051970
353 CTCANGM17081945
348 CTCAGRH27071943
351 CTCAGRH27071943
CTCATRM02031956
CTCAROS28091767
CTCASTV03081936
CTCATAA17041965
350 CTCATRH10061944
CTCAESL01121969
CTCAIKE28031986
CTCAFEY20011971


In [112]:
test_set_names = os.listdir(r'D:\extended_test_set')

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\extended_test_set'

In [None]:
from ast import literal_eval

In [12]:
df = pd.read_csv('/home/petryshak/CoronaryArteryPlaqueIdentification/data/all_branches_with_pda_plv/train/labels.csv')
df['STENOSIS_SCORE'] = df['STENOSIS_SCORE'].apply(literal_eval)

In [33]:
df[df['STENOSIS_SCORE'].apply(lambda x: '-' in x and len(x)==2)]

Unnamed: 0,IMG_PATH,STENOSIS_SCORE,ARTERY_SECTION,MPR_VIEWPOINT_INDEX,LABEL
4580,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,14,1
4581,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,31,1
4582,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,30,1
4583,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,26,1
4584,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,12,1
4585,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,1,1
4586,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,2,1
4587,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,50,1
4588,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,15,1
4589,imgs/CTCASHZ19081947/LCX/LCX_689_CTCASHZ190819...,"[25%, -]",LCX,46,1


In [17]:
kik =[]
for i in range(len(df)):
    for el in df['STENOSIS_SCORE'].iloc[i]:
        kik.append(el)

In [20]:
set(kik)

{'*50%',
 '-',
 '25%',
 '25-50%',
 '50%',
 '50-70%',
 '70%',
 '70-90%',
 '90%',
 '90-100%',
 '<25%',
 '<35%',
 '<50%',
 '>50%',
 '>70%',
 '>75%',
 '>90%',
 'NORMAL'}