In [1]:
import pandas as pd
import os
from tqdm import tqdm
import re
from numpy import nan as Nan
import pydicom as dicom
import cv2
import shutil


In [2]:
reports = pd.read_excel('lad_reports.xlsx')

In [3]:
reports.head()

Unnamed: 0,PATIENT_ID,PROXIMAL,MID,DISTAL,D-1,D-2,D-3,D-4
0,1001 CTCA1947,25-50%,25-50%,25-50%,NORMAL,25-50%,-,-
1,1002 CTCA1955,NORMAL,50%,NORMAL,NORMAL,-,-,-
2,DDJ261Z,50-70%,NORMAL,NORMAL,NORMAL,NORMAL,25-50%,-
3,1004 CTCA1955,>50%,NORMAL,NORMAL,NORMAL,NORMAL,-,-
4,1005 CTCA1945,25-50%,NORMAL,NORMAL,NORMAL,-,-,-


In [4]:
reports['PATIENT_ID']

0        1001 CTCA1947
1        1002 CTCA1955
2              DDJ261Z
3        1004 CTCA1955
4        1005 CTCA1945
5         106 CTCA1961
6        1007 CTCA1959
7        1008 CTCA1953
8        1009 CTCA1955
9        1010 CTCA1952
10       1011 CTCA1966
11       1012 CTCA1962
12       1013 CTCA1968
13       1014 CTCA1950
14       1015 CTCA1967
15       1016 CTCA1962
16       1017 CTCA1965
17       1019 CTCA1960
18       1020 CTCA1963
19       1021 CTCA1957
20       1022 CTCA1970
21       1023 CTCA1960
22       1024 CTCA1947
23       1025 CTCA1973
24       1026 CTCA1953
25       1027 CTCA1965
26       1028 CTCA1964
27       1029 CTCA1946
28       1030 CTCA1942
29       1031 CTCA1967
            ...       
714    CTCAWOD25061972
715    CTCAWOI17051966
716    CTCAWOJ08061951
717    CTCAWOM06061941
718    CTCAWOT09081947
719    CTCAWUJ01041970
720    CTCAWUK05041963
721    CTCAXIP08011952
722    CTCAXUG17101957
723    CTCAXUZ07071955
724    CTCAYAH26091963
725    CTCAYAX04121967
726    CTCA

In [4]:
def get_labels(path_to_patient):
    """
    Returns:
        - dict: key(str) - type of the artery, value(int) - label(0 or 1)
    """
    types_of_images = os.listdir(path_to_patient)
    patient_name = [x for x in path_to_patient.split('/') if len(x) > 0][-1]
    
    labels_dict = {}
#     print(types_of_images)
    for i in range(len(types_of_images)):
        if 'LAD' in types_of_images[i]:
            label = 0 if 'NORMAL' in reports.loc[reports['PATIENT_ID'] == patient_name]['MID'].iloc[0] or \
               'NORMAL' in reports.loc[reports['PATIENT_ID'] == patient_name]['DISTAL'].iloc[0] or \
               'NORMAL' in reports.loc[reports['PATIENT_ID'] == patient_name]['PROXIMAL'].iloc[0] else 1
        else:
            if types_of_images[i] == 'D-1Original':
                print(patient_name)
            label = 0 if 'NORMAL' in reports.loc[reports['PATIENT_ID'] == patient_name][types_of_images[i]].iloc[0] else 1
        
        labels_dict[types_of_images[i]] = label
        
    return labels_dict

def get_imgs_names_to_the_labels(path_to_patient, labels_dict):
    """
    Args:
        - dict: key - branch of artery(str), value - label(int) 0 or 1.   
    Returns:
        - dict: key - label(int) 0 or 1, value(list) list of images, belong to the labelT
    """
    img_labels = {}

    for key in labels_dict.keys():
        list_of_images = os.listdir(os.path.join(path_to_patient, key))
        if labels_dict[key] in img_labels:
            img_labels[labels_dict[key]]+= list_of_images
        else:
            img_labels[labels_dict[key]] = list_of_images
            
    return img_labels


In [55]:
path_to_patient = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/ONLY_LAD/CTCAATC18051949'
get_labels(path_to_patient)
get_imgs_names_to_the_labels(path_to_patient ,get_labels(path_to_patient))

{0: ['D-2_377_CTCAATC18051949_7.png',
  'D-2_377_CTCAATC18051949_11.png',
  'D-2_377_CTCAATC18051949_20.png',
  'D-2_377_CTCAATC18051949_8.png',
  'D-2_377_CTCAATC18051949_42.png',
  'D-2_377_CTCAATC18051949_16.png',
  'D-2_377_CTCAATC18051949_40.png',
  'D-2_377_CTCAATC18051949_41.png',
  'D-2_377_CTCAATC18051949_6.png',
  'D-2_377_CTCAATC18051949_38.png',
  'D-2_377_CTCAATC18051949_47.png',
  'D-2_377_CTCAATC18051949_21.png',
  'D-2_377_CTCAATC18051949_4.png',
  'D-2_377_CTCAATC18051949_2.png',
  'D-2_377_CTCAATC18051949_35.png',
  'D-2_377_CTCAATC18051949_9.png',
  'D-2_377_CTCAATC18051949_24.png',
  'D-2_377_CTCAATC18051949_14.png',
  'D-2_377_CTCAATC18051949_28.png',
  'D-2_377_CTCAATC18051949_25.png',
  'D-2_377_CTCAATC18051949_12.png',
  'D-2_377_CTCAATC18051949_31.png',
  'D-2_377_CTCAATC18051949_49.png',
  'D-2_377_CTCAATC18051949_32.png',
  'D-2_377_CTCAATC18051949_19.png',
  'D-2_377_CTCAATC18051949_45.png',
  'D-2_377_CTCAATC18051949_37.png',
  'D-2_377_CTCAATC18051949_48.p

# 2 Rename all images per patient

In [5]:
PATH_TO_DATA = r'D:\images\\'
 
image_types = os.listdir(PATH_TO_DATA)

In [None]:
for img_type in image_types:
    images_names = os.listdir(os.path.join(PATH_TO_DATA, img_type))
    for img_name in images_names:
        os.rename(os.path.join(PATH_TO_DATA, img_type, img_name),
                  os.path.join(PATH_TO_DATA, img_type, img_type+'_'+'_'.join(img_name.split(' '))))

# Rename all images in all patients

In [43]:
PATH_TO_DATA = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/thrash'
patients_names = os.listdir(PATH_TO_DATA)

for patient_name in tqdm(patients_names):
    images_types = os.listdir(os.path.join(PATH_TO_DATA, patient_name))
    
    for img_type in images_types:
        images_names = os.listdir(os.path.join(PATH_TO_DATA, patient_name, img_type))
        for img_name in images_names:
            os.rename(os.path.join(PATH_TO_DATA, patient_name, img_type, img_name),
                      os.path.join(PATH_TO_DATA, patient_name, img_type, img_type+'_'+'_'.join(img_name.split(' '))))

# 3 Create final dataset and create excel file with labels per image

In [5]:
PATH_TO_DATA = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/binary_classification_only_lad/train'
list_of_patients = os.listdir(PATH_TO_DATA)

labels_csv = pd.DataFrame(columns=['PATIENT_NAME', 'IMG_NAME', 'LABEL'])

for i in tqdm(range(len(list_of_patients))):
    if i == 0:
        continue
    labels = get_labels(os.path.join(PATH_TO_DATA, list_of_patients[i]))
    dict_labels_images = get_imgs_names_to_the_labels(os.path.join(PATH_TO_DATA, list_of_patients[i]), labels)
    for key in dict_labels_images:
        each_ind = 0

        for j in range(len(dict_labels_images[key])):
            if each_ind % 3 == 0:
                new_row = pd.Series([list_of_patients[i],dict_labels_images[key][j], key],index=labels_csv.columns)
                labels_csv = labels_csv.append(new_row, ignore_index=True)
            each_ind+=1            


100%|██████████| 694/694 [00:50<00:00, 13.73it/s]


In [8]:
pd.value_counts(labels_csv['LABEL'])

1    287
0    254
Name: LABEL, dtype: int64

In [11]:
labels_csv.to_csv('train.csv')

In [9]:
labels_csv.head()

Unnamed: 0,ARTERY_SECTION,PATIENT_NAME,IMG_NAME,LABEL
0,0,CTCAYOG08091955,D-2_889_CTCAYOG08091955_13.png,0
1,0,CTCAYOG08091955,D-2_889_CTCAYOG08091955_18.png,0
2,0,CTCAYOG08091955,D-2_889_CTCAYOG08091955_8.png,0
3,0,CTCAYOG08091955,D-2_889_CTCAYOG08091955_25.png,0
4,0,CTCAYOG08091955,D-2_889_CTCAYOG08091955_29.png,0


# 1 Rename patients folders names

In [41]:
file_names = os.listdir(path_to_rename)
file_names_reports = list(reports['PATIENT_ID'])

In [42]:
path_to_rename = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/thrash'
new_values = []
exceptions_which_should_be_added = \
    [
     '348 CTCAGRH27071943',
     '349 CTCANGM17081945',
     '350 CTCATRH10061944',
     '351 CTCAGRH27071943',
     '353 CTCANGM17081945',
     '371 CTCATRH10061944'
    ]

for i in range(len(file_names)):
    if file_names[i] in exceptions_which_should_be_added:
        continue
    
    splited_value = file_names[i].split(' ')
    if len(splited_value[0]) == 4 or file_names[i] in exceptions_which_should_be_added:
        new_values.append(file_names[i])
    else:
        os.rename(os.path.join(path_to_rename, file_names[i]), os.path.join(path_to_rename, splited_value[1]))
        new_values.append(splited_value[1])

In [None]:
list_of_folders = os.listdir('E:\ONLY_LAD')

In [None]:
remove_files = []
for i in range(len(list_of_folders)):
    if list_of_folders[i] not in file_names_reports:
        remove_files.append(list_of_folders[i])
#         shutil.move(os.path.join('E:\ONLY_LAD\\',list_of_folders[i]), 
#                     os.path.join('E:\exceptions_lad\\', list_of_folders[i]))

In [10]:
for i in range(len(labels_csv)):
    lol = labels_csv['IMG_NAME'].iloc[i].split('_')[0]
    if lol not in ['LAD', 'D-1', 'D-2', 'D-3', 'D-4']:
        print(lol)

2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
2D-2
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
LADOriginal
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
D-1Original
