In [1]:
import pandas as pd
import os
from tqdm import tqdm
import re
from numpy import nan as Nan
import pydicom as dicom
import cv2
import shutil


In [None]:
def get_label_for_patient(lad_segment, reports, patient_id):
    """
    Look up the binary stenosis label of one artery segment for one patient.

    Scores '<25%', '25%', 'NORMAL' and '-' are treated as negative (label 0);
    any other value, including a missing one, gives label 1.

    Args:
        lad_segment (str): 'LAD' to combine the 'MID', 'PROXIMAL' and 'DISTAL'
            columns of `reports`, otherwise the name of a single column.
        reports (pd.DataFrame): table with a 'PATIENT_ID' column and the
            score columns mentioned above.
        patient_id (str): value matched against reports['PATIENT_ID']; the
            first matching row is used.

    Returns:
        tuple: (label, score).
            - For 'LAD': label is 1 if at least one of the three scores is not
              negative, and score is the three values joined with '___'.
            - Otherwise: label for that single column and the raw cell value.

    Raises:
        IndexError: if no row matches `patient_id`.
        KeyError: if a required column is missing from `reports`.

    Example:
        get_label_for_patient('LAD', reports, 'DDJ261Z' )

    """
    negative_scores = ('<25%', '25%', 'NORMAL', '-')
    patient_rows = reports.loc[reports['PATIENT_ID'] == patient_id]

    if lad_segment == 'LAD':
        section_scores = patient_rows[['MID', 'PROXIMAL', 'DISTAL']].iloc[0]
        has_stenosis = any(score not in negative_scores for score in section_scores)
        label = 1 if has_stenosis else 0
        # str() keeps the join from failing on non-string cells (e.g. NaN),
        # matching the single-column branch which tolerates such values.
        return label, '___'.join(str(score) for score in section_scores.values)

    segment_score = patient_rows[lad_segment].iloc[0]
    label = 0 if segment_score in negative_scores else 1
    return label, segment_score

def get_labels(path_to_patient, reports):
    """
    Collect the label of every artery folder found in one patient directory.

    Args:
        path_to_patient (str): patient directory. Its last path component is
            used as the patient id, and every entry inside it is treated as
            an artery type.
        reports: report table, passed through to get_label_for_patient.

    Returns:
        - dict: key(str) - type of the artery, value - result of
          get_label_for_patient for that artery and patient.
    """
    # basename(normpath(...)) ignores trailing separators and, unlike a manual
    # split on '/', also understands the platform's own separator.
    patient_name = os.path.basename(os.path.normpath(path_to_patient))

    return {
        image_type: get_label_for_patient(image_type, reports, patient_name)
        for image_type in os.listdir(path_to_patient)
    }

def get_imgs_names_to_the_labels(path_to_patient, labels_dict):
    """
    Group the image file names of one patient by the label of their branch.

    Args:
        - path_to_patient(str): directory with one sub-folder per artery branch.
        - labels_dict(dict): key - branch of artery(str), value - label entry
          whose element [1] is the stenosis score.
    Returns:
        - dict: key - label entry taken from labels_dict, value(list) - names
          of the images of all branches sharing that entry. Branches whose
          score is '-' or '-___-___-' are left out.
    """
    skipped_scores = ('-', '-___-___-')
    grouped = {}

    for branch, branch_label in labels_dict.items():
        branch_images = os.listdir(os.path.join(path_to_patient, branch))
        if branch_label[1] in skipped_scores:
            continue
        if branch_label not in grouped:
            grouped[branch_label] = branch_images
        else:
            grouped[branch_label] += branch_images

    return grouped




In [None]:
# Manual check of get_labels on a single training patient.
# NOTE: `reports` must already exist in the kernel; it is loaded in the
# "Read reports file" section further down.
path_to_patient = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/binary_classification_only_lad/train/CTCAZOD18071961'
get_labels(path_to_patient, reports)

In [None]:
# Manual check of the full chain on one patient: compute the per-branch labels
# with get_labels, then group the image names by those labels.
path_to_patient = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/binary_classification_only_lad/train/CTCAZSR26031950'
# `reports` must already be loaded (see the "Read reports file" section).
get_imgs_names_to_the_labels(path_to_patient ,get_labels(path_to_patient, reports))

# Read reports file

In [None]:
# Load the report table used by the labelling helpers; the path is relative
# to the notebook's working directory.
reports = pd.read_excel('lad_reports.xlsx')
# Show the last rows as a quick sanity check of the loaded table.
reports.tail()

In [None]:
# Display the report row(s) of one specific patient.
reports[reports['PATIENT_ID'] == 'CTCAYAY28011972']

# 1 Rename patient folders

In [None]:
# Folder whose patient sub-folders get renamed. It is defined here because this
# cell reads it; the following cell assigns the same value again, so on a fresh
# kernel the name would otherwise be undefined at this point.
path_to_rename = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/thrash'

# Folder names currently on disk, and the patient ids known to the reports table.
file_names = os.listdir(path_to_rename)
file_names_reports = list(reports['PATIENT_ID'])

In [None]:
path_to_rename = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/thrash'
new_values = []
# Folder names that are skipped entirely. Each patient id in this list occurs
# under two different numeric prefixes, so stripping the prefix would make the
# names collide (presumably the reason they are excluded - confirm).
exceptions_which_should_be_added = \
    [
     '348 CTCAGRH27071943',
     '349 CTCANGM17081945',
     '350 CTCATRH10061944',
     '351 CTCAGRH27071943',
     '353 CTCANGM17081945',
     '371 CTCATRH10061944'
    ]

for folder_name in file_names:
    if folder_name in exceptions_which_should_be_added:
        continue

    name_parts = folder_name.split(' ')
    # Names without a '<prefix> <patient id>' form (e.g. folders already renamed
    # by a previous run) and names whose first token has 4 characters are kept
    # as they are. The length check on name_parts avoids an IndexError below.
    if len(name_parts) < 2 or len(name_parts[0]) == 4:
        new_values.append(folder_name)
    else:
        # Drop the leading token and keep the second one as the new folder name.
        os.rename(os.path.join(path_to_rename, folder_name), os.path.join(path_to_rename, name_parts[1]))
        new_values.append(name_parts[1])

In [None]:
# Raw string: '\O' is not a valid escape sequence, so the plain literal only
# worked by accident and triggers a warning on current Python versions.
list_of_folders = os.listdir(r'E:\ONLY_LAD')

In [None]:
# Folders on disk whose name is not a PATIENT_ID of the reports table.
remove_files = [
    folder_name
    for folder_name in list_of_folders
    if folder_name not in file_names_reports
]
# Moving them out of the dataset is left disabled; per folder it would be:
#         shutil.move(os.path.join('E:\ONLY_LAD\\', folder_name),
#                     os.path.join('E:\exceptions_lad\\', folder_name))

In [None]:
# Sanity check: print the leading token of every IMG_NAME that is not one of
# the expected artery branch names. `labels_csv` must already exist in the kernel.
known_branches = ['LAD', 'D-1', 'D-2', 'D-3', 'D-4']
for img_name in labels_csv['IMG_NAME']:
    branch_prefix = img_name.split('_')[0]
    if branch_prefix in known_branches:
        continue
    print(branch_prefix)

In [None]:
# Class balance of the validation split. Series.value_counts replaces the
# deprecated top-level pd.value_counts.
pd.read_csv('val.csv')['LABEL'].value_counts()

# 2 Rename all images per patient

In [None]:
# Root folder for the single-patient rename below. This is a raw string, so it
# ends with two literal backslashes.
PATH_TO_DATA = r'D:\images\\'
 
# Every entry of the root folder is treated as an image type.
image_types = os.listdir(PATH_TO_DATA)

In [None]:
# Prefix every image with the name of its type folder and replace spaces with
# underscores, i.e. '<name with spaces>' -> '<img_type>_<name_with_underscores>'.
for img_type in image_types:
    type_dir = os.path.join(PATH_TO_DATA, img_type)
    prefix = img_type + '_'
    images_names = os.listdir(type_dir)
    for img_name in images_names:
        new_name = img_name.replace(' ', '_')
        # Re-running the cell must not add the prefix a second time.
        if not new_name.startswith(prefix):
            new_name = prefix + new_name
        if new_name != img_name:
            os.rename(os.path.join(type_dir, img_name),
                      os.path.join(type_dir, new_name))

### Rename all images in all patients

In [None]:
PATH_TO_DATA = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/thrash'
patients_names = os.listdir(PATH_TO_DATA)

# For every patient and image type: prefix each image with the name of its type
# folder and replace spaces with underscores.
for patient_name in tqdm(patients_names):
    patient_dir = os.path.join(PATH_TO_DATA, patient_name)
    images_types = os.listdir(patient_dir)

    for img_type in images_types:
        type_dir = os.path.join(patient_dir, img_type)
        prefix = img_type + '_'
        images_names = os.listdir(type_dir)
        for img_name in images_names:
            new_name = img_name.replace(' ', '_')
            # Re-running the cell must not add the prefix a second time.
            if not new_name.startswith(prefix):
                new_name = prefix + new_name
            if new_name != img_name:
                os.rename(os.path.join(type_dir, img_name),
                          os.path.join(type_dir, new_name))

# 3 Create the final dataset and a CSV file with labels per image


In [None]:
PATH_TO_DATA = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/binary_classification_only_lad/test'
list_of_patients = os.listdir(PATH_TO_DATA)

label_columns = ['PATIENT_NAME', 'IMG_NAME', 'LABEL', 'STENOSIS_SCORE']
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
label_rows = []

for i, patient in enumerate(tqdm(list_of_patients)):
    # NOTE(review): the first directory entry is skipped, as before; the reason
    # is not visible in this notebook - confirm it is intentional.
    if i == 0:
        continue
    patient_dir = os.path.join(PATH_TO_DATA, patient)
    labels = get_labels(patient_dir, reports)
    dict_labels_images = get_imgs_names_to_the_labels(patient_dir, labels)
    for key, images in dict_labels_images.items():
        # Only every third image of each label group is kept.
        for img_name in images[::3]:
            # key is the (label, stenosis score) pair the images were grouped by.
            label_rows.append([patient, img_name, key[0], key[1]])

labels_csv = pd.DataFrame(label_rows, columns=label_columns)


In [None]:
# Class balance of the generated label table. Series.value_counts replaces the
# deprecated top-level pd.value_counts.
labels_csv['LABEL'].value_counts()

In [None]:
# Write the label table to the working directory. With to_csv defaults the
# DataFrame index is written too, as an unnamed first column.
labels_csv.to_csv('test_without_25.csv')

In [None]:
# Preview the first rows of the label table.
labels_csv.head()

# 4 Create images without text

In [15]:
import matplotlib.pyplot as plt
import numpy as np

In [19]:
def remove_text(img):
    """
    Inpaint the near-saturated pixels of an image (presumably the burnt-in text).

    Steps:
        1. Binary-threshold the image at 250 and keep channel 0 of the result
           as the mask.
        2. Dilate the mask with a 3x3 elliptical structuring element.
        3. Inpaint the masked pixels (radius 5, cv2.INPAINT_NS).

    Args:
        img: image array indexed as [row, column, channel], e.g. the result
            of cv2.imread.

    Returns:
        The inpainted image returned by cv2.inpaint.
    """
    _, thresholded = cv2.threshold(img, 250, 255, cv2.THRESH_BINARY)
    mask = thresholded[:, :, 0]
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    dilated_mask = cv2.dilate(mask, kernel)
    return cv2.inpaint(img, dilated_mask, 5, cv2.INPAINT_NS)
    
# Try remove_text on a single training image.
path_to_img = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/binary_classification_only_lad/train/CTCAZON16011962/D-3/D-3_674_CTCAZON16011962_10.png'
img = cv2.imread(path_to_img)
# cv2.imread returns None instead of raising when the file is missing or cannot
# be decoded; fail here with a clear message rather than on `img.shape`.
if img is None:
    raise OSError('Could not read image (missing or unreadable): ' + path_to_img)
print(img.shape)
removed_text = remove_text(img)

# cv2.imwrite('lol.png', removed_text)


(512, 512, 3)


In [52]:
PATH_TO_DATA = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/binary_classification_only_lad/train'
patients_names = os.listdir(PATH_TO_DATA)

# For every image of every patient, write a '<name>_text_deleted.png' copy with
# the text inpainted. The check is done per image, so an interrupted run is
# completed on the next run instead of skipping the half-processed folder.
for patient_name in tqdm(patients_names):
    patient_dir = os.path.join(PATH_TO_DATA, patient_name)
    images_types = os.listdir(patient_dir)

    for img_type in images_types:
        type_dir = os.path.join(patient_dir, img_type)
        images_names = os.listdir(type_dir)
        existing_names = set(images_names)

        for img_name in images_names:
            # Outputs of a previous run are not inputs.
            if '_text_deleted' in img_name:
                continue
            # splitext strips only the extension, so names containing extra
            # dots keep their full stem and cannot collide.
            out_name = os.path.splitext(img_name)[0] + '_text_deleted.png'
            if out_name in existing_names:
                continue

            img_path = os.path.join(type_dir, img_name)
            cur_img = cv2.imread(img_path)
            # cv2.imread returns None for files it cannot decode.
            if cur_img is None:
                print('Skipping unreadable image: ' + img_path)
                continue
            img_without_text = remove_text(cur_img)
            cv2.imwrite(os.path.join(type_dir, out_name), img_without_text)



  0%|          | 0/694 [00:00<?, ?it/s][A
  0%|          | 1/694 [00:02<33:29,  2.90s/it][A
  0%|          | 2/694 [00:07<39:31,  3.43s/it][A
 91%|█████████ | 629/694 [00:07<02:35,  2.40s/it][A
100%|██████████| 694/694 [00:07<00:00, 90.48it/s][A

In [50]:
# Debug inspection: show the current value of `patient_name`.
patient_name

'CTCATHR12061965'