In [1]:
import pandas as pd
import numpy as np
import os
import shutil
from tqdm import tqdm
import pydicom as dicom

import os
import re
from numpy import nan as Nan

In [2]:
##################################### ADDITIONAL FUNCTIONS #####################################################

def strip_mpr_lad_name(mpr_name):
    """
    Normalise an MPR artery name so it can be compared with the name used in
    the doctor's report, which is written differently.

    All whitespace is removed first; then every '*' and every occurrence of
    the word 'original' is deleted, in that order.

    Returns:
        - str: the stripped name
    """
    compact_name = "".join(mpr_name.split())
    for unwanted in ('*', 'original'):
        compact_name = compact_name.replace(unwanted, '')
    return compact_name

def read_and_strip_record(path_to_record):
    '''
    Read a report spreadsheet and remove its blank rows.

    The sheet is read without a header row. Columns 0 and 1 are renamed to
    'a' and 'b', missing cells are replaced by a two-space placeholder, and
    every row whose 'a' cell is a whitespace-only string is removed. Rows
    whose 'a' cell is not a string (e.g. a number) are kept. The original
    row labels are preserved (the index is not reset).

    Args:
        path_to_record: path of the spreadsheet, passed to ``pd.read_excel``.

    Returns:
        - Pandas DataFrame: the cleaned report, with columns 'a', 'b', ...

    Raises:
        KeyError: if the sheet has no column 0 (e.g. it is completely empty).
    '''
    excel_file = pd.read_excel(path_to_record, index_col=None, header=None)
    # dropna returns a new frame, so the result has to be kept.
    excel_file = excel_file.dropna(how='all')
    excel_file = excel_file.rename(columns={0: 'a', 1: 'b'})
    excel_file = excel_file.fillna('  ')
    # Intended to give empty-string cells the same placeholder as missing cells.
    excel_file = excel_file.replace('', '  ', regex=True)
    # Test each cell directly instead of using `.str.isspace()`: the accessor
    # yields NaN for non-string cells, and a mask containing NaN cannot be used
    # for boolean indexing.
    is_blank = excel_file['a'].map(
        lambda cell: isinstance(cell, str) and cell.isspace()
    ).astype(bool)
    return excel_file.loc[~is_blank]

def get_lad_info_from_report(striped_record, artery_type):
    """
    Extract the lines of a stripped report (no empty lines, no NaNs) that
    belong to one artery section.

    The first element of the result is always the 'b' cell of the first row.
    Collection starts on the row after the first 'a' cell containing
    `artery_type` and stops at the next 'a' cell for which `str.isupper()`
    is true.

    NOTE(review): `str.isupper()` is also true for a line such as
    'OM1 - <25%' (all cased characters are upper case), so such a line would
    end the section early -- confirm this cannot occur in the reports.

    Returns:
        - list: the first row's 'b' cell followed by the text of each line in
          the requested section
    """
    section_lines = [striped_record.iloc[0]['b']]
    inside_section = False
    for cell_text in striped_record['a']:
        if inside_section:
            if cell_text.isupper():
                break
            section_lines.append(cell_text)
        if artery_type in cell_text:
            inside_section = True
    return section_lines


def get_level_of_stenosis_from_string(artery_info):
    """
    Find every stenosis percentage mentioned in one line of report text.

    The pattern needs one arbitrary character in front of the number and at
    least two digits in total, so a percentage at the very start of the
    string, or a single-digit percentage, is not matched.

    Returns:
        - list of str: the matched percentages (e.g. '<25%', '25-50%') with
          surrounding whitespace removed, in order of appearance.
    """
    percentage_pattern = re.compile(r'.\d{1,3}.?\d{1,3}\%')
    found = []
    for match in percentage_pattern.finditer(artery_info):
        found.append(match.group(0).strip())
    return found

In [3]:
# Directory that holds the doctors' report files. Set the REPORTS_PATH
# environment variable to run the notebook on another machine; the original
# location stays as the default.
reports_path = os.environ.get(
    'REPORTS_PATH',
    '/home/marichka/PycharmProjects/CoronaryArteryPlaqueIdentification/reports',
)
# Every entry of the directory is listed, whatever its type or extension.
list_of_files = os.listdir(reports_path)

In [4]:
# GET LIST OF ALL BRANCHES OF RCA
# Collect the leading label (text before ' -') of every line in the
# 'RIGHT CORONARY' section of each report. Labels of 20+ characters or
# containing a '.' are skipped.
unique_rca = list()
for cur_report_name in list_of_files:
    try:
        cur_file = read_and_strip_record(os.path.join(reports_path, cur_report_name))
        cur_patient_info = get_lad_info_from_report(cur_file, 'RIGHT CORONARY')
        # The first element comes from the report's first row, not from the
        # artery section, so it is skipped.
        branch_names = [el.split(' -')[0].lstrip() for el in cur_patient_info[1:]]
        unique_rca.extend(
            name for name in branch_names if len(name) < 20 and '.' not in name
        )
    except Exception as exc:
        # Best effort: skip files that cannot be parsed, but say which one and
        # why. `Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        print('Error!', cur_report_name, repr(exc))

Error!
Error!
Error!
Error!
Error!
Error!
Error!
Error!


In [5]:
# GET LIST OF ALL BRANCHES OF LEFT CIRCUMFLEX
# Collect the leading label (text before ' -') of every line in the
# 'LEFT CIRCUMFLEX' section of each report. Labels of 20+ characters or
# containing a '.' are skipped.
unique_lcx = list()
for cur_report_name in list_of_files:
    try:
        cur_file = read_and_strip_record(os.path.join(reports_path, cur_report_name))
        cur_patient_info = get_lad_info_from_report(cur_file, 'LEFT CIRCUMFLEX')
        # The first element comes from the report's first row, not from the
        # artery section, so it is skipped.
        branch_names = [el.split(' -')[0].lstrip() for el in cur_patient_info[1:]]
        unique_lcx.extend(
            name for name in branch_names if len(name) < 20 and '.' not in name
        )
    except Exception as exc:
        # Best effort: skip files that cannot be parsed, but say which one and
        # why. `Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        print('Error!', cur_report_name, repr(exc))

Error!
Error!
Error!
Error!
Error!
Error!
Error!
Error!


In [6]:
# UNIQUE LCX SECTIONS in REPORTS
# Labels that occur more than 20 times across all reports, most frequent first.
pd.Series(unique_lcx).value_counts().loc[lambda counts: counts > 20].index

Index(['Proximal', 'OM1', 'Distal', 'OM2', 'OM3', 'PDA', 'PLV', 'PL'], dtype='object')

In [7]:
# UNIQUE RCA SECTIONS in REPORTS
# Labels that occur more than 5 times across all reports, most frequent first.
pd.Series(unique_rca).value_counts().loc[lambda counts: counts > 5].index

Index(['Proximal', 'Distal', 'Mid', 'PDA', 'PLV'], dtype='object')

In [8]:
# UNIQUE BRANCHES in IMAGES
# Reads 'num_unique_branches.csv' from the current working directory.
# NOTE(review): this frame is not referenced again in the visible cells --
# confirm whether the cell is still needed.
unique_branches_images = pd.read_csv('num_unique_branches.csv')

In [9]:
# GET LCX TABLE

# Empty label table: PATIENT_ID and REPORT_ID, then one column per LCX
# section name looked up in the report text.
lcx_table_columns = [
    'PATIENT_ID', 'REPORT_ID',
    'PROXIMAL', 'DISTAL',
    'OM', 'OM1', 'OM2', 'OM3',
    'PDA', 'PLV',
]
extracted_lcx_df = pd.DataFrame(columns=lcx_table_columns)


def calculate_stenosis(info):
    """
    Return the stenosis label for one line of report text.

    Returns:
        - str: the first percentage found in `info`, or 'NORMAL' when the
          line contains no percentage.
    """
    percentages = get_level_of_stenosis_from_string(info)
    if percentages:
        return percentages[0]
    return 'NORMAL'

# Build one row per report from its 'LEFT CIRCUMFLEX' section.
# Rows are collected in a list and turned into a frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on
# every call. Rebuilding from scratch also makes this cell safe to re-run.
lcx_columns = list(extracted_lcx_df.columns)
# Only the section columns are searched for in the text; the two identifier
# columns must never be overwritten by a text match.
lcx_branch_columns = [col for col in lcx_columns if col not in ('PATIENT_ID', 'REPORT_ID')]
lcx_rows = []
for cur_report_name in tqdm(list_of_files):
    try:
        cur_file = read_and_strip_record(os.path.join(reports_path, cur_report_name))
        cur_patient_info = get_lad_info_from_report(cur_file, 'LEFT CIRCUMFLEX')

        # '-' marks a section that the report does not mention.
        new_row = dict.fromkeys(lcx_columns, '-')
        new_row['PATIENT_ID'] = cur_patient_info[0]
        # Takes the text after 'RT ' and before the first '.' of the file
        # name; a name without 'RT ' raises IndexError and the file is skipped.
        new_row['REPORT_ID'] = cur_report_name.split('.')[0].split('RT ')[1]

        for line_info in cur_patient_info[1:]:
            for branch in lcx_branch_columns:
                # The trailing space keeps 'OM' from matching 'OM1', 'OM2', ...
                token = branch + ' '
                if token in line_info or token.lower() in line_info or token.title() in line_info:
                    new_row[branch] = calculate_stenosis(line_info)
        lcx_rows.append(new_row)
    except Exception as exc:
        # Best effort: skip files that cannot be parsed, but say which one and
        # why. `Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        print("Error!", cur_report_name, repr(exc))
extracted_lcx_df = pd.DataFrame(lcx_rows, columns=lcx_columns)
extracted_lcx_df.head()

 11%|█         | 79/752 [00:01<00:08, 75.29it/s]

Error!


 19%|█▉        | 144/752 [00:01<00:08, 74.72it/s]

Error!


 35%|███▌      | 265/752 [00:03<00:06, 74.36it/s]

Error!


 39%|███▊      | 290/752 [00:03<00:06, 74.70it/s]

Error!


 42%|████▏     | 315/752 [00:04<00:05, 74.41it/s]

Error!


 74%|███████▎  | 553/752 [00:07<00:02, 67.60it/s]

Error!


 84%|████████▎ | 629/752 [00:08<00:01, 70.18it/s]

Error!


 95%|█████████▍| 711/752 [00:10<00:00, 68.53it/s]

Error!


100%|██████████| 752/752 [00:10<00:00, 70.71it/s]


Unnamed: 0,PATIENT_ID,REPORT_ID,PROXIMAL,DISTAL,OM,OM1,OM2,OM3,PDA,PLV
0,CTCAMCJ29041973,CTCAMCJ29041973,NORMAL,NORMAL,-,NORMAL,-,-,-,-
1,1051 CTCA1940,1051 CTCA1940,NORMAL,NORMAL,-,NORMAL,-,-,-,-
2,CTCAONK14021969,CTCAONK14021969,NORMAL,NORMAL,-,NORMAL,-,-,-,-
3,CTCAINM24101943,CTCAINM24101943,NORMAL,NORMAL,-,NORMAL,-,-,-,-
4,CTCAZHX30011957,CTCAZHX30011957,NORMAL,NORMAL,-,<25%,NORMAL,-,-,-


In [10]:
# Display the last rows of the LCX label table.
extracted_lcx_df.tail()

Unnamed: 0,PATIENT_ID,REPORT_ID,PROXIMAL,DISTAL,OM,OM1,OM2,OM3,PDA,PLV
739,1012 CTCA1962,1012 CTCA1962,NORMAL,NORMAL,-,NORMAL,NORMAL,-,-,-
740,CTCADAP31101957,CTCADAP31101957,25%,NORMAL,-,25%,NORMAL,-,-,-
741,CTCAPHD16081938,CTCASPHD16081938,25%,25%,-,25-50%,-,-,-,-
742,1029 CTCA1946,1029 CTCA1946,25%,NORMAL,-,NORMAL,NORMAL,-,NORMAL,-
743,CTCAVAV21041974,CTCAVAV21041974,NORMAL,-,-,NORMAL,NORMAL,-,-,-


In [11]:
# Write the LCX label table to an Excel file in the working directory,
# without the row index.
extracted_lcx_df.to_excel('lcx_labels_titlesreports.xlsx', index=False)

In [54]:
# extracted_lcx_df['PLV']

In [12]:
# GET RCA TABLE
# Empty label table: PATIENT_ID and REPORT_ID, then one column per RCA
# section name looked up in the report text.
rca_table_columns = [
    'PATIENT_ID', 'REPORT_ID',
    'PROXIMAL', 'DISTAL', 'MID',
    'PDA', 'PLV',
]
extracted_rca_df = pd.DataFrame(columns=rca_table_columns)

def calculate_stenosis_rca(info, cur_report_name):
    """
    Return the stenosis label for one line of an RCA report section.

    Args:
        info: the text of the line.
        cur_report_name: accepted but not used by this function.

    Returns:
        - str: '-' for lines longer than 100 characters; otherwise 'NORMAL'
          if the line contains 'normal' (case-insensitive); otherwise the
          first percentage found, or '-' if there is none.
    """
    if len(info) > 100:
        return '-'
    percentages = get_level_of_stenosis_from_string(info)
    # The word 'normal' wins over any percentage present on the same line.
    if 'normal' in info.lower():
        return 'NORMAL'
    return percentages[0] if percentages else '-'

# Build one row per report from its 'RIGHT CORONARY' section.
# Rows are collected in a list and turned into a frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on
# every call. Rebuilding from scratch also makes this cell safe to re-run.
rca_columns = list(extracted_rca_df.columns)
# Only the section columns are searched for in the text; the two identifier
# columns must never be overwritten by a text match.
rca_branch_columns = [col for col in rca_columns if col not in ('PATIENT_ID', 'REPORT_ID')]
rca_rows = []
for cur_report_name in list_of_files:
    try:
        cur_file = read_and_strip_record(os.path.join(reports_path, cur_report_name))
        cur_patient_info = get_lad_info_from_report(cur_file, 'RIGHT CORONARY')

        # '-' marks a section that the report does not mention.
        new_row = dict.fromkeys(rca_columns, '-')
        new_row['PATIENT_ID'] = cur_patient_info[0]
        # Takes the text after 'RT ' and before the first '.' of the file
        # name; a name without 'RT ' raises IndexError and the file is skipped.
        new_row['REPORT_ID'] = cur_report_name.split('.')[0].split('RT ')[1]

        for line_info in cur_patient_info[1:]:
            for branch in rca_branch_columns:
                # The trailing space makes the match require a following space.
                token = branch + ' '
                if token in line_info or token.lower() in line_info or token.title() in line_info:
                    new_row[branch] = calculate_stenosis_rca(line_info, cur_report_name)
        rca_rows.append(new_row)
    except Exception as exc:
        # Any failure lands here, not only non-Excel files, so report the file
        # and the actual exception instead of guessing the cause.
        print("Error, could not process report:", cur_report_name, repr(exc))
extracted_rca_df = pd.DataFrame(rca_rows, columns=rca_columns)
extracted_rca_df.head()

Error, wrong file (not excel)!
Error, wrong file (not excel)!
Error, wrong file (not excel)!
Error, wrong file (not excel)!
Error, wrong file (not excel)!
Error, wrong file (not excel)!
Error, wrong file (not excel)!
Error, wrong file (not excel)!


Unnamed: 0,PATIENT_ID,REPORT_ID,PROXIMAL,DISTAL,MID,PDA,PLV
0,CTCAMCJ29041973,CTCAMCJ29041973,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL
1,1051 CTCA1940,1051 CTCA1940,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL
2,CTCAONK14021969,CTCAONK14021969,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL
3,CTCAINM24101943,CTCAINM24101943,<25%,NORMAL,NORMAL,NORMAL,NORMAL
4,CTCAZHX30011957,CTCAZHX30011957,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL


In [13]:
# Display the last rows of the RCA label table.
extracted_rca_df.tail()

Unnamed: 0,PATIENT_ID,REPORT_ID,PROXIMAL,DISTAL,MID,PDA,PLV
739,1012 CTCA1962,1012 CTCA1962,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL
740,CTCADAP31101957,CTCADAP31101957,-,-,-,-,-
741,CTCAPHD16081938,CTCASPHD16081938,25%,25%,25-50%,25%,25-50%
742,1029 CTCA1946,1029 CTCA1946,-,-,-,-,-
743,CTCAVAV21041974,CTCAVAV21041974,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL


In [68]:
# extracted_rca_df.to_csv('rca_labels_titlesreports.csv', index=False)

In [14]:
# Write the RCA label table to an Excel file in the working directory,
# without the row index.
extracted_rca_df.to_excel('rca_labels_titlesreports.xlsx', index=False)