In [1]:
import pandas as pd
import os
from tqdm import tqdm
import re
from numpy import nan as Nan

#### Two files were removed from the full dataset because they caused problems:
####   1. "REPORT 1018 CTCA1948.xlsx" - stripping it still produced NaN values after the fillna() call
####   2. some system file (not a report)


In [2]:
# Folder that holds the doctors' reports. The CTCA_REPORTS_DIR environment
# variable overrides the default location, so the notebook can run on another machine.
path_to_records = os.environ.get(
    'CTCA_REPORTS_DIR',
    r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/reports',
)

# os.listdir returns names in arbitrary order and includes everything in the
# folder. Sort for a reproducible row order, and skip hidden files, Excel lock
# files ('~$...') and sub-directories, none of which can be read as a report.
list_of_files = sorted(
    file_name
    for file_name in os.listdir(path_to_records)
    if not file_name.startswith(('.', '~$'))
    and os.path.isfile(os.path.join(path_to_records, file_name))
)

# Extract LAD (left anterior descending artery)

In [3]:
def strip_mpr_lad_name(mpr_name):
    """
    Normalise the MPR name of the LAD artery so that it can be compared with the
    name used in the doctor's report (the two are written differently).

    All whitespace is removed first, then every '*' and every occurrence of
    the word 'original' is deleted.

    Returns:
        - str: stripped string
    """
    compact_name = "".join(mpr_name.split())
    for unwanted in ('*', 'original'):
        compact_name = compact_name.replace(unwanted, '')
    return compact_name

def read_and_strip_record(path_to_record):
    '''
    Read a report spreadsheet and remove its empty rows.

    The sheet is read without a header row and its first two columns are renamed
    to 'a' and 'b'. Missing cells are replaced by a two-space placeholder, and
    every row whose 'a' cell is whitespace-only is removed (rows that were
    entirely NaN are removed as well). The original row labels are kept.

    Args:
        - path_to_record (str): path of the Excel file to read

    Returns:
        - Pandas DataFrame: the cleaned record; blank cells are stored as '  '
    '''
    blank = '  '
    record = pd.read_excel(path_to_record, index_col=None, header=None)
    # dropna returns a new frame, so the result must be kept for the call to have any effect.
    record = record.dropna(how='all')
    record = record.rename(columns={0: 'a', 1: 'b'})
    record = record.fillna(blank)
    # NOTE(review): with an empty pattern pandas appears to fall back to literal
    # matching, i.e. only empty-string cells are replaced - confirm.
    record = record.replace('', blank, regex=True)
    # Non-string cells (e.g. numbers) are treated as non-blank. `.str.isspace()`
    # yields NaN for them, and a mask containing NaN cannot be used for indexing.
    is_blank = record['a'].map(
        lambda cell: isinstance(cell, str) and cell.isspace()
    ).astype(bool)
    return record.loc[~is_blank].copy()

def get_lad_info_from_report(striped_record, artery_type):
    """
    Collect the lines of a stripped record (no empty rows, no NaNs) that belong
    to the section of the given artery type.

    The section starts after the first row whose 'a' cell contains `artery_type`
    and ends at the next row whose 'a' cell is fully upper-case (or at the end
    of the record).

    Returns:
        - list: the first element is the 'b' cell of the first row; every
                following element is a string with some info about the artery type
    """
    collected = [striped_record.iloc[0]['b']]
    inside_section = False

    for _, row in striped_record.iterrows():
        cell = row['a']
        if inside_section:
            # An upper-case line closes the current section.
            if cell.isupper():
                break
            collected.append(cell)

        # The matching line itself is not collected, only the lines after it.
        if artery_type in cell:
            inside_section = True

    return collected

def get_level_of_stenosis_from_string(artery_info):
    """
    Find every stenosis percentage mentioned in a report line.

    A match is: an optional single leading character (this keeps a prefix such
    as '<' or '>'), 1-3 digits, an optional single separator character
    (e.g. '-' in '25-50%'), 1-3 digits and a '%' sign. The value therefore needs
    at least two digits in total. Surrounding whitespace is stripped from each match.

    Args:
        - artery_info (str): one line of text from the report

    Returns:
        - list of str: each element is the string with percentage of stenosis,
                       in order of appearance; empty list when none is found
    """
    # The leading character is optional so that a percentage at the very start
    # of the string (e.g. '25% stenosis') is found as well.
    return [match.strip() for match in re.findall(r'.?\d{1,3}.?\d{1,3}\%', artery_info)]

# LAD data extraction

In [4]:
# Build one row per report: the patient ID plus the stenosis level of every LAD
# segment/branch. '-' means the report does not mention the segment, 'NORMAL'
# means it is mentioned without a stenosis percentage.
lad_columns = ['PATIENT_ID','PROXIMAL', 'MID', 'DISTAL', 'D-1', 'D-2', 'D-3', 'D-4']
# Only the segment/branch names are searched for in the report text; a report
# line must never overwrite the patient ID.
list_of_lda_branches = lad_columns[1:]

lad_rows = []
for file_name in tqdm(list_of_files):
    cur_file = read_and_strip_record(os.path.join(path_to_records, file_name))
    # The first element is the patient ID, the rest are the lines of the LAD section.
    patient_id, *cur_patient_lad_info = get_lad_info_from_report(cur_file, 'LEFT ANTERIOR')

    new_row = dict.fromkeys(lad_columns, '-')
    new_row['PATIENT_ID'] = patient_id

    for line_info in cur_patient_lad_info:
        # The first branch name (in column order) found in the line wins; the
        # name is accepted in its original, lower-case or title-case spelling.
        artery_area_name = next(
            (x for x in list_of_lda_branches
             if x in line_info or x.lower() in line_info or x.title() in line_info),
            None,
        )
        if artery_area_name is None:
            continue
        stenosis_score = get_level_of_stenosis_from_string(line_info)
        new_row[artery_area_name] = stenosis_score[0] if stenosis_score else 'NORMAL'
    lad_rows.append(new_row)

# Create the frame once from all rows: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
extracted_lad_df = pd.DataFrame(lad_rows, columns=lad_columns)

100%|██████████| 744/744 [00:08<00:00, 85.94it/s]


In [12]:
# Preview only the first rows instead of dumping the whole table into the output.
extracted_lad_df.head(10)

Unnamed: 0,PATIENT_ID,PROXIMAL,MID,DISTAL,D-1,D-2,D-3,D-4
0,CTCAHER21101967,<25%,<25%,NORMAL,NORMAL,-,-,-
1,CTCAGRH27071943,NORMAL,NORMAL,NORMAL,NORMAL,-,-,-
2,CTCADRG22021959,<25%,<25%,NORMAL,<25%,<25%,NORMAL,-
3,CTCAXUZ07071955,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL,-
4,CTCAQUD02121959,70%,25-50%,NORMAL,NORMAL,NORMAL,-,-
5,CTCAOSP05031947,25%,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL,-
6,CTCAHIP19061975,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL,-,-
7,CTCAMCF02061941,<25%,<25%,<25%,<25%,-,-,-
8,CTCAKAG20081953,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL,-,-
9,CTCASHZ19081947,25%,25-50%,NORMAL,50%,-,-,-


# Save LAD results

In [5]:
# Write the extracted LAD table to an Excel file in the working directory,
# without the DataFrame index column.
extracted_lad_df.to_excel(excel_writer='lad_reports.xlsx', index=False)