In [1]:
import pandas as pd
import numpy as np
import os
import shutil
from tqdm import tqdm
import pydicom as dicom

import os
import re
from numpy import nan as Nan

In [2]:
##################################### ADDITIONAL FUNCTIONS #####################################################

def strip_mpr_lad_name(mpr_name):
    """
    Strip MPR name of the LAD artery. We do this step because the name in the doctor's report 
    is not equal to the name in the MPR. 
    
    Returns:
        - str: striped string
    """
    return "".join(mpr_name.split()).replace('*', '').replace('original', '')

def read_and_strip_record(path_to_record):
    '''
    Read record file and remove empty rows and rows with all NaNs.
    
    Returns:
        - Pandas DataFrame: 
    '''
    excel_file = pd.read_excel(path_to_record,index_col=None, header=None)
    excel_file.dropna(how='all')
    excel_file.rename(columns={0: 'a', 1: 'b'}, inplace=True)
    excel_file = excel_file.fillna('  ')
    excel_file = excel_file.replace('', '  ', regex=True)
    excel_file = excel_file.drop(excel_file[excel_file['a'].str.isspace()].index)
    return excel_file

def get_lad_info_from_report(striped_record, artery_type):
    """
    Takes striped(without any empty lines and NaNs) and returns info only about the certain artery type. 
    
    Returns:
        - list: each element is the string with some info about certain artery type
    """
    lad_info = []
    wether_add = False
    lad_info.append(striped_record.iloc[0]['b'])
    for ind, row_value in striped_record.iterrows():
        if wether_add and row_value['a'].isupper():
            break
        if wether_add:
            lad_info.append(row_value['a'])
        
        if artery_type in row_value['a']:
            wether_add = True
    return lad_info


def get_level_of_stenosis_from_string(artery_info):
    """
    Returns:
        - list of str: each element is the string with percentage of stenosis. 
    """
    return [x.strip() for x in re.findall(r'.\d{1,3}.?\d{1,3}\%', artery_info)]

In [3]:
reports_path = r'/home/petryshak/CoronaryArteryPlaqueIdentification/data/reports'
list_of_files = os.listdir(reports_path)

In [4]:
# GET LIST OF ALL BRANCHES OF RCA
unique_rca = list()
for i in range(len(list_of_files)):
    try:
        cur_report_name = list_of_files[i]
        cur_file = read_and_strip_record(os.path.join(reports_path, cur_report_name))
        cur_patient_info = get_lad_info_from_report(cur_file, 'RIGHT CORONARY')
        cur_patient_info.pop(0)
        for name in [el.split(' -')[0] for el in cur_patient_info]:
            new_branch = name.lstrip()
            if len(new_branch) < 20 and '.' not in new_branch:
                unique_rca.append(new_branch)
    except:
        print('Error!')

In [5]:
# GET LIST OF ALL BRANCHES OF LEFT CIRCUMFLEX 
unique_lcx = list()
for i in range(len(list_of_files)):
    try:
        cur_report_name = list_of_files[i]
        cur_file = read_and_strip_record(os.path.join(reports_path, cur_report_name))
        cur_patient_info = get_lad_info_from_report(cur_file, 'LEFT CIRCUMFLEX')
        cur_patient_info.pop(0)
        for name in [el.split(' -')[0] for el in cur_patient_info]:
            new_branch = name.lstrip()
            if len(new_branch) < 20 and '.' not in new_branch:
                unique_lcx.append(new_branch)
    except:
        print('Error!')

In [6]:
# UNIQUE LCX SECTIONS in REPORTS
pd.Series(unique_lcx).value_counts()[pd.Series(unique_lcx).value_counts() > 20].index

Index(['Proximal', 'OM1', 'Distal', 'OM2', 'OM3', 'PDA', 'PLV', 'PL'], dtype='object')

In [7]:
# UNIQUE RCA SECTIONS in REPORTS
pd.Series(unique_rca).value_counts()[pd.Series(unique_rca).value_counts() > 5].index

Index(['Proximal', 'Distal', 'Mid', 'PDA', 'PLV'], dtype='object')

In [8]:
# UNIQUE BRANCHES in IMAGES
# unique_branches_images = pd.read_csv('num_unique_branches.csv')

In [14]:
# GET LCX TABLE

extracted_lcx_df = pd.DataFrame(columns=['PATIENT_ID', 'REPORT_ID', 'PROXIMAL', 'DISTAL', 'OM', 'OM1', 'OM2', 'OM3', 
                                     'PDA', 'PLV'])


def calculate_stenosis(info):
    score = get_level_of_stenosis_from_string(info)
    score =  score[0] if score else 'NORMAL'
    return score

for i in tqdm(range(len(list_of_files))):
    try:
        cur_report_name = list_of_files[i]
        cur_file = read_and_strip_record(os.path.join(reports_path, cur_report_name))
        cur_patient_info = get_lad_info_from_report(cur_file, 'LEFT CIRCUMFLEX')

        new_row = pd.Series(['-','-','-','-','-','-','-','-', '-','-'], index=extracted_lcx_df.columns)
        new_row['PATIENT_ID'] = cur_patient_info[0]
        new_row['REPORT_ID'] = cur_report_name.split('.')[0].split('RT ')[1]
        print(cur_report_name.split('.')[0].split('RT ')[1])
        cur_patient_info.pop(0)
        list_of_lcx_branches = [x + ' ' for x in list(extracted_lcx_df.columns)]
        for line_info in cur_patient_info:
            artery_area_name = [x for x in list_of_lcx_branches 
                                if x in line_info or x.lower() in line_info or x.title() in line_info]
            if len(artery_area_name) >=1:
                for area_name in artery_area_name:
                    area_name = area_name.strip()
                    stenosis_score = calculate_stenosis(line_info)
                    new_row.loc[area_name] = stenosis_score
            else:
                continue
        extracted_lcx_df = extracted_lcx_df.append(new_row, ignore_index=True)
    except:
        print("Error!")
extracted_lcx_df.head()

  2%|▏         | 16/744 [00:00<00:09, 74.98it/s]

CTCAHER21101967
CTCAGRH27071943
CTCADRG22021959
CTCAXUZ07071955
CTCAQUD02121959
CTCAOSP05031947
CTCAHIP19061975
CTCAMCF02061941
CTCAKAG20081953
CTCASHZ19081947
CTCAVIC18091963
CTCACUY07051952
CTCAAND19041968
CTCAGRP22061951
CTCABRV22041952
CTCASIK14121953


  4%|▍         | 31/744 [00:00<00:09, 72.13it/s]

1020 CTCA1963
CTCABUT30081955
CTCAROJ23091952
CTCAMOW12101970
CTCAABK06031973
1026 CTCA1953
CTCASTR17021954
CTCAFIM15041952
CTCACOD18031960
CTCABAD21091949
CTCAHUR24071959
CTCALID30101976
CTCAHAF13091949
CTCAWED27071959
1044 CTCA1967


  6%|▋         | 47/744 [00:00<00:09, 74.37it/s]

CTCA20011971
CTCAROM11031942
CTCAGAM01121957
CTCADIG09091944
CTCABIA20011955
CTCAMOV27061944
1002 CTCA1955
CTCAHOR25041945
CTCAMOR21071983
CTCACOA15011958
CTCABAB18031970
CTCALES06081963
CTCASKF31031950
CTCACAS09041982
CTCAGOA24091965
CTCASAI16101954


  8%|▊         | 63/744 [00:00<00:09, 75.60it/s]

1037 CTCA1969
CTCABAS19021964
CTCACHA31011951
CTCACAP30091958
CTCABAS25051953
CTCACRD12031954
CTCAMOG10091935
CTCADUM13041972
CTCASIN15081948
CTCADOJ09111951
CTCATHJ17011957
CTCAJONM04111950
CTCABAJ03111955
CTCAGAK23041963
CTCAWUK05041963
CTCAHIB10121959
CTCAHUG25081941


 11%|█         | 79/744 [00:01<00:08, 75.52it/s]

CTCABAU12091968
CTCACEI01071940
CTCACAS11041945
1019 CTCA1960
CTCAPAS13121955
CTCAARD10111950
CTCAPRN24101949
CTCABOM03071955
CTCABOF07081953
CTCAMIV03101952
CTCANAA25091925
CTCATHR12061965
CTCACHY14101949
CTCAGOD07121961
CTCADEW30121968


 12%|█▏        | 87/744 [00:01<00:09, 71.14it/s]

CTCAWIT08081971
1033 CTCA1945
CTCAABU16111967
CTCADHAG13081961
1046 CTCA1964
CTCASTP11041948
CTCASCN04121966
CTCAFAI15031978
CTCASTW15121946
CTCAJAD01031956
CTCAKAD01081968
1041 CTCA1964
CTCALEX19121968
1053 CTCA1979
CTCAPAH25051962


 14%|█▍        | 103/744 [00:01<00:08, 73.60it/s]

CTCANGT01051968
CTCAMOC16041942
CTCAMCB11031950
1031 CTCA1967
CTCAKIT24071959
1045 CTCA1950
CTCADAR31081968
CTCAPUJ05021952
CTCAGRA18051968
CTCAHUJ15101954
CTCAANV13011954
CTCAHAE26121983
CTCABEL03111978
CTCACAR02091949
CTCAMAE11081949
CTCATRH02061949


 16%|█▌        | 119/744 [00:01<00:08, 75.14it/s]

CTCABLB03011936
CTCADIJ24101985
CTCAZAS01111946
1063 CTCA1979
CTCAALJ04071966
CTCALAC24101956
CTCAMOE07031963
CTCACAG11081945
CTCAAVH30091974
CTCAPRR29101944
CTCAMAT14101964
CTCAKOK28071950
CTCAPRD05011942
CTCAELR05061973
CTCANID17011976
CTCASAR26091960


 18%|█▊        | 135/744 [00:01<00:08, 73.55it/s]

CTCATAA28071960
CTCADAN16011945
CTCAGIJ28071948
CTCAFAJ25011956
CTCAHEJ29071947
CTCAPEP30101982
CTCAXIP08011952
CTCAPHT13091968
CTCAPOA11041950
CTCATRM17041959
CTCAGRS22121954
CTCAYAX04121967
CTCACOC31031962
CTCACOV07031950
CTCAGOS05111954


 20%|██        | 151/744 [00:02<00:07, 74.41it/s]

CTCAVAV21041974
CTCASIB31121979
CTCAEWG11041952
CTCAGIM19101962
CTCATOJ08041973
CTCALIM04031966
CTCABRL10121958
CTCABIA20051940
CTCAWOT09081947
CTCAVEC19021971
CTCATUQ02091955
CTCATIS27021946
CTCAALR18121945
CTCAHEM27111973
CTCATRM02031956
1066 CTCA1963


 22%|██▏       | 167/744 [00:02<00:07, 74.60it/s]

CTCALYP04021962
CTCASAM15101957
CTCAREJ13011964
CTCATRD05041958
CTCAMCD18111977
CTCAHEY04071964
CTCAEIA13071967
CTCADAM16061964
CTCAKEP15081962
CTCAMCR11051947
1028 CTCA1964
CTCAMAM06011952
CTCAHOK10051952
CTCARUN29081958
CTCABLG12101945
CTCABAN15051969


 25%|██▍       | 183/744 [00:02<00:07, 74.69it/s]

CTCAZAM03011951
CTCASAR15101938
CTCADED23081947
CTCADRC03011954
CTCAGAE16091970
1014 CTCA1950
CTCAALR27081972
CTCAROM04021952
CTCAHOE06011952
CTCABOJ02101965
CTCAINJ15061951
CTCADES05101972
CTCATOL08041984
CTCAUMD28031950
CTCATRH14021945


 27%|██▋       | 199/744 [00:02<00:07, 73.85it/s]

CTCAKAJ12021946
1011 CTCA1966
CTCAWUJ01041970
CTCAMCJ29041973
CTCABLG30101947
CTCASCF04051972
CTCAPAB25021966
CTCAHAP22031954
CTCACOC13081966
CTCADIM19081959
CTCALED04121967
CTCAARA22081947
1025 CTCA1973
CTCAMUC23091956
CTCAKOR23091950
CTCALOL05111960


 29%|██▉       | 215/744 [00:02<00:07, 75.06it/s]

CTCASIP17081980
CTCAYAY28011972
CTCACOA05051980
CTCAODJ09091954
1051 CTCA1940
CTCADUC28011969
CTCADES01051972
CTCASAR25081950
CTCAALR31101986
CTCAFOJ19021975
CTCAVAH09071948
CTCAZOD18071961
1010 CTCA1952
CTCARAA13051979
CTCACHS06011940
CTCAGUN15021975


 31%|███       | 231/744 [00:03<00:06, 75.65it/s]

CTCASMG29031951
CTCABAT27011976
CTCADOL12081970
CTCADJG28091949
CTCADEB14021972
CTCAJEA09111957
CTCALIM27071947
CTCAPID12031963
CTCAAID19041953
CTCASIP30041975
CTCASAM23081952
CTCAYEL10021947
CTCATHM25111962
CTCAWIS30071966
CTCABIY07041951
CTCAGUK01051961


 33%|███▎      | 247/744 [00:03<00:06, 73.93it/s]

CTCAEAL11071946
CTCAFAR17121938
CTCAJIH04111954
CTCABRK09121977
CTCABUS16061963
CTCABIJ11051956
CTCADER26031967
CTCAOBA05021946
CTCAKAM12031972
CTCALUX13101950
CTCANGM17081945
CTCAANK23101957
1054 CTCA1953
CTCAALC04021959
CTCACOS28101950


 35%|███▌      | 263/744 [00:03<00:06, 75.07it/s]

CTCAZOF20051940
CTCAJAC22091959
CTCAWAK11061958
CTCAHOC02031957
CTCASTE11081946
1052 CTCA1957
CTCAHUW03041988
CTCABOM15031948
CTCATRH10061944
CTCARAT29031953
CTCALOJ20051936
CTCALAR04081987
CTCAFRP12041968
CTCAMIK02071954
CTCACHK07111955
CTCAFRD01071967


 38%|███▊      | 279/744 [00:03<00:06, 75.56it/s]

CTCAQUM09101945
CTCABOM03051933
CTCADAJ03121967
CTCASOH10051949
CTCAHOJ02091938
CTCAPOR20061944
CTCAZSR26031950
CTCAYOP20011964
CTCATSR04011957
CTCAZHX30011957
CTCACHD22121964
CTCAWHN01061948
Error!
1067 CTCA1946
CTCAJOK16081964
CTCAMIJ01011949


 40%|███▉      | 295/744 [00:03<00:06, 73.85it/s]

CTCALOR28041951
CTCABLI29061978
CTCACAM19011957
CTCACAA05121944
CTCASPHD16081938
CTCAYOG08091955
CTCABUL19101948
CTCAKEM21011954
CTCAFRZ25041962
CTCAELS01121969
CTCAALJ05061968
CTCACUM29011952
CTCANIN05021971
CTCAPAP16061962
CTCAIAM10111964
1021 CTCA1957


 42%|████▏     | 311/744 [00:04<00:05, 74.72it/s]

CTCAPOT29031971
CTCAGAB19091982
CTCAVAA12021957
CTCANIB03021959
CTCABAS07091965
CTCABUA19081963
CTCABAL17031943
CTCACHT10121950
CTCABOC04051952
CTCASAJ20051973
CTCAMAC08071949
CTCAROF27081939
CTCALOC04031955
CTCADIE26111971
CTCABIV01071975
1017 CTCA1965


 44%|████▍     | 327/744 [00:04<00:05, 75.65it/s]

CTCAWIA16091949
CTCAOLM18061959
CTCAKNJ05061962
CTCAMIJ24121949
1068 CTCA1965
CTCARYA14051975
1038 CTCA1979
CTCAROS28091967
CTCALAN28121951
CTCAMCM13101965
CTCADOC01011939
CTCALIS10071959
CTCAPWG15041947
CTCANIG15071969
CTCAFON19121956


 46%|████▌     | 343/744 [00:04<00:05, 72.90it/s]

CTCALAS15021963
CTCAWIM24071960
CTCAYUH07051951
CTCAREL07111960
CTCADER07041933
CTCAJOS19101956
CTCANIN21061961
CTCALAM11041957
1055 CTCA1940
CTCAFLA01021951
CTCANAI09081957
CTCAMCE16031959
CTCAKRR04101944
CTCAALJ09041974
CTCARYM13031960
CTCAMUJ26121935


 48%|████▊     | 359/744 [00:04<00:05, 73.84it/s]

CTCACON15031968
1006 CTCA1961
CTCAPUP12081941
CTCAWHG07051947
CTCACOJ19041959
CTCASUP18091942
CTCAVES16111953
CTCAMID23021968
CTCAPAP26101958
CTCALIY25101936
CTCANIC06071977
CTCAWAG26031956
1056 CTCA1975
CTCACAS02111945
CTCADEA11031934
CTCAKEA21071963


 50%|█████     | 375/744 [00:05<00:05, 72.36it/s]

CTCAHAM12031963
CTCALAM07041983
CTCANAM01011958
1065 CTCA1975
CTCAPHM02051973
CTCATRH28111961
CTCAEUJ28061947
CTCASTP26091970
CTCAATJ12091964
CTCASAD22031955
CTCAWAL11031957
CTCAFOA17071936
1022 CTCA1970
CTCASMW25011965
CTCANGD08021962
CTCASCG29081959


 53%|█████▎    | 391/744 [00:05<00:04, 73.96it/s]

CTCACLM08111963
CTCAREA01101963
CTCAOZA25051960
CTCAANM20091965
CTCAPHN24041956
CTCAERE19041969
CTCAAVE26051952
CTCASHK01071965
CTCALIY07061941
CTCASAA16011948
CTCAGOJ04081950
CTCAYEA25021959
CTCAKAM05041965
CTCAFIF29081962
CTCALEH15091954


 55%|█████▍    | 407/744 [00:05<00:04, 69.73it/s]

CTCAWOD25061972
CTCALEP19021961
CTCAHUL31121979
1030 CTCA1942
CTCANOT29121954
DOP23111947
1007 CTCA1959
CTCARUE27031956
CTCALIG23101958
1029 CTCA1946
CTCALOG09081952
CTCAWOJ08061951
1043 CTCA1947
CTCAONK14021969
CTCADIK26091982


 57%|█████▋    | 423/744 [00:05<00:04, 71.28it/s]

CTCAHAK07071989
CTCAPER20051953
CTCAFIL30091959
CTCACLP03101964
CTCAGHR05051944
1013 CTCA1968
CTCAMIA19051956
CTCACHJ03051975
CTCAABS20051975
CTCAALS10011968
CTCADEW15091966
CTCATRK08061957
CTCABIG08081948
CTCATAY21121955
CTCAGHS23011946


 58%|█████▊    | 431/744 [00:05<00:04, 72.30it/s]

CTCAGRK05051966
CTCACOL20081960
CTCAWAD12111978
CTCARUJ13031970
CTCASAF08011966
CTCAMAB06011971
CTCAKAA04061988
CTCALEK24101973
CTCASAA04021955
CTCALIH29011953
CTCAYOA13121966
CTCATAL16071942
CTCAKOG14041971
CTCANIJ16061961
CTCAANJ15011966
CTCALIH01041952


 60%|██████    | 447/744 [00:06<00:04, 69.16it/s]

1059 CTCA1948
CTCADUN02091949
CTCAYOM21111962
CTCASAC09091967
CTCAALK23011941
CTCAWIA26081961
1060 CTCA1959
CTCADES01091969
CTCACUJ26111956
CTCATUJ10031952
CTCACAL06051964
CTCACUL11051976
CTCASHJ10081965
CTCAFIR29061959


 62%|██████▏   | 463/744 [00:06<00:03, 72.56it/s]

CTCAMCJ03111971
CTCANIP30061951
1042 CTCA1963
CTCACAG10091951
1057 CTCA1970
CTCAABA15071978
CTCABAD05021966
CTCAHAM21081956
CTCAHAA12121955
CTCADEE10061964
CTCASPF22031950
CTCANGJ21101965
CTCANGD08031959
CTCACAM19071962
CTCAAMJ02121958
CTCABLM10031948


 64%|██████▍   | 479/744 [00:06<00:03, 74.60it/s]

CTCAWIW03091957
CTCARUM21071960
CTCASUR08101955
CTCALAS26031955
CTCAKIM10051959
CTCAROA25111965
CTCAKUJ26101958
CTCALIJ25091960
CTCAGAM07071954
1027 CTCA1965
CTCAFRD03031970
CTCALAN17091954
1008 CTCA1953
CTCAMOM16071939
CTCACHN27011976
CTCAMAJ11101955


 67%|██████▋   | 495/744 [00:06<00:03, 74.07it/s]

CTCADUS31011959
CTCALIT24011959
1062 CTCA1950
1049 CTCA1948
CTCAGOD16101961
CTCAPIH03011940
CTCABUL09101957
CTCASUM14111968
CTCASOA02031935
CTCAKAG03101958
CTCABIM05031976
CTCAREM16071951
CTCAGRK16111978
CTCAFUK23081966
CTCAROD03071963


 69%|██████▊   | 511/744 [00:06<00:03, 74.88it/s]

CTCAFOL20081954
CTCABAR15031947
CTCALOC30081957
CTCAPAD28111958
CTCAHAM28051973
CTCACHD05011974
CTCAMAJ25091943
CTCAMEJ30111966
1039 CTCA1962
CTCAFAM14021944
CTCAZDV13081958
1050 CTCA1970
CTCABUK10061941
CTCASEA22051965
CTCARON05071963
CTCABEJ27081974


 71%|███████   | 527/744 [00:07<00:02, 75.20it/s]

CTCAMAA21091962
CTCASHI20071976
CTCABRR26081951
CTCAHOR03051964
1032 CTCA1953
CTCAJAB03061972
CTCAHAN06081951
CTCASSTR19121963
CTCABIK30071951
CTCAJOJ18081952
CTCAYAH26091963
CTCATKR01031953
1034 CTCA1968
CTCAWID07011953
CTCAPAE18101950
CTCABIS12031974


 73%|███████▎  | 543/744 [00:07<00:02, 75.55it/s]

CTCAWII29041949
CTCALYK26111949
CTCAMAD11101958
CTCABLP12071954
CTCADID11041947
CTCATOB10071953
CTCAARV06091960
CTCAOBO09071956
CTCADIJ01051959
CTCABRA09051945
1048 CTCA1969
CTCAREK14091958
1036 CTCA1958
CTCASZM03081953
CTCASAB26021963


 75%|███████▌  | 559/744 [00:07<00:02, 74.99it/s]

CTCAFIJ20051944
1024 CTCA1947
CTCAAMP05021943
CTCAXUG17101957
CTCAAND23011985
CTCASTS01111969
CTCAMUF24011960
CTCASTB22081954
CTCAJON06051958
CTCAMEL16071958
1061 CTCA1958
CTCAGOJ29111988
CTCASHA02091971
CTCASSCK30061977
CTCAARM02111955
CTCAMAJ16011947


 77%|███████▋  | 575/744 [00:07<00:02, 75.87it/s]

CTCAWAB12121964
CTCASAM14101958
CTCAGAL26021950
CTCATIR05011973
CTCAPAP02011978
CTCACOP16111942
CTCASPI20121934
CTCABOJ10101974
CTCALYM29101955
CTCASMR20051961
CTCAINM24101943
CTCAPAM11021971
CTCAHEH18071972
1035 CTCA1963
CTCAOSO06031956
CTCAGEW21051949


 79%|███████▉  | 591/744 [00:07<00:02, 76.43it/s]

CTCAJOJ30031955
CTCADAP31101957
CTCAWOI17051966
CTCAHAM24121959
CTCAFAS30041953
CTCAOBT05091952
CTCACAS22071957
CTCATRA27081947
CTCALIJ25121960
CTCABAD21031971
CTCAPOD03051961
CTCADIH01051966
CTCAATC18051949
CTCACLM20011937
CTCAKEK18041957
CTCADSM27111962


 82%|████████▏ | 607/744 [00:08<00:01, 72.54it/s]

CTCACHD14091940
CTCALAL12111977
CTCAALS24081961
CTCAGRE17121953
CTCASIJ17101947
CTCABRK17071953
1009 CTCA1955
CTCABRA11121962
CTCAHOD09121963
CTCAFUP12011963
CTCACHA05061964
CTCAPAW11081956
CTCAGAT05081949
CTCATAR27091957


 84%|████████▎ | 623/744 [00:08<00:01, 74.85it/s]

CTCALAC16021949
CTCACRM10081962
CTCABHP18021957
CTCAWIF19081951
CTCAKOR29061959
CTCAHOH10021975
CTCAHOH20111949
CTCALUR28041961
CTCAMCK11101957
CTCAZAA11071953
CTCAZON16011962
CTCAGIG10031949
CTCAJAE22121969
CTCAAMA10051949
CTCAKUP13111962
CTCABOC29071976
CTCAHAS05121968


 86%|████████▌ | 639/744 [00:08<00:01, 76.28it/s]

CTCABRZ31031967
CTCANGT15041950
CTCAROJ27021964
CTCAREM23111946
CTCAMIK28021956
CTCAHOB27061971
CTCAHOM24111954
CTCANEJ23051970
CTCAROM23031950
CTCAROA07091955
CTCASAA13011957
CTCATRN30041961
1003 CTCA1952
CTCADEM10031952
CTCAMAN25091961
CTCAPEP24061976


 88%|████████▊ | 655/744 [00:08<00:01, 73.09it/s]

1004 CTCA1955
1070 CTCA1950
CTCATAM15101977
CTCAWHJ03091947
CTCAKIK23071957
CTCAIBA01071966
CTCAPIS26121959
CTCATAS19041964
1016 CTCA1962
CTCALAT23091942
CTCAQUC25031951
CTCASAG10051945
CTCAWIC07031959
CTCAILK23071937
CTCAQUA07061949


 90%|█████████ | 671/744 [00:09<00:01, 72.65it/s]

1001 CTCA1947
CTCAGAY31051980
CTCASOK08111990
CTCASEV04021963
CTCABOJ16091972
CTCAPAR19111945
CTCAGIJ07041956
CTCAWOC03021939
CTCADIV19031951
1005 CTCA1945
1015 CTCA1967
CTCAMOC15111963
CTCABAM18061976
CTCACHQ16061953
CTCACID21061960


 92%|█████████▏| 687/744 [00:09<00:00, 73.59it/s]

CTCASTJ20111970
CTCAVAE10021954
CTCAMAT27021969
CTCANAS04021967
CTCATHC18051977
CTCAJOS11031956
1047 CTCA1946
CTCAPAE21071949
1069 CTCA1951
CTCATAN26011954
CTCAWET15021980
CTCAODJ10101953
1064 CTCA1958
CTCAGOH16121968
CTCADHP18091972
CTCADYJ20091966


 94%|█████████▍| 703/744 [00:09<00:00, 72.80it/s]

CTCAKIB01061964
CTCATUC21061956
CTCAILV18021966
CTCACAK20091961
CTCAHOP09101954
CTCABRR14011952
CTCACOM10011939
CTCAHOC22051965
CTCAFER25031943
CTCADAC17041956
CTCARIC26111946
CTCAARG09091933
CTCAKAA17051968
CTCAAGK05031979
CTCAARI01081944
CTCAPIJ31011964


 97%|█████████▋| 719/744 [00:09<00:00, 73.98it/s]

CTCAPEJ30101975
CTCATAM24041955
CTCAMAP18021966
COV10031966
CTCALID04021953
CTCADGOE28101952
CTCATRV02031937
CTCAKIA22041947
CTCANGA24091958
CTCARIS28021951
CTCALAH29101940
CTCAIKE28031986
CTCALAJ21021947
CTCAANM18021961
1040 CTCA1962
CTCAPOM18091967


 99%|█████████▉| 735/744 [00:09<00:00, 75.89it/s]

1023 CTCA1960
CTCACES20071965
CTCABIB16101983
1012 CTCA1962
CTCAMIA30041964
CTCABRW29041967
CTCASPB03121931
CTCAJHH20081965
CTCALIB10031961
CTCABAM11031964
CTCASCL12011981
CTCADOT21121946
CTCALID04061955
CTCAWOM06061941
CTCAVRC07081978
CTCADAG07031973
CTCARIS09111963


100%|██████████| 744/744 [00:10<00:00, 74.08it/s]

CTCAMIM08101949
CTCAODJ27091949
CTCALAN04091968
1058 CTCA1956
CTCABOJ29051945
CTCAGRC24011965
CTCAGAR01101953





Unnamed: 0,PATIENT_ID,REPORT_ID,PROXIMAL,DISTAL,OM,OM1,OM2,OM3,PDA,PLV
0,CTCAHER21101967,CTCAHER21101967,NORMAL,-,-,NORMAL,-,-,-,-
1,CTCAGRH27071943,CTCAGRH27071943,NORMAL,NORMAL,-,NORMAL,NORMAL,-,NORMAL,-
2,CTCADRG22021959,CTCADRG22021959,NORMAL,NORMAL,-,NORMAL,-,-,-,-
3,CTCAXUZ07071955,CTCAXUZ07071955,NORMAL,NORMAL,-,NORMAL,NORMAL,-,-,-
4,CTCAQUD02121959,CTCAQUD02121959,NORMAL,NORMAL,-,NORMAL,NORMAL,NORMAL,-,-


In [12]:
extracted_lcx_df.to_excel('lcx_labels.xlsx', index=False)

In [13]:
pd.value_counts(extracted_lcx_df['PLV'])

-         692
NORMAL     51
>70%        1
Name: PLV, dtype: int64

In [14]:
# GET RCA TABLE
extracted_rca_df = pd.DataFrame(columns=['PATIENT_ID', 'REPORT_ID', 'PROXIMAL', 'DISTAL', 'MID',
                                     'PDA', 'PLV'])

def calculate_stenosis_rca(info, cur_report_name):
    if len(info) > 100:
        return '-'
    score = get_level_of_stenosis_from_string(info)
    score = score[0] if score else '-'
    if 'normal' in info.lower():
        score = 'NORMAL' 
    return score

for i, el in enumerate(list_of_files):
    try:
        cur_report_name = list_of_files[i]
        cur_file = read_and_strip_record(os.path.join(reports_path, cur_report_name))
        cur_patient_info = get_lad_info_from_report(cur_file, 'RIGHT CORONARY')
        
        new_row = pd.Series(['-','-','-','-','-','-','-'], index=extracted_rca_df.columns)
        new_row['PATIENT_ID'] = cur_patient_info[0]
        new_row['REPORT_ID'] = cur_report_name.split('.')[0].split('RT ')[1]

        cur_patient_info.pop(0)
        list_of_lcx_branches = [x + ' ' for x in list(extracted_rca_df.columns)]
        for line_info in cur_patient_info:
            artery_area_name = [x for x in list_of_lcx_branches 
                                if x in line_info or x.lower() in line_info or x.title() in line_info]
            if len(artery_area_name) >=1:
                for area_name in artery_area_name:
                    area_name = area_name.strip()
                    stenosis_score = calculate_stenosis_rca(line_info, cur_report_name)
                    new_row.loc[area_name] = stenosis_score
            else:
                continue
        extracted_rca_df = extracted_rca_df.append(new_row, ignore_index=True)
    except:
        print("Error, wrong file (not excel)!")
extracted_rca_df.head()

Unnamed: 0,PATIENT_ID,REPORT_ID,PROXIMAL,DISTAL,MID,PDA,PLV
0,CTCAHER21101967,CTCAHER21101967,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL
1,CTCAGRH27071943,CTCAGRH27071943,-,-,-,-,-
2,CTCADRG22021959,CTCADRG22021959,NORMAL,<25%,<25%,<25%,NORMAL
3,CTCAXUZ07071955,CTCAXUZ07071955,NORMAL,NORMAL,NORMAL,NORMAL,NORMAL
4,CTCAQUD02121959,CTCAQUD02121959,NORMAL,NORMAL,70%,NORMAL,-


In [68]:
# extracted_rca_df.to_csv('rca_labels_titlesreports.csv', index=False)

In [15]:
extracted_rca_df.to_excel('rca_labelsreports.xlsx', index=False)

(744, 7)