In [1]:
import pandas as pd
import pydicom as dicom
from PIL import Image
import numpy as np
import os
import cv2
import shutil
from tqdm import tqdm
import matplotlib.pyplot as plt
import re

In [2]:
### USEFUL FUNCTIONS

def get_maximum_stenosis_score_category(section_stenosis_string):
    """
    Return the section label that carries the highest stenosis score.

    The input is a '_'-joined string of per-section labels. Each label is
    scored by the largest integer it contains; labels without digits
    (e.g. NORMAL or -) score 0. When several labels share the highest
    score, the first one is returned.

    Args:
        - section_stenosis_string(str): NORMAL_<25%_NORMAL

    Returns:
        - max_stenosis_category(str): <25%
    """
    categories = section_stenosis_string.split('_')
    # Largest number mentioned in each label; `default=0` covers digit-free labels.
    scores = [
        max((int(number) for number in re.findall(r'\d+', label)), default=0)
        for label in categories
    ]
    # list.index returns the first match, so ties go to the earliest section.
    max_stenosis_category = categories[scores.index(max(scores))]

    return max_stenosis_category

def split_string(section_stenosis_string):
    """Break a '_'-joined stenosis string into the list of its per-section labels."""
    section_labels = section_stenosis_string.split(sep='_')
    return section_labels

def remove_thrash_categories(df):
    """
    Fold rarely used stenosis labels into their broader canonical category.

    Every cell whose value equals one of the rare spellings is overwritten
    with the canonical label. The frame is modified in place and the same
    object is returned for convenience.
    """
    # canonical label -> rare spellings that are folded into it
    canonical_to_rare = {
        '<25%': ['<35%'],  # '25%' is intentionally not merged here
        '25-50%': ['-25-50%', '<25-50%'],
        '50%': ['*50%'],  # '>50%' is intentionally not merged here
        '50-70%': ['50-75%'],
        '70-90%': ['>70%', '>75%'],
        '90-100%': ['>90%', '90%'],
    }
    for canonical_label, rare_labels in canonical_to_rare.items():
        df.replace(to_replace=rare_labels, value=canonical_label, inplace=True)
    return df

### Read reports

In [5]:
# Load the three per-artery report tables (LAD, RCA, LCX) from the working directory.
lad, rca, lcx = (
    pd.read_excel(report_path)
    for report_path in ('lad_reports.xlsx', 'rca_reports.xlsx', 'lcx_reports.xlsx')
)



In [6]:
# The number of distinct PATIENT_ID values shared between LAD and each other
# table must equal the number of LAD rows.
assert all(
    len(np.intersect1d(lad['PATIENT_ID'].values, other['PATIENT_ID'].values)) == lad.shape[0]
    for other in (rca, lcx)
)

In [7]:
tuple(frame.shape for frame in (lad, rca, lcx))

((744, 8), (744, 7), (744, 10))

In [8]:
tuple(frame.columns for frame in (lad, rca, lcx))

(Index(['PATIENT_ID', 'PROXIMAL', 'MID', 'DISTAL', 'D-1', 'D-2', 'D-3', 'D-4'], dtype='object'),
 Index(['PATIENT_ID', 'REPORT_ID', 'PROXIMAL', 'DISTAL', 'MID', 'PDA', 'PLV'], dtype='object'),
 Index(['PATIENT_ID', 'REPORT_ID', 'PROXIMAL', 'DISTAL', 'OM', 'OM1', 'OM2',
        'OM3', 'PDA', 'PLV'],
       dtype='object'))

### Merge all sections to one table

In [9]:
# Merge proximal mid distal: collapse the per-segment scores of each artery
# into one '_'-joined string. The raw report tables are left untouched (derived
# frames are used instead), so this cell can be re-run and the PROXIMAL / MID /
# DISTAL columns stay available on lad / rca / lcx.
lad_branch = lad.drop(columns=['PROXIMAL', 'MID', 'DISTAL']).assign(
    LAD=lad['PROXIMAL'] + '_' + lad['MID'] + '_' + lad['DISTAL']
)
rca_branch = rca.drop(columns=['PROXIMAL', 'MID', 'DISTAL']).assign(
    RCA=rca['PROXIMAL'] + '_' + rca['MID'] + '_' + rca['DISTAL']
)
# Only PROXIMAL and DISTAL are combined for LCX.
lcx_branch = lcx.drop(columns=['PROXIMAL', 'DISTAL']).assign(
    LCX=lcx['PROXIMAL'] + '_' + lcx['DISTAL']
)

# Final column layout spelled out by name instead of positional indices, so a
# change in the input column order fails loudly rather than silently shuffling.
MERGED_COLUMNS = [
    'REPORT_ID', 'PATIENT_ID',
    'LAD', 'D-1', 'D-2', 'D-3',
    'RCA', 'PDA_RCA', 'PLV_RCA',
    'LCX', 'OM', 'OM1', 'OM2', 'OM3', 'PDA_LCX', 'PLV_LCX',
]

# Merge tables, removing redundant columns
lad_rca = pd.merge(lad_branch, rca_branch, on='PATIENT_ID')
lad_rca_lcx = (
    # REPORT_ID, PDA and PLV exist in both inputs and receive the suffixes.
    pd.merge(lad_rca, lcx_branch, on='PATIENT_ID', suffixes=['_RCA', '_LCX'])  # FULL DATAFRAME FOR ALL BRANCHES!
    .drop(columns=['D-4', 'REPORT_ID_LCX'])
    .rename(columns={'REPORT_ID_RCA': 'REPORT_ID'})
    .loc[:, MERGED_COLUMNS]
    .copy()  # independent frame: later column assignments do not touch a slice
)

lad_rca_lcx.columns

Index(['REPORT_ID', 'PATIENT_ID', 'LAD', 'D-1', 'D-2', 'D-3', 'RCA', 'PDA_RCA',
       'PLV_RCA', 'LCX', 'OM', 'OM1', 'OM2', 'OM3', 'PDA_LCX', 'PLV_LCX'],
      dtype='object')

In [10]:
# Convert proximalMidDistal string to a list of per-segment stenosis categories
for branch in ['LAD', 'RCA', 'LCX']:
    # Only split values that are still strings, so re-running the cell is harmless.
    segments = lad_rca_lcx[branch].apply(
        lambda value: split_string(value) if isinstance(value, str) else value
    )
    # DataFrame.replace compares whole cell values and never looks inside a
    # list, so the rare labels are merged on a one-label-per-row view and the
    # labels are then regrouped into one list per original row.
    per_segment = remove_thrash_categories(segments.explode().to_frame())[branch]
    lad_rca_lcx[branch] = per_segment.groupby(level=0).agg(list)

# Merge rare labels in the remaining (scalar) columns and preview the result.
remove_thrash_categories(lad_rca_lcx).head()

Unnamed: 0,REPORT_ID,PATIENT_ID,LAD,D-1,D-2,D-3,RCA,PDA_RCA,PLV_RCA,LCX,OM,OM1,OM2,OM3,PDA_LCX,PLV_LCX
0,CTCAHER21101967,CTCAHER21101967,"[<25%, <25%, NORMAL]",NORMAL,-,-,"[NORMAL, NORMAL, NORMAL]",NORMAL,NORMAL,"[NORMAL, -]",-,NORMAL,-,-,-,-
1,CTCAGRH27071943,CTCAGRH27071943,"[NORMAL, NORMAL, NORMAL]",NORMAL,-,-,"[-, -, -]",-,-,"[NORMAL, NORMAL]",-,NORMAL,NORMAL,-,NORMAL,-
2,CTCADRG22021959,CTCADRG22021959,"[<25%, <25%, NORMAL]",<25%,<25%,NORMAL,"[NORMAL, <25%, <25%]",<25%,NORMAL,"[NORMAL, NORMAL]",-,NORMAL,-,-,-,-
3,CTCAXUZ07071955,CTCAXUZ07071955,"[NORMAL, NORMAL, NORMAL]",NORMAL,NORMAL,NORMAL,"[NORMAL, NORMAL, NORMAL]",NORMAL,NORMAL,"[NORMAL, NORMAL]",-,NORMAL,NORMAL,-,-,-
4,CTCAQUD02121959,CTCAQUD02121959,"[70%, 25-50%, NORMAL]",NORMAL,NORMAL,-,"[NORMAL, 70%, NORMAL]",NORMAL,-,"[NORMAL, NORMAL]",-,NORMAL,NORMAL,NORMAL,-,-


In [11]:
# Give the OM sub-branches the same dashed naming as D-1 / D-2 / D-3.
om_column_names = {"OM1": "OM-1", "OM2": "OM-2", "OM3": "OM-3"}
lad_rca_lcx = lad_rca_lcx.rename(columns=om_column_names)

In [12]:
# Preview the first rows of the merged table.
lad_rca_lcx.head()

Unnamed: 0,REPORT_ID,PATIENT_ID,LAD,D-1,D-2,D-3,RCA,PDA_RCA,PLV_RCA,LCX,OM,OM-1,OM-2,OM-3,PDA_LCX,PLV_LCX
0,CTCAHER21101967,CTCAHER21101967,"[<25%, <25%, NORMAL]",NORMAL,-,-,"[NORMAL, NORMAL, NORMAL]",NORMAL,NORMAL,"[NORMAL, -]",-,NORMAL,-,-,-,-
1,CTCAGRH27071943,CTCAGRH27071943,"[NORMAL, NORMAL, NORMAL]",NORMAL,-,-,"[-, -, -]",-,-,"[NORMAL, NORMAL]",-,NORMAL,NORMAL,-,NORMAL,-
2,CTCADRG22021959,CTCADRG22021959,"[<25%, <25%, NORMAL]",<25%,<25%,NORMAL,"[NORMAL, <25%, <25%]",<25%,NORMAL,"[NORMAL, NORMAL]",-,NORMAL,-,-,-,-
3,CTCAXUZ07071955,CTCAXUZ07071955,"[NORMAL, NORMAL, NORMAL]",NORMAL,NORMAL,NORMAL,"[NORMAL, NORMAL, NORMAL]",NORMAL,NORMAL,"[NORMAL, NORMAL]",-,NORMAL,NORMAL,-,-,-
4,CTCAQUD02121959,CTCAQUD02121959,"[70%, 25-50%, NORMAL]",NORMAL,NORMAL,-,"[NORMAL, 70%, NORMAL]",NORMAL,-,"[NORMAL, NORMAL]",-,NORMAL,NORMAL,NORMAL,-,-


In [13]:
# Write the merged table to an Excel workbook (relative path, so it lands in
# the current working directory).
lad_rca_lcx.to_excel('lad_rca_lcx.xlsx')

In [None]:
# Count how often every stenosis label occurs across all branch columns.
stenosis_statistics = {}
for branch in lad_rca_lcx:
    if branch in ['PATIENT_ID', 'REPORT_ID']:
        continue
    # LAD / RCA / LCX hold lists of per-segment labels, which are unhashable
    # and cannot be counted directly; explode() yields one label per row and
    # leaves the scalar columns unchanged.
    cur_dict = lad_rca_lcx[branch].explode().value_counts().to_dict()
    for key, count in cur_dict.items():
        stenosis_statistics[key] = stenosis_statistics.get(key, 0) + count

# '-' is dropped from the statistics. This happens once, after the loop, and
# tolerates the key being absent.
stenosis_statistics.pop('-', None)



In [None]:
# Remove the NORMAL count from the statistics and keep it in normal_num.
# Running this cell twice raises KeyError because the key is gone after the first run.
normal_num = stenosis_statistics.pop('NORMAL')


In [None]:
# Bar chart of the remaining label counts.
pd.Series(stenosis_statistics).plot.bar()

In [None]:
# Root folder that contains the train / val / test splits. It is defined once
# so the location can be changed in a single place.
SPLITS_ROOT = '/home/petryshak/CoronaryArteryPlaqueIdentification/data/binary_classification_all_branches'

# Entries of the imgs/ folder of every split, in train, val, test order.
a, b, c = (
    os.listdir(os.path.join(SPLITS_ROOT, split, 'imgs'))
    for split in ('train', 'val', 'test')
)
f = a + b + c

In [None]:
# Folder entries that have no matching PATIENT_ID in the merged report table.
# The ids are collected once into a set instead of rebuilding a list for every entry.
known_patient_ids = set(lad_rca_lcx['PATIENT_ID'])
[x for x in f if x not in known_patient_ids]

In [None]:
# Two hand-pasted lists of folder names; the following cell lists the entries
# of `fif` that do not appear in `fof`.
# NOTE(review): these look like copied outputs of the "folders without a
# report row" check from two different runs - confirm provenance.
fif = ['349 CTCANGM17081945',
 'CTCASCK30061977',
 'CTCAGOE28101952',
 'CTCASIN11111959',
 'CTCAPUP1208191941',
 'CTCADOP23111947',
 'CTCARUN25081958',
 'CTCACOV10031966',
 'CTCAONM04111950',
 'CTCAGUN18021975',
 '371 CTCATRH10061944',
 'CTCASCG29061959',
 '1000 CTCANEJ23051970',
 '353 CTCANGM17081945',
 'CTCASTR19121963',
 '1057 CTCA1972',
 '348 CTCAGRH27071943',
 '351 CTCAGRH27071943',
 'CTCAROS28091767',
 'CTCASTV03081936',
 'CTCATAA17041965',
 'CTCAHAG13081961',
 '350 CTCATRH10061944',
 'CTCAESL01121969',
 'CTCAFEY20011971']

# Second list, compared against `fif` in the following cell.
fof = ['349 CTCANGM17081945',
 'CTCASAA13011957',
 'CTCABAM18061976',
 'CTCASIN11111959',
 'CTCAPUP1208191941',
 'CTCADOP23111947',
 'CTCAKOG14041971',
 'CTCACOV10031966',
 'CTCACHY14101949',
 'CTCALAT23091942',
 '1003 CTCA1952',
 '371 CTCATRH10061944',
 '1000 CTCANEJ23051970',
 '353 CTCANGM17081945',
 '348 CTCAGRH27071943',
 '351 CTCAGRH27071943',
 'CTCATRM02031956',
 'CTCAROS28091767',
 'CTCASTV03081936',
 'CTCATAA17041965',
 '350 CTCATRH10061944',
 'CTCAESL01121969',
 'CTCAIKE28031986',
 'CTCAFEY20011971',
 '1006 CTCA1961']

In [None]:
# Names present in `fif` but missing from `fof`, in `fif` order.
list(filter(lambda name: name not in fof, fif))

### Questions

1. Is 25% the same as <25%, or does 25% belong to the 25-50% stenosis score?
2. Is 50% the same as 50-70%, or does 50% belong to the 25-50% stenosis score?
3. Is >50% the same as the 50-70% stenosis score?
4. Is 70% the same as 50-70%, or is it the same as >70%, i.e. 70-90%?

# OLD VERSION BELOW

In [None]:
# Legacy experiment kept for reference (see the "OLD VERSION BELOW" heading).
# Because of the unconditional `break`, only the first branch ('LAD') is ever
# processed: its values are stripped of '>', ',', '%' and '<', and the digit
# groups found in each value are printed.
# NOTE(review): re.sub expects strings, but the LAD column is converted to
# lists earlier in the notebook; 'OM1'..'OM3' are also renamed to 'OM-1'..'OM-3'
# earlier. Confirm whether this cell is still meant to run.
for branch in ['LAD', 'RCA', 'LCX', 'D-1', 'D-2', 'OM', 'OM1', 'OM2', 'OM3']:
    s = lad_rca_lcx[branch].values
    # Label -> rank table; only referenced by the commented-out code below.
    d = {'0':1, '25':2, '25-50':3, '50':4,'50-70':5, '70':6}
    # Replace the symbols > , % < with spaces (the ''.join over a string is a no-op).
    temp = [ ''.join(re.sub('[>,%<]', ' ', el)) for el in s]
    # Keep only the runs of digits of every value.
    temp = [re.findall(r'\d+',el) for el in temp]
    print(temp)
    break
#     if branch in ['LAD', 'RCA', 'LCX']:
#         temp = [' '.join(re.sub('[>,%<]', ' ', el).split('_')).replace('NORMAL', '0').split() for el in s]
#         temp = [list(d.keys())[list(d.values()).index(el)] for el in temp]
#         print(temp)
#         temp = [max([d[i] for i in t ]) for t in temp]
#     else:
#         temp = ' '.join(re.sub('[>,%<]', ' ', el))
#         s = [list(d.keys())[list(d.values()).index(el)] for el in temp]
#     print(s)
#     lad_rca_lcx[branch] = s

### Merge all tables

In [None]:
# LAD + RCA joined per patient, then LCX attached through the report id.
result = (
    pd.merge(lad, rca, on='PATIENT_ID', suffixes=['_LAD', '_RCA'])
    .drop(columns=['PDA', 'PLV', 'D-4', 'D-3'])
    .merge(lcx, on='REPORT_ID')
)

# Displayed only: the frame without these columns is not stored back in `result`.
result.drop(columns=['PDA', 'PLV', 'OM'])

In [None]:
# Preview the first rows of the merged `result` table.
result.head()

In [None]:
# Preview the first rows of the LAD table.
lad.head()

In [None]:
# Preview the first rows of the RCA table.
rca.head()

In [None]:
# Preview the first rows of the LCX table.
lcx.head()

# 1. Stage statistics

### RCA

In [None]:
# Distribution of stenosis labels in the proximal RCA segment.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
rca['PROXIMAL'].value_counts().plot(kind='bar')

In [None]:
# Distribution of stenosis labels in the mid RCA segment.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
rca['MID'].value_counts().plot(kind='bar')

In [None]:
# Distribution of stenosis labels in the distal RCA segment.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
rca['DISTAL'].value_counts().plot(kind='bar')

### LCX

In [None]:
# Distribution of stenosis labels in the LCX OM column.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
lcx['OM'].value_counts().plot(kind='bar')

### LAD

In [None]:
# Distribution of stenosis labels in the LAD D-1 column.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
lad['D-1'].value_counts().plot(kind='bar')


# 2. Patient statistics

# 3. Balance positive VS negative

# 4. Statistics per branches