In [1]:
import pandas as pd
import numpy as np
import wfdb
import ast
import os
import sys
import shutil
#from example_physionet import load_raw_data, aggregate_diagnostic

In [2]:
def load_raw_data(df, sampling_rate, path):
    """Read the raw signal of every record listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata table exposing ``filename_lr`` and ``filename_hr`` columns
        that hold record paths relative to ``path``.
    sampling_rate : int
        ``100`` reads the records named in ``filename_lr``; any other value
        reads the records named in ``filename_hr``.
    path : str or os.PathLike
        Dataset root directory. A trailing separator is optional.

    Returns
    -------
    numpy.ndarray
        Array built from the first element (the signal) of each
        ``wfdb.rdsamp`` result, in the row order of ``df``.
    """
    filenames = df.filename_lr if sampling_rate == 100 else df.filename_hr
    # os.path.join instead of ``path + f``: plain concatenation silently
    # produces a wrong path when the root lacks a trailing separator.
    records = [wfdb.rdsamp(os.path.join(path, f)) for f in filenames]
    return np.array([signal for signal, _meta in records])

def aggregate_diagnostic(y_dic):
    """Map the statement codes of one record to its distinct diagnostic classes.

    Parameters
    ----------
    y_dic : dict
        Mapping whose keys are statement codes; the values are not used.

    Returns
    -------
    list
        The ``diagnostic_class`` of every key found in ``agg_df.index``,
        without duplicates, in order of first appearance in ``y_dic``.

    Notes
    -----
    ``agg_df`` is looked up in the global namespace at call time, so it must
    be defined before this function is called.
    """
    classes = [
        agg_df.loc[key].diagnostic_class
        for key in y_dic
        if key in agg_df.index
    ]
    # dict.fromkeys de-duplicates while keeping insertion order; list(set(...))
    # yields an order that can differ from one interpreter run to the next.
    return list(dict.fromkeys(classes))

In [3]:
# Dataset root, two directory levels above the current working directory.
path = os.path.join(
    os.pardir,
    os.pardir,
    'ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/',
)
# 100 makes load_raw_data read the records listed in filename_lr.
sampling_rate = 100

In [4]:
# List the contents of the dataset directory (IPython shell escape; needs an `ls` command).
!ls {path}

example_physionet.py	  ptbxl_v103_changelog.txt  scp_statements.csv
LICENSE.txt		  RECORDS		    SHA256SUMS.txt
ptbxl_database.csv	  records100
ptbxl_v102_changelog.txt  records500


In [5]:
# Load the per-record metadata table, indexed by ECG id.
Y = pd.read_csv(path + 'ptbxl_database.csv', index_col='ecg_id')
# scp_codes arrives as the text of a Python literal; parse each cell into an object.
Y['scp_codes'] = Y['scp_codes'].apply(ast.literal_eval)
# Non-null count per column.
Y.count()

patient_id                      21799
age                             21799
sex                             21799
height                           6974
weight                           9421
nurse                           20326
site                            21782
device                          21799
recording_date                  21799
report                          21799
scp_codes                       21799
heart_axis                      13331
infarction_stadium1              5612
infarction_stadium2               103
validated_by                    12421
second_opinion                  21799
initial_autogenerated_report    21799
validated_by_human              21799
baseline_drift                   1598
static_noise                     3260
burst_noise                       613
electrodes_problems                30
extra_beats                      1949
pacemaker                         291
strat_fold                      21799
filename_lr                     21799
filename_hr 

In [6]:
# Preview the first rows of the metadata table.
Y.head()

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,sinusrhythmus periphere niederspannung,...,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,sinusbradykardie sonst normales ekg,...,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,sinusrhythmus normales ekg,...,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,sinusrhythmus normales ekg,...,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,sinusrhythmus normales ekg,...,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [7]:
# Record path stored in filename_lr for the row whose index label (ecg_id) is 1.
Y.loc[1, 'filename_lr']

'records100/00000/00001_lr'

In [8]:
# Peek at the parsed statement-code dictionaries of the first rows.
Y['scp_codes'].head()

ecg_id
1    {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
2                {'NORM': 80.0, 'SBRAD': 0.0}
3                  {'NORM': 100.0, 'SR': 0.0}
4                  {'NORM': 100.0, 'SR': 0.0}
5                  {'NORM': 100.0, 'SR': 0.0}
Name: scp_codes, dtype: object

In [9]:
# Column labels available in the metadata table.
Y.columns

Index(['patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site',
       'device', 'recording_date', 'report', 'scp_codes', 'heart_axis',
       'infarction_stadium1', 'infarction_stadium2', 'validated_by',
       'second_opinion', 'initial_autogenerated_report', 'validated_by_human',
       'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems',
       'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr'],
      dtype='object')

In [10]:
# Distinct values present in the `nurse` column (missing values show up as nan).
Y['nurse'].unique()

array([ 2.,  0., nan,  1.,  7.,  8., 10.,  5.,  9., 11.,  4.,  3.,  6.])

In [11]:
# Distinct fold identifiers present in the `strat_fold` column.
Y['strat_fold'].unique()

array([ 3,  2,  5,  4,  7,  9, 10,  8,  6,  1])

## Sumarização dos dados

## Test

In [87]:
# Test partition: every record assigned to fold 10.
Y_TEST = Y.loc[Y['strat_fold'] == 10]
# Sizes are measured as non-null patient_id counts.
n_test = Y_TEST['patient_id'].count()
n_total = Y['patient_id'].count()
print(f"Qtd de dados de teste: {n_test}, {n_test/n_total*100}%")

Qtd de dados de teste: 2198, 10.083031331712464%


## Devel

In [88]:
# Development (validation) partition: every record assigned to fold 9.
Y_DEVEL = Y.loc[Y['strat_fold'] == 9]
# Sizes are measured as non-null patient_id counts.
n_devel = Y_DEVEL['patient_id'].count()
n_total = Y['patient_id'].count()
print(f"Qtd de dados de avaliação: {n_devel}, {n_devel/n_total*100}%")

Qtd de dados de avaliação: 2183, 10.014220835818156%


## Train

In [89]:
# Training partition: everything that is in neither fold 9 nor fold 10.
held_out = (Y['strat_fold'] == 9) | (Y['strat_fold'] == 10)
Y_TRAIN = Y.loc[~held_out]
# Sizes are measured as non-null patient_id counts.
n_train = Y_TRAIN['patient_id'].count()
n_total = Y['patient_id'].count()
print(f"Qtd de dados de treino: {n_train}, {n_train/n_total*100}%")

Qtd de dados de treino: 17418, 79.90274783246937%


## Load data

In [17]:
# Read the signal of every record listed in Y at the configured sampling rate.
X = load_raw_data(df=Y, sampling_rate=sampling_rate, path=path)

In [18]:
# Check the container type returned by load_raw_data.
type(X)

numpy.ndarray

In [19]:
# Statement lookup table, restricted to the rows flagged diagnostic == 1.
# aggregate_diagnostic reads this global.
agg_df = (
    pd.read_csv(path + 'scp_statements.csv', index_col=0)
    .loc[lambda statements: statements['diagnostic'] == 1]
)
agg_df.head()

Unnamed: 0,description,diagnostic,form,rhythm,diagnostic_class,diagnostic_subclass,Statement Category,SCP-ECG Statement Description,AHA code,aECG REFID,CDISC Code,DICOM Code
NDT,non-diagnostic T abnormalities,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,non-diagnostic T abnormalities,,,,
NST_,non-specific ST changes,1.0,1.0,,STTC,NST_,Basic roots for coding ST-T changes and abnorm...,non-specific ST changes,145.0,MDC_ECG_RHY_STHILOST,,
DIG,digitalis-effect,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,suggests digitalis-effect,205.0,,,
LNGQT,long QT-interval,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,long QT-interval,148.0,,,
NORM,normal ECG,1.0,,,NORM,NORM,Normal/abnormal,normal ECG,1.0,,,F-000B7


In [19]:
# Attach to each record the list of diagnostic classes derived from its statement codes.
# NOTE(review): frames sliced from Y before this cell runs will not carry this column.
Y['diagnostic_superclass'] = Y['scp_codes'].apply(aggregate_diagnostic)

In [37]:
# Record name (last path component) for ecg_id 9.
# The stored paths use '/' regardless of platform (e.g. 'records100/00000/00009_lr'),
# so split on that literal: os.sep is '\\' on Windows, which would leave the string
# unsplit and make [2] raise IndexError.
Y_TEST.filename_lr[9].split('/')[2]

'00009_lr'

In [32]:
# Display the test-fold subset (pandas abbreviates the rendering of long frames).
Y_TEST

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,18792.0,55.0,0,,70.0,2.0,0.0,CS-12 E,1984-12-08 09:44:43,sinusrhythmus normales ekg,...,True,,", I-AVR,",,,,,10,records100/00000/00009_lr,records500/00000/00009_hr
38,17076.0,40.0,0,,72.0,2.0,0.0,CS-12 E,1985-02-15 11:48:22,sinusrhythmus schwierig bestimmbare qrs-achse,...,True,,", alles,",V5,,,,10,records100/00000/00038_lr,records500/00000/00038_hr
40,19501.0,60.0,0,,85.0,2.0,0.0,CS-12 E,1985-02-20 11:43:45,sinusrhythmus linkstyp sonst normales ekg,...,True,,,,,,,10,records100/00000/00040_lr,records500/00000/00040_hr
57,16063.0,26.0,0,,93.0,2.0,0.0,CS-12 E,1985-06-06 11:32:43,sinusrhythmus normales ekg,...,True,,,,,,,10,records100/00000/00057_lr,records500/00000/00057_hr
59,19475.0,54.0,0,,67.0,2.0,0.0,CS-12 E,1985-06-12 06:36:01,sinusrhythmus normales ekg,...,True,", V1",", V1,",,,,,10,records100/00000/00059_lr,records500/00000/00059_hr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21809,12931.0,69.0,1,,,1.0,2.0,AT-60 3,2001-02-18 12:36:54,sinusrhythmus linkstyp qrs(t) abnorm inferi...,...,True,,,,,,,10,records100/21000/21809_lr,records500/21000/21809_hr
21812,20789.0,67.0,0,,,1.0,2.0,AT-60 3,2001-02-21 13:34:15,supraventrikulÄre arrhythmie a-v block i p-ver...,...,True,,,,,,,10,records100/21000/21812_lr,records500/21000/21812_hr
21818,19204.0,84.0,1,,,1.0,2.0,AT-60 3,2001-03-03 12:09:05,sinusrhythmus linkstyp mÄssige amplitudenkrite...,...,True,,,,,,,10,records100/21000/21818_lr,records500/21000/21818_hr
21819,9843.0,54.0,0,,,1.0,2.0,AT-60 3,2001-03-03 12:12:58,sinusrhythmus p-sinistrocardiale ueberdrehter ...,...,True,,,,,,,10,records100/21000/21819_lr,records500/21000/21819_hr


## Splits

In [22]:
# Fold ids used by the split cells below.
# The validation fold must differ from the test fold: with both set to 10 the mask
# `(strat_fold != test_fold) & (strat_fold != val_fold)` only removes fold 10, so
# fold 9 (the "Devel" partition of this notebook) would end up in the training data.
test_fold = 10
val_fold = 9

In [None]:
# Train: rows that belong to neither the test fold nor the validation fold.
train_mask = ~((Y['strat_fold'] == test_fold) | (Y['strat_fold'] == val_fold))
X_train = X[np.where(train_mask)]
y_train = Y.loc[train_mask, 'diagnostic_superclass']

In [None]:
# Test: rows that belong to the test fold.
test_mask = Y['strat_fold'] == test_fold
X_test = X[np.where(test_mask)]
y_test = Y.loc[test_mask, 'diagnostic_superclass']