# 1. Data Preparation

After downloading the selected database of ECG signals, a preprocessing stage was put in place to obtain an adequate organization of the data, extracting general information of each subject and specific characteristics of each ECG waveform to create a DataFrame with several features for further processing. 

In [1]:
# Importing packages

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import scipy.signal
import glob
import os
import seaborn as sns
%matplotlib inline

## 1.1 Signal pre-processing

Since there are relatively few subjects, more data was obtained by dividing each signal into portions and assigning the corresponding subject information to every portion, creating more rows with unique values. 

In [2]:
# Getting the ECG signals from original files

# Build the path from individual components instead of a backslash-separated
# literal: the literal form only resolves on Windows and contains the invalid
# escape sequence "\e", which newer Python versions warn about.
general_directory = os.path.join(
    os.getcwd(), 'Data', 'ecg-id-database-1.0.0', 'ecg-id-database-1.0.0'
)

# os.listdir returns names in arbitrary order; sorting keeps the order in which
# subjects are processed (and therefore the row order) reproducible.
entries = sorted(os.listdir(general_directory))

In [3]:
# Extracting relevant information and creating basic ECG based features
#
# For every "Person_*" folder, each .hea header is read for the demographic
# data and the matching .dat file for the signal. The signal is split at the
# detected peaks and every peak-to-peak portion becomes one row (one entry in
# each of the lists below).

fs = 500 # Sampling frequency indicated by the publisher of the database
subject, age, gender, RR, ecg_mean, ecg_std, ecg_var, ecg_median, ecg_samples = [], [], [], [], [], [], [], [], []

for folder in entries:
    if "Person_" not in folder:
        continue

    subject_info = int(folder.replace("Person_", "", 1))

    info_targetPattern = os.path.join(general_directory, folder, '*.hea')
    # glob gives no ordering guarantee; sort so recordings are processed reproducibly
    info_files = sorted(glob.glob(info_targetPattern))

    for ecg_info in info_files:
        # Read the header and release the file handle before the signal processing starts
        with open(ecg_info) as f:
            lines = [line.rstrip() for line in f]

        # Extracting demographic information
        # First purely numeric token of the fifth header line is taken as the age
        age_info = [int(s) for s in lines[4].split() if s.isdigit()][0]
        # Substring test instead of `find(...) > 0`, which missed a match at index 0
        gender_info = 'female' if 'female' in lines[5] else 'male'

        ecg_signal_file = ecg_info.replace(".hea", ".dat")
        # NOTE(review): samples are read as 32-bit integers; the .hea header declares the
        # storage format and channel count of the .dat file - confirm they match this dtype.
        ecg_signal = np.fromfile(ecg_signal_file, dtype='int32')
        ecg_signal = ecg_signal / ecg_signal.max()  # scale so the largest sample equals 1

        # RR intervals calculation
        # distance=500 samples is 1 s at fs=500, so peaks closer than 1 s are never reported
        pks_RR = sp.signal.find_peaks(ecg_signal, height=np.mean(np.abs(ecg_signal)*5), distance=500)[0]

        RR_samples = np.diff(pks_RR)
        # Sample period is 1/fs; this replaces building a full time axis only to read
        # its second element (which failed on signals shorter than two samples)
        RR_time = RR_samples * (1/fs)

        # Filling row gaps with blank spaces
        # Fewer than two peaks means no interval: keep the subject with blank feature values
        if len(pks_RR) <= 1:
            subject.append(subject_info)
            age.append(age_info)
            gender.append(gender_info)

            ecg_mean.append('')
            ecg_std.append('')
            ecg_var.append('')
            ecg_median.append('')
            RR.append('')
            ecg_samples.append('')

        # Assigning values to the corresponding row
        for index in range(len(pks_RR) - 1):
            # Portion of the signal between two consecutive detected peaks
            segment = ecg_signal[pks_RR[index]:pks_RR[index + 1]]

            subject.append(subject_info)
            age.append(age_info)
            gender.append(gender_info)

            ecg_mean.append(np.mean(segment))
            ecg_std.append(np.std(segment))
            ecg_var.append(np.var(segment))
            ecg_median.append(np.median(segment))
            RR.append(RR_time[index])

            ecg_samples.append(segment) # Selecting a unique portion of the raw ECG signal

## 1.2 DataFrame declaration

In [4]:
# DataFrame declaration

# One column per collected list; every list holds one entry per row
data = {
    "Subject_ID": subject,
    "Age": np.array(age),
    "Gender": gender,
    "RR": RR,
    "ECG_mean": ecg_mean,
    "ECG_std": ecg_std,
    "ECG_var": ecg_var,
    "ECG_median": ecg_median,
}
df_ecg = pd.DataFrame(data)
df_ecg

Unnamed: 0,Subject_ID,Age,Gender,RR,ECG_mean,ECG_std,ECG_var,ECG_median
0,1,25,male,1.564,-0.00570945,0.14325,0.0205207,-0.0440251
1,1,25,male,1.864,0.00910373,0.148422,0.022029,-0.037736
2,1,25,male,1.852,-0.00195572,0.15066,0.0226984,-0.0440252
3,1,25,male,1.756,-0.00592427,0.156182,0.0243929,-0.050315
4,1,25,male,2.496,0.0216589,0.150437,0.0226313,-0.0314467
...,...,...,...,...,...,...,...,...
2854,89,40,female,2.79,0.00345874,0.12851,0.0165149,-0.0397724
2855,89,40,female,1.712,0.00398307,0.133123,0.0177218,-0.0397721
2856,89,40,female,1.662,-0.000443603,0.130906,0.0171364,-0.0454548
2857,90,21,female,,,,,


In [5]:
# DataFrame description

# shape is a (rows, columns) tuple, so both counts can be unpacked at once
num_rows, num_cols = df_ecg.shape

print(num_rows, num_cols)
df_ecg.describe()

2859 8


Unnamed: 0,Subject_ID,Age
count,2859.0,2859.0
mean,38.874432,28.857643
std,26.443411,11.864158
min,1.0,13.0
25%,13.0,21.0
50%,39.0,23.0
75%,61.0,34.0
max,90.0,75.0


### Categorical columns

According to column type, there are 6 categorical columns within the DataFrame. However, 5 of those columns should be numerical. To proceed, it has been identified that these columns present blank spaces in case of missing values. Thus, blank spaces are replaced with NaN, resulting in numerical columns. 

In [6]:
# Checking categorical columns of the DataFrame

print(df_ecg.dtypes)

# Columns stored with the generic "object" dtype are the ones treated as categorical
cat_df = df_ecg.select_dtypes(include=["object"])

# A bare expression in the middle of a cell is never displayed, so the count
# of categorical columns is printed explicitly.
print("Categorical columns:", cat_df.shape[1])

# Missing values per categorical column. Blank strings are not counted as
# null here, so columns padded with '' still report 0.
cat_df.isnull().sum().sort_values()

Subject_ID     int64
Age            int32
Gender        object
RR            object
ECG_mean      object
ECG_std       object
ECG_var       object
ECG_median    object
dtype: object


Gender        0
RR            0
ECG_mean      0
ECG_std       0
ECG_var       0
ECG_median    0
dtype: int64

In [7]:
# Replacing blank spaces with NaN, resulting in numerical columns instead of categorical type

# The conversion is done explicitly with pd.to_numeric rather than relying on
# `replace` to silently change an object column into a float column, which is
# deprecated behavior in recent pandas versions. errors="coerce" turns the
# blank-string placeholders into NaN. Re-running the cell is harmless because
# already numeric columns are left unchanged.
numeric_feature_cols = ["RR", "ECG_mean", "ECG_std", "ECG_var", "ECG_median"]
for column in numeric_feature_cols:
    df_ecg[column] = pd.to_numeric(df_ecg[column], errors="coerce")

df_ecg.dtypes

Subject_ID      int64
Age             int32
Gender         object
RR            float64
ECG_mean      float64
ECG_std       float64
ECG_var       float64
ECG_median    float64
dtype: object

#### Are the selected features suitable for classifying correctly the subjects based on ECG and basic demographic data?
From the description of the DataFrame, the maximum RR interval value is too high, pointing to potential issues with the selected calculation method. These outliers could introduce some bias and, thus, impact all the other features, since those are computed over the signal portions delimited by the same detected peaks. There are potential solutions to correct the implementation, such as a more robust algorithm to calculate the RR interval, for example the Pan-Tompkins algorithm. However, it could be interesting to assess the impact of these empirical calculations on the trained model proposed for classification.

In [8]:
# Summary statistics of the DataFrame's numeric columns (count, mean, std, min, quartiles, max)
df_ecg.describe()

Unnamed: 0,Subject_ID,Age,RR,ECG_mean,ECG_std,ECG_var,ECG_median
count,2859.0,2859.0,2832.0,2832.0,2832.0,2832.0,2832.0
mean,38.874432,28.857643,1.771206,0.003868,0.173795,0.031865,-0.04331
std,26.443411,11.864158,0.816328,0.009374,0.040755,0.017201,0.019152
min,1.0,13.0,1.0,-0.131275,0.074087,0.005489,-0.153844
25%,13.0,21.0,1.38,0.000496,0.147292,0.021695,-0.053695
50%,39.0,23.0,1.628,0.003803,0.162338,0.026354,-0.043322
75%,61.0,34.0,1.958,0.006981,0.196118,0.038462,-0.032924
max,90.0,75.0,13.722,0.153532,0.567605,0.322175,0.095204


In [9]:
# Number of missing values in each column, computed once and reused below
missing_per_column = df_ecg.isnull().sum()

# Set of columns with 0 missing values
no_nulls = set(missing_per_column[missing_per_column == 0].index)
print(no_nulls)

# Set of columns with more than 5% of the values missing
missing_threshold = len(df_ecg)*0.05
most_missing_cols = set(missing_per_column[missing_per_column > missing_threshold].index)
print(most_missing_cols)

{'Gender', 'Subject_ID', 'Age'}
set()


In [10]:
# Persist both tables as pickle files in the current working directory
features_pickle = "df_ecg"
samples_pickle = "df_ecg_samples"

# One row per entry of ecg_samples; column labels get the 'ecg_' prefix
df_ecg_samples = pd.DataFrame(ecg_samples).add_prefix('ecg_')

df_ecg.to_pickle(features_pickle)
df_ecg_samples.to_pickle(samples_pickle)

## 1.3 Conclusions

The pre-processing of the ECG signal database makes it possible to construct a DataFrame with selected information regarding basic demographic data and calculated ECG features. 
Additionally, a portion of the raw (original) ECG signal was also saved for further analysis.
Feature engineering could entail a more complex and robust processing of the data, but the proposed features could demonstrate whether the trained model can achieve a good performance even with calculation pitfalls or with the raw data.