In [6]:
import csv
import os

import biosppy.signals.ecg as ecg
import neurokit2
import tsfel
import emd
import tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import tensorflow as tf
import IPython.display as ipd
import matplotlib.pyplot as plt
import scipy as sp
from numpy.fft import rfft, irfft
from tqdm import tqdm



In [75]:
import keras

from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import Sequential

from sklearn.impute import KNNImputer

from sklearn.feature_selection import SelectKBest, chi2
from scipy import stats
from statistics import pstdev,variance
from sklearn.preprocessing import normalize

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score

from sktime.transformations.panel.rocket import MiniRocket

In [4]:
X_train = pd.read_csv("task2/X_train.csv").drop('id', axis=1)
y_train = pd.read_csv("task2/y_train.csv").drop('id', axis=1)
X_test = pd.read_csv("task2/X_test.csv").drop('id', axis=1)

In [21]:
n_rows = X_train.shape[0]
n_cols = X_train.shape[1]

dataset_x = X_train.copy()
dataset_x = np.array(dataset_x)

### Feature Extraction

### Biosppy 

In [26]:
#compute some statistics of all the signals 
autocor = []
ptp = []
med = []
avg = []
fft=[]

for i in range(n_rows):
    signal = dataset_x[i]
    signal_serie = pd.Series(signal)
    corr = signal_serie.autocorr(lag=2)
    autocor.append(corr)
    #average, range, median
    avg.append(np.average(signal))
    ptp.append(np.ptp(signal))
    med.append(np.median(signal))
    #top 20 fft frequencies
    f = np.fft.fft(signal)
    array = f[0:800]
    n = 20
    indices = array.argsort()[-n:][::-1]
    fft.append(indices)


In [31]:
#Padding the data in case some library need, padding with the last value. 
to_pad = n_cols
new_seq = []
for one_seq in dataset_x:
    one_seq = one_seq[~np.isnan(one_seq)]
    len_one_seq = len(one_seq)
    last_val = one_seq[-1]
    n = to_pad - len_one_seq
    to_concat = np.repeat(0, n)
    new_one_seq = np.concatenate([one_seq, to_concat])
    new_seq.append(new_one_seq)
padded = np.stack(new_seq)
padded.shape

(5117, 17807)

In [93]:
#templates_ts: equal step timestep, axis reference
#templates: #heartbeat * timestep 
#heart_rate_ts: heart_rate_ts, equal-step timestamp
#heart_rate: heart rate at each time

ts_list = []
filtered_list=[]
rpeaks_list=[]
templates_ts_list=[]
templates_list=[]
heart_rate_ts_list=[]
heart_rate_list=[]
norm_average_heartbeat_list = [] #normalized average heartbeat of pacient

for i in range(n_rows):
    ts, filtered, rpeaks, templates_ts, templates, heart_rate_ts, heart_rate = ecg.ecg(signal=padded[i], sampling_rate=300.0, show=False)
    ts_list.append(ts)
    filtered_list.append(filtered)
    rpeaks_list.append(rpeaks)
    templates_ts_list.append(templates_ts)
    templates_list.append(templates)
    heart_rate_ts_list.append(heart_rate_ts)
    heart_rate_list.append(heart_rate)
    norm_template = normalize(templates)
    norm_average_heartbeat_list.append(sum(norm_template)/len(norm_template))

# Extarct all the peaks 
P_list=[]
Q_list=[]
R_list=[]
S_list=[]
T_list=[]

for i in range(len(norm_average_heartbeat_list)):

    patient_current = norm_average_heartbeat_list[i]

    # Find the peak
    index = np.where(patient_current==max(patient_current))
    R = index[0]

    # First-half
    first_half = patient_current[0:R[0]]
    index = np.where(patient_current==min(first_half[R[0]-30:R[0]]))
    Q = index[0]

    index = np.where(first_half[0:Q[0]]==max(first_half[0:Q[0]]))
    P = index[0]

    #Second half
    second_half = patient_current[R[0]+1:] 
    index = np.where(patient_current==min(second_half[0:30]))
    S = index[0]

    second_half = second_half[S[0]-R[0]+1:]
    index = np.where(patient_current==max(second_half))
    T = index[0] 

    P_list.append(P[0])
    Q_list.append(Q[0])
    R_list.append(R[0])
    S_list.append(S[0])
    T_list.append(T[0])

# Intervals and Ratios of peaks
PR_list = []
QRS_list = []
ST_list = []

for i in range(len(P_list)):
    PR_list.append(R_list[i]-P_list[i])
    QRS_list.append(S_list[i]-Q_list[i])
    ST_list.append(T_list[i]-S_list[i])

PR_list = np.array(PR_list).reshape(-1,1)
QRS_list = np.array(QRS_list).reshape(-1,1)
ST_list = np.array(ST_list).reshape(-1,1)
P_list = np.array(P_list).reshape(-1,1)
R_list = np.array(R_list).reshape(-1,1)
S_list = np.array(S_list).reshape(-1,1)
T_list = np.array(T_list).reshape(-1,1)

QRS_T_list= np.divide(QRS_list, T_list) 
QRS_P_list= np.divide(QRS_list, P_list) 
QRS_T_list=np.nan_to_num(QRS_T_list, nan=0.0,posinf=0.0, neginf=0.0)
QRS_P_list=np.nan_to_num(QRS_P_list, nan=0.0,posinf=0.0, neginf=0.0)

#statistics of heartrate and heartbeat
hr_mean_list = []
hr_median_list = []
hr_var_list = []
for i in range(len(heart_rate_list)):
        hr_mean_list.append(np.mean(heart_rate_list[i]))
        hr_median_list.append(np.median(heart_rate_list[i]))
        hr_var_list.append(np.var(heart_rate_list[i]))
hb_mean_list = []
hb_median_list = []
hb_var_list = []
for i in range(len(norm_average_heartbeat_list)):
        hb_mean_list.append(np.mean(norm_average_heartbeat_list[i]))
        hb_median_list.append(np.median(norm_average_heartbeat_list[i]))
        hb_var_list.append(np.var(norm_average_heartbeat_list[i]))


  QRS_P_list= np.divide(QRS_list, P_list)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


#### Time series libraries

In [94]:
#rocket features

In [95]:
#tsfel features

In [96]:
#emd features

### Neorokit features