In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from hrvanalysis import get_time_domain_features, get_frequency_domain_features
from hrvanalysis.preprocessing import get_nn_intervals
import biosppy.signals.ecg as ecg
from tqdm import tqdm
import neurokit2 as nk
from sklearn.utils import class_weight
import json

import warnings
# Silence every warning for the whole session. Note that this also hides
# numerical warnings raised while features are being extracted.
warnings.simplefilter("ignore")

# Default to wide figures (width 20, height 5).
plt.rcParams['figure.figsize'] = [20, 5]

In [None]:
# Directory that holds the raw X_train / X_test / y_train CSV files.
DATA_PATH = "../data"
# Sampling rate handed to the ECG processing calls below
# (presumably in Hz — TODO confirm against the dataset description).
SAMPLING_RATE = 300

In [None]:
# Load the raw inputs. Each row of X_train / X_test is later turned into one
# signal via `.loc[i].dropna()`, so rows are presumably NaN-padded recordings.
# NOTE(review): y_train is read with index_col='id' but X_train / X_test are
# not. If the X files also carry an `id` column, it stays in every row and
# becomes the first sample of each signal handed to the extractors — confirm,
# and if so read them with index_col='id' as well.
X_test = pd.read_csv(f'{DATA_PATH}/X_test.csv')
X_train = pd.read_csv(f'{DATA_PATH}/X_train.csv')
y_train = pd.read_csv(f'{DATA_PATH}/y_train.csv', index_col='id')

# Merge data functions

In [None]:
def read_data(dataname, data_path=None):
    """Load the train and test feature tables of one stored feature set.

    Args:
        dataname: Name of the sub-directory (below the data directory) that
            contains ``X_train.csv`` and ``X_test.csv``.
        data_path: Root data directory. When omitted, the notebook-wide
            ``DATA_PATH`` is used so the location is configured in one place
            instead of being hard-coded here.

    Returns:
        Tuple ``(X_train, X_test)`` of DataFrames, in that order.
    """
    if data_path is None:
        # Resolved lazily so the function follows later changes to DATA_PATH.
        data_path = DATA_PATH
    train_df = pd.read_csv(f'{data_path}/{dataname}/X_train.csv')
    test_df = pd.read_csv(f'{data_path}/{dataname}/X_test.csv')
    return train_df, test_df
def concat_data(x1, x2):
    """Place the columns of ``x2`` (minus its ``id`` column) to the right of ``x1``.

    Rows are matched on the DataFrame index by ``pd.concat``; ``x2`` must
    contain an ``id`` column, otherwise ``drop`` raises ``KeyError``.
    """
    x2_without_id = x2.drop(columns=['id'])
    return pd.concat((x1, x2_without_id), axis='columns')

# hvrtd (Time Domain Features)

In [None]:
# https://aura-healthcare.github.io/hrv-analysis/hrvanalysis.html#hrvanalysis.extract_features.get_time_domain_features

In [None]:
def get_td_features(signal):
    """Compute hrvanalysis time-domain HRV features for one ECG recording.

    R-peaks are detected with NeuroKit2 (``nk.ecg_process``) and the distances
    between successive peaks are passed to ``get_time_domain_features``.

    Args:
        signal: 1-D array of ECG samples recorded at ``SAMPLING_RATE``.

    Returns:
        The feature dict returned by ``get_time_domain_features``.
    """
    # Alternative detector kept for reference:
    #     r_peaks = ecg.engzee_segmenter(signal, SAMPLING_RATE)['rpeaks']
    _, info = nk.ecg_process(signal, sampling_rate=SAMPLING_RATE)
    r_peaks = np.asarray(info["ECG_R_Peaks"])
    # R-peak positions are sample indices, whereas hrvanalysis works on
    # intervals in milliseconds (heart rate is 60000 / interval, nni_50 counts
    # differences above 50 ms). Convert samples -> ms before extraction.
    rr_intervals = np.diff(r_peaks) * 1000.0 / SAMPLING_RATE
    return get_time_domain_features(rr_intervals)

In [None]:
# signal = X_test.loc[155].dropna().to_numpy(dtype='float32')
# # signals, info = nk.ecg_process(signal, sampling_rate=SAMPLING_RATE)
# # r_peaks = info["ECG_R_Peaks"]
# # r_peaks = ecg.engzee_segmenter(signal, SAMPLING_RATE)['rpeaks']
# feature_names = [
#         "mean_nni", "sdnn" , "sdsd", "rmssd", "median_nni", 
#         "nni_50", "pnni_50", "nni_20", "pnni_20", "range_nni", 
#         "cvsd", "cvnni", "mean_hr", "max_hr", "min_hr", "std_hr"]
# td_features = get_td_features(signal)
# feature_vector = []
# for fn in feature_names:
#     feature_vector.append(td_features[fn])
# feature_vector, td_features
# # len(r_peaks)

In [None]:
def generate_td_features(data):
    """Extract time-domain HRV features for every row of ``data``.

    Extraction is best effort: a row whose processing raises is counted as an
    error and filled with NaN so the output keeps one row per input row.

    Args:
        data: DataFrame with one recording per row. Rows are looked up by
            label ``0..len(data)-1``, so the default integer index is assumed.

    Returns:
        Tuple ``(feature_names, feature_matrix, error_counts)`` where
        ``feature_matrix`` is an array of shape ``(len(data), len(feature_names))``
        and ``error_counts`` is the number of NaN-filled rows.
    """
    feature_names = [
        "mean_nni", "sdnn", "sdsd", "rmssd", "median_nni",
        "nni_50", "pnni_50", "nni_20", "pnni_20", "range_nni",
        "cvsd", "cvnni", "mean_hr", "max_hr", "min_hr", "std_hr"]
    feature_vecs = []
    error_counts = 0
    for i in tqdm(range(len(data))):
        signal = data.loc[i].dropna().to_numpy(dtype='float32')
        try:
            time_domain_features = get_td_features(signal)
        except Exception:
            # Deliberately tolerant, but unlike a bare ``except`` this still lets
            # KeyboardInterrupt / SystemExit stop the long-running loop.
            error_counts += 1
            feature_vecs.append([np.nan] * len(feature_names))
            continue
        feature_vecs.append([time_domain_features[fn] for fn in feature_names])
    return feature_names, np.array(feature_vecs), error_counts

### Train

In [None]:
%%time
# Time-domain features for every training row; error_counts is the number of
# rows whose extraction failed and were filled with NaN.
td_feature_names, td_features, error_counts = generate_td_features(X_train)


In [None]:
# Persist the training time-domain features as CSV in the working directory:
# a header row, then one "<row position>,<values...>" line per recording.
df = pd.DataFrame(data=td_features, columns=td_feature_names)
header_line = "id," + ",".join(td_feature_names) + "\n"
with open("X_train.csv", "w") as f:
    f.write(header_line)
    f.writelines(
        f"{str(row_id)}," + ",".join(map(str, row)) + "\n"
        for row_id, row in enumerate(df.to_numpy())
    )

### Test

In [None]:
%%time
# Same extraction for the test rows. This rebinds td_feature_names,
# td_features and error_counts, replacing the training results.
td_feature_names, td_features, error_counts = generate_td_features(X_test)


In [None]:
# Persist the test time-domain features as CSV in the working directory:
# a header row, then one "<row position>,<values...>" line per recording.
df = pd.DataFrame(data=td_features, columns=td_feature_names)
header_line = "id," + ",".join(td_feature_names) + "\n"
with open("X_test.csv", "w") as f:
    f.write(header_line)
    f.writelines(
        f"{str(row_id)}," + ",".join(map(str, row)) + "\n"
        for row_id, row in enumerate(df.to_numpy())
    )

### Merge

In [None]:
# Load the stored 'rpqst' and 'hvr' feature sets from ../data/<name>/.
# NOTE(review): the cells above write X_train.csv / X_test.csv into the working
# directory, while read_data looks under ../data/<name>/ — the files presumably
# have to be moved there by hand between steps; confirm.
rpqst_X_train, rpqst_X_test = read_data('rpqst')
hvr_X_train, hvr_X_test = read_data('hvr')

In [None]:
# Append the hvr columns to the rpqst tables (concat_data drops the hvr `id` column).
concated_X_train = concat_data(rpqst_X_train, hvr_X_train)
concated_X_test = concat_data(rpqst_X_test, hvr_X_test)


In [None]:
# Written to the working directory; replaces any X_train.csv already there.
concated_X_train.to_csv("X_train.csv", index=False)

In [None]:
# Written to the working directory; replaces any X_test.csv already there.
concated_X_test.to_csv("X_test.csv", index=False)

# hrvanalysis frequency-domain features (hvrfd)

In [None]:
# https://aura-healthcare.github.io/hrv-analysis/hrvanalysis.html#hrvanalysis.extract_features.get_frequency_domain_features

In [None]:
# Keys read from the dict returned by the frequency-domain extractor.
# Note: the name `feature_names` is rebound by later cells of this notebook.
feature_names = ["total_power","vlf","lf","hf","lf_hf_ratio","lfnu","hfnu"]

In [None]:
def get_features(signal, extraction_method):
    """Run an HRV feature extractor on the RR intervals of one ECG recording.

    Args:
        signal: 1-D array of ECG samples recorded at ``SAMPLING_RATE``.
        extraction_method: Callable that receives the RR/NN intervals in
            milliseconds (the hrvanalysis convention) and returns a mapping of
            feature name to value.

    Returns:
        Whatever ``extraction_method`` returns.
    """
    # Alternative detector kept for reference:
    #     r_peaks = ecg.engzee_segmenter(signal, SAMPLING_RATE)['rpeaks']
    _, info = nk.ecg_process(signal, sampling_rate=SAMPLING_RATE)
    r_peaks = np.asarray(info["ECG_R_Peaks"])
    # R-peak positions are sample indices; convert the peak-to-peak distances
    # from samples to milliseconds, which is the unit the extractor expects
    # (hrvanalysis derives its time axis from cumsum(intervals) / 1000).
    rr_intervals = np.diff(r_peaks) * 1000.0 / SAMPLING_RATE
    return extraction_method(rr_intervals)

In [None]:
def generate_features(data, extraction_method, feature_names):
    """Extract the requested features for every row of ``data``.

    Extraction is best effort: a row whose processing raises is counted as an
    error and filled with NaN so the output keeps one row per input row.

    Args:
        data: DataFrame with one recording per row. Rows are looked up by
            label ``0..len(data)-1``, so the default integer index is assumed.
        extraction_method: Extractor forwarded to ``get_features``.
        feature_names: Keys to read, in order, from the extractor's result.

    Returns:
        Tuple ``(feature_names, feature_matrix, error_counts)`` where
        ``feature_matrix`` has one row per input row and ``error_counts`` is
        the number of NaN-filled rows.
    """
    feature_vecs = []
    error_counts = 0
    for i in tqdm(range(len(data))):
        signal = data.loc[i].dropna().to_numpy(dtype='float32')
        try:
            features = get_features(signal, extraction_method)
        except Exception:
            # Deliberately tolerant, but unlike a bare ``except`` this still lets
            # KeyboardInterrupt / SystemExit stop the long-running loop.
            error_counts += 1
            feature_vecs.append([np.nan] * len(feature_names))
            continue
        feature_vecs.append([features[fn] for fn in feature_names])
    return feature_names, np.array(feature_vecs), error_counts

### Train

In [None]:
# Frequency-domain features for the training rows.
# fd_X_train is the tuple (feature_names, feature_matrix, error_counts).
fd_X_train = generate_features(X_train, get_frequency_domain_features, feature_names)

In [None]:
# Element [1] is the feature matrix, element [0] the column names.
fd_X_train_df = pd.DataFrame(fd_X_train[1], columns = fd_X_train[0])
# The row position is written as the `id` column; output goes to the working directory.
fd_X_train_df.to_csv("X_train.csv", index_label="id")

### Test

In [None]:
# Frequency-domain features for the test rows.
# fd_X_test is the tuple (feature_names, feature_matrix, error_counts).
fd_X_test = generate_features(X_test, get_frequency_domain_features, feature_names)

In [None]:
# Element [1] is the feature matrix, element [0] the column names.
fd_X_test_df = pd.DataFrame(fd_X_test[1], columns = fd_X_test[0])
# The row position is written as the `id` column; output goes to the working directory.
fd_X_test_df.to_csv("X_test.csv", index_label="id")

In [None]:
# Load the previously merged 'rpqst_hvrtd' tables and the stored 'hvrfd'
# (frequency-domain) tables from ../data/<name>/.
rpqst_hvrtd_X_train, rpqst_hvrtd_X_test = read_data('rpqst_hvrtd')
hvrfd_X_train, hvrfd_X_test = read_data('hvrfd')

In [None]:
# Append the hvrfd columns (their `id` column is dropped by concat_data).
concated_X_train = concat_data(rpqst_hvrtd_X_train, hvrfd_X_train)
concated_X_test = concat_data(rpqst_hvrtd_X_test, hvrfd_X_test)

In [None]:
# Written to the working directory; replaces any X_train.csv / X_test.csv already there.
concated_X_train.to_csv("X_train.csv", index=False)
concated_X_test.to_csv("X_test.csv", index=False)

# Intervals

In [None]:
# Exploratory run on a single test recording (row 5): detect the R-peaks,
# then delineate the other waves around them.
signal = X_test.loc[5].dropna().to_numpy(dtype='float32')
signals, info = nk.ecg_process(signal, sampling_rate=SAMPLING_RATE)
rpeaks = info["ECG_R_Peaks"]
cleaned_signal = signals["ECG_Clean"]

_, waves_peak = nk.ecg_delineate(cleaned_signal, rpeaks, sampling_rate=SAMPLING_RATE, method="peak")

# Distance between consecutive P-onsets, averaged over the non-NaN entries.
# This expression is not the last one in the cell, so its value is not displayed.
intervals = np.array(waves_peak["ECG_P_Onsets"][1:])-np.array(waves_peak["ECG_P_Onsets"][:-1])
np.mean(intervals[~np.isnan(intervals)])

def intervals_mean_std(l1, l2):
    """Return (mean, std) of the element-wise differences ``l2 - l1``, skipping NaN differences."""
    intervals = np.array(l2)-np.array(l1)
    return np.mean(intervals[~np.isnan(intervals)]), np.std(intervals[~np.isnan(intervals)])

# For every delineated wave type, summarise its distance to the R-peaks as a
# (mean, std) pair; all pairs are appended to one flat list.
feature_names = waves_peak.keys()
intervals_stats = []
for k in feature_names:
    l1 = waves_peak[k]
    l2 = rpeaks
    mean, std = intervals_mean_std(l1, l2)
    intervals_stats.append(mean)
    intervals_stats.append(std)

feature_names, intervals_stats

# ['ECG_P_Peaks', 'ECG_Q_Peaks', 'ECG_S_Peaks', 'ECG_T_Peaks', 'ECG_P_Onsets', 'ECG_T_Offsets']


In [None]:
def intervals_mean_std(l1, l2):
    """Summarise the element-wise differences ``l2 - l1``.

    Differences that are NaN are discarded before the statistics are taken.

    Returns:
        Tuple ``(mean, std)`` of the remaining differences.
    """
    diffs = np.asarray(l2) - np.asarray(l1)
    valid = diffs[np.logical_not(np.isnan(diffs))]
    return valid.mean(), valid.std()

def generate_interval_features(data):
    """Compute wave-to-R-peak interval statistics for every row of ``data``.

    For each recording the R-peaks and the P/Q/S/T landmarks are located with
    NeuroKit2, and for every landmark type the mean and standard deviation of
    ``rpeaks - landmark`` are stored (NaN landmarks are skipped by
    ``intervals_mean_std``). Extraction is best effort: a row whose processing
    raises is counted as an error and filled with NaN.

    Note: this name is defined again, with a different signature, in the
    pyHRV section of the notebook; whichever cell ran last wins.

    Args:
        data: DataFrame with one recording per row. Rows are looked up by
            label ``0..len(data)-1``, so the default integer index is assumed.

    Returns:
        Tuple ``(feature_names, feature_matrix, error_counts)`` where
        ``feature_matrix`` has one row per input row and ``error_counts`` is
        the number of NaN-filled rows.
    """
    keys = ['ECG_P_Peaks', 'ECG_Q_Peaks', 'ECG_S_Peaks', 'ECG_T_Peaks', 'ECG_P_Onsets', 'ECG_T_Offsets']
    # One (mean, std) pair per entry of ``keys``, in the same order.
    feature_names = [
        'pr_mean', 'pr_std',
        'qr_mean', 'qr_std',
        'sr_mean', 'sr_std',
        'tr_mean', 'tr_std',
        'por_mean', 'por_std',
        'tor_mean', 'tor_std',
    ]
    feature_vecs = []
    error_counts = 0
    for i in tqdm(range(len(data))):
        signal = data.loc[i].dropna().to_numpy(dtype='float32')
        try:
            signals, info = nk.ecg_process(signal, sampling_rate=SAMPLING_RATE)
            rpeaks = info["ECG_R_Peaks"]
            cleaned_signal = signals["ECG_Clean"]

            _, waves_peak = nk.ecg_delineate(cleaned_signal, rpeaks, sampling_rate=SAMPLING_RATE, method="peak")

            feature_vector = []
            for k in keys:
                # extend() appends the mean and then the std of this landmark.
                feature_vector.extend(intervals_mean_std(waves_peak[k], rpeaks))
        except Exception:
            # Deliberately tolerant, but unlike a bare ``except`` this still lets
            # KeyboardInterrupt / SystemExit stop the long-running loop.
            error_counts += 1
            feature_vector = [np.nan] * len(feature_names)
        feature_vecs.append(feature_vector)

    return feature_names, np.array(feature_vecs), error_counts

In [None]:
# Interval statistics for the training rows; error_counts is the number of NaN-filled rows.
feature_names, features, error_counts = generate_interval_features(X_train)

In [None]:
# Note: the `fd_` variable name is reused from the frequency-domain section,
# but the frame built here holds the interval features computed just above.
fd_X_train_df = pd.DataFrame(features, columns = feature_names)
# The row position is written as the `id` column; output goes to the working directory.
fd_X_train_df.to_csv("X_train.csv", index_label="id")

In [None]:
# Interval statistics for the test rows (rebinds feature_names, features, error_counts).
feature_names, features, error_counts = generate_interval_features(X_test)

In [None]:
# Note: the `fd_` variable name is reused from the frequency-domain section,
# but the frame built here holds the interval features computed just above.
fd_X_test_df = pd.DataFrame(features, columns = feature_names)
# The row position is written as the `id` column; output goes to the working directory.
fd_X_test_df.to_csv("X_test.csv", index_label="id")

### Merge

In [None]:
# Merge the stored 'itv' (interval) feature set into the tables accumulated so far;
# concat_data drops the `id` column of the second table.
X_train1, X_test1 = read_data('rpqst_hvrtd_hvrfd')
X_train2, X_test2 = read_data('itv')
concated_X_train = concat_data(X_train1, X_train2)
concated_X_test = concat_data(X_test1, X_test2)

In [None]:
# Written to the working directory; replaces any X_train.csv / X_test.csv already there.
concated_X_train.to_csv("X_train.csv", index=False)
concated_X_test.to_csv("X_test.csv", index=False)

# pyHRV

In [None]:
import pyhrv.tools as tools
from pyhrv.hrv import hrv

In [None]:
# feature_names = ['nni_counter', 'nni_mean', 'nni_min', 'nni_max', 'hr_mean', 'hr_min', 'hr_max', 'hr_std', 
#            'nni_diff_mean', 'nni_diff_min', 'nni_diff_max', 'sdnn', 'rmssd', 
#            'sdsd', 'nn50', 'pnn50', 'nn20', 'pnn20', 'sd1', 'sd2', 'sd_ratio', 'ellipse_area', 'sampen']
# for k in fl_keys:
#     print(k, extracted[k])

In [None]:
# # %%capture
# signal = X_test.loc[0].dropna().to_numpy(dtype='float32')
# signals, info = nk.ecg_process(signal, sampling_rate=SAMPLING_RATE)
# # rpeaks = info["ECG_R_Peaks"]
# # rpeaks
# signals, rpeaks = ecg.ecg(signal, show=False)[1:3]
# rpeaks
# nni = tools.nn_intervals(rpeaks)
# extracted = hrv(nni, rpeaks, signals, SAMPLING_RATE)
# feature_vector = np.array([extracted[k] for k in feature_names])

In [None]:
def generate_interval_features(data, file_path, start=0):
    """Stream pyHRV features for rows ``start .. len(data)-1`` into a CSV file.

    Each recording is segmented with BioSPPy, its NN intervals are derived from
    the R-peaks, and the selected entries of the pyHRV result are written as
    one CSV line. Rows are written (and flushed) one at a time so a partial
    file survives an interrupted run. Extraction is best effort: a row whose
    processing raises is counted as an error and written as NaN values.

    Note: this redefines the ``generate_interval_features`` of the "Intervals"
    section with a different signature.

    Args:
        data: DataFrame with one recording per row. Rows are looked up by
            label, so the default integer index is assumed.
        file_path: Output CSV path. The file is always truncated and starts
            with a header row, even when ``start`` > 0, so use a fresh file
            name when resuming.
        start: First row label to process; also the first ``id`` written.

    Returns:
        Tuple ``(feature_names, error_counts)``.
    """
    feature_names = ['nni_counter', 'nni_mean', 'nni_min', 'nni_max', 'hr_mean', 'hr_min', 'hr_max', 'hr_std',
           'nni_diff_mean', 'nni_diff_min', 'nni_diff_max', 'sdnn', 'rmssd',
           'sdsd', 'nn50', 'pnn50', 'nn20', 'pnn20', 'sd1', 'sd2', 'sd_ratio'] #, 'ellipse_area', 'sampen']
    error_counts = 0

    # One handle for the whole run instead of reopening the file for every row.
    with open(file_path, "w") as f:
        f.write("id,"+",".join(feature_names) + "\n")

        for i in tqdm(range(start, len(data))):
            signal = data.loc[i].dropna().to_numpy(dtype='float32')
            try:
                # BioSPPy assumes 1000 Hz unless told otherwise, so the real
                # sampling rate has to be passed explicitly.
                signals, rpeaks = ecg.ecg(signal, sampling_rate=SAMPLING_RATE, show=False)[1:3]
                # R-peaks come back as sample indices; pyHRV expects R-peak
                # times, so convert samples -> milliseconds first.
                rpeaks_ms = np.asarray(rpeaks) * 1000.0 / SAMPLING_RATE
                nni = tools.nn_intervals(rpeaks_ms)
                extracted = hrv(nni, rpeaks_ms, signals, SAMPLING_RATE)
                feature_vector = np.array([extracted[k] for k in feature_names])
            except Exception:
                # Deliberately tolerant, but unlike a bare ``except`` this still
                # lets KeyboardInterrupt / SystemExit stop the long-running loop.
                error_counts += 1
                feature_vector = [np.nan]*len(feature_names)

            f.write(f"{str(i)},"+",".join([str(x) for x in feature_vector])+"\n")
            # Push each row out immediately so finished rows are on disk if
            # the run is interrupted.
            f.flush()

    return feature_names, error_counts

In [None]:
# Resume pyHRV extraction on the training set from row 2789, writing into a
# separate continuation file (the function truncates its output file, so
# reusing the earlier file name would erase rows already written).
# NOTE(review): no matching call for X_test is visible in this notebook, yet
# read_data('pyhrv') below expects a pyhrv X_test.csv — confirm where it comes from.
feature_names, error_counts = generate_interval_features(X_train, "X_train_cont.csv", start=2789)

### Merge

In [None]:
# Merge the stored 'pyhrv' feature set into the tables accumulated so far;
# concat_data drops the `id` column of the second table.
X_train1, X_test1 = read_data('rpqst_hvrtd_hvrfd_itv')
X_train2, X_test2 = read_data('pyhrv')
concated_X_train = concat_data(X_train1, X_train2)
concated_X_test = concat_data(X_test1, X_test2)

In [None]:
# Written to the working directory; replaces any X_train.csv / X_test.csv already there.
concated_X_train.to_csv("X_train.csv", index=False)
concated_X_test.to_csv("X_test.csv", index=False)

# Class weights

In [None]:
# 'balanced' class weights from scikit-learn, keyed by integer class label.
classes = [int(i) for i in np.unique(y_train)]
# compute_class_weight documents `classes` as an ndarray and recent
# scikit-learn releases validate the type, so convert the list explicitly.
cw = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.asarray(classes),
    y=y_train.to_numpy().reshape(-1),
)
weights = dict(zip(classes,cw))
weights
# with open('weights.json', 'w') as f:
#     json.dump(weights, f)


In [None]:
def cls_count(df, cls):
    """Count the rows of ``df`` whose ``y`` column equals ``cls``.

    Args:
        df: DataFrame with a ``y`` label column.
        cls: Class label to count.

    Returns:
        Number of matching rows as an ``int``.
    """
    # Filter on the frame that was passed in rather than on the global y_train,
    # so the function also works for other label frames.
    return len(df.loc[df.y == cls])

In [None]:
# Number of training samples for each class label 0..3, as an array.
counts = np.array([cls_count(y_train, label) for label in range(4)])

In [None]:
# Weight each class by (size of the largest class) / (its own size), so the
# largest class gets weight 1, then store the array in the working directory.
max_cls_count = counts.max()
weights = max_cls_count / counts
with open('weights.npy', 'wb') as weights_file:
    np.save(weights_file, weights)

In [None]:
# Reload class weights from disk.
# NOTE(review): the cell above saves to ./weights.npy but this reads
# ../score/weights.npy — presumably the file is moved by hand; confirm.
with open('../score/weights.npy', 'rb') as f:
    weights = np.load(f)