<a href="https://colab.research.google.com/github/lygitdata/aml_project/blob/main/project2/aml_p2_liyuan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [1]:
import csv
import os
import biosppy.signals.ecg as ecg
import biosppy
import neurokit2 as nk
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
from scipy.signal import find_peaks
from scipy.stats import kurtosis, skew
import multiprocessing as mp
from tqdm import tqdm
import pickle
from imblearn import over_sampling, pipeline
import lightgbm as lgb
import catboost as cat
from xgboost import XGBRegressor
from sklearn import ensemble, model_selection, preprocessing
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic
from sklearn.metrics import r2_score, mean_squared_error

# Load data

In [2]:
def load_data(train_path, test_path):
    """Read the training and test CSV files and split labels from signals.

    Both files are read with their ``id`` column as the index. In the training
    file the first remaining column is taken as the label and every other
    column as signal samples; the test file holds signal samples only.

    Parameters
    ----------
    train_path : path to the training CSV.
    test_path : path to the test CSV.

    Returns
    -------
    tuple
        ``(train_signals, train_labels, test_signals)`` where the signal
        entries are the result of ``transform_data`` and the labels are the
        raw values of the first training column.
    """
    train_frame = pd.read_csv(train_path, index_col="id")
    test_frame = pd.read_csv(test_path, index_col="id")

    # Column 0 is the target; everything to its right is the padded signal.
    labels = train_frame.iloc[:, 0]
    train_signals = train_frame.iloc[:, 1:]

    return transform_data(train_signals), labels.values, transform_data(test_frame)

def transform_data(df):
    """Convert a NaN-padded frame into a 1-D object array of float32 signals.

    Each row of ``df`` is one signal whose missing samples are stored as NaN.
    The NaNs are dropped per row, so the resulting signals may differ in
    length.

    Parameters
    ----------
    df : pandas.DataFrame
        One signal per row.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df),)`` and dtype ``object``; element ``i`` is a
        1-D ``float32`` array holding the non-missing samples of row ``i``.
    """
    # Pre-allocate the container instead of calling np.array(list, dtype=object):
    # when every row happens to have the same length, np.array would build a
    # 2-D object array of scalars rather than a 1-D array of signals.
    signals = np.empty(len(df), dtype=object)
    for position, (_, row) in enumerate(df.iterrows()):
        signals[position] = row.dropna().to_numpy(dtype='float32')
    return signals

In [3]:
# Load the raw signals and labels from the CSV files next to the notebook.
X_train_raw, y_train_raw, X_test_raw = load_data(train_path="train.csv", test_path="test.csv")

# Show how many samples each split contains.
shape_report = (
    "X_train_raw shape: ", X_train_raw.shape,
    "\ny_train_raw shape", y_train_raw.shape,
    "\nX_test_raw shape", X_test_raw.shape,
)
print(*shape_report)

X_train_raw shape:  (5117,) 
y_train_raw shape (5117,) 
X_test_raw shape (3411,)


# Feature extraction

In [None]:
# Function to extract features from a single raw ECG signal
def _extract_features(signal, sampling_rate=300):
    """Turn one raw ECG signal into a flat feature vector.

    The signal is processed with ``biosppy.signals.ecg.ecg`` to obtain the
    R-peak locations and the instantaneous heart rate. All further features
    are computed on the part of the signal between the first and last R-peak.

    Parameters
    ----------
    signal : 1-D array of ECG samples.
    sampling_rate : sampling frequency in Hz (default 300, the value that was
        previously hard-coded).

    Returns
    -------
    list
        214 values, in this order:
        100 binned spectral maxima (0-50 Hz), 100 binned autocorrelation
        means (0-1 s lag), 4 heart-rate statistics, 4 RR-interval statistics,
        1 mean RR interval, 1 amplitude entropy, 4 time-domain statistics.
    """
    out = biosppy.signals.ecg.ecg(signal=signal, sampling_rate=sampling_rate, show=False)

    # Read the outputs by name instead of by tuple position so the code does
    # not silently pick the wrong field if the tuple layout ever changes.
    rpeaks = out["rpeaks"]  # Indices of R-peaks
    heart_rate = out["heart_rate"]  # Instantaneous heart rate

    features = []

    # Restrict the analysis to complete beats. With fewer than two R-peaks the
    # slice would be empty (or the indexing would fail), so use the whole
    # signal in that case instead of crashing the worker.
    if len(rpeaks) >= 2:
        clip = signal[rpeaks[0]:rpeaks[-1]]
    else:
        clip = signal

    # Spectral features: magnitude spectrum, max-pooled into 100 bins up to 50 Hz
    freq = np.fft.rfftfreq(len(clip), 1 / sampling_rate)
    spec = np.abs(np.fft.rfft(clip)) / len(clip)
    freq, spec = binned(freq, spec, 50.0, 100, np.max)
    features += list(spec)

    # Autocorrelation for non-negative lags, mean-pooled into 100 bins up to 1 s
    autocorr = np.correlate(clip, clip, mode="full") / len(clip)
    autocorr = autocorr[autocorr.size // 2:]
    time = np.linspace(0, len(clip) / sampling_rate, len(clip))
    time, autocorr = binned(time, autocorr, 1.0, 100, np.mean)
    features += list(autocorr)

    # Heart rate statistics (mean, std, median, variance)
    features += msmv(heart_rate)

    # HRV: statistics of the R-R intervals (in seconds)
    rr_intervals = np.diff(rpeaks) / sampling_rate
    features += msmv(rr_intervals)

    # NOTE(review): this slot was labelled "QRS duration" but has always been
    # computed from consecutive R-peak distances, i.e. it is the mean RR
    # interval and duplicates the first HRV statistic. It is kept so the
    # feature-vector layout stays unchanged; a real QRS width would need
    # Q/S-point delineation.
    features.append(np.mean(rr_intervals) if len(rr_intervals) else 0.0)

    # Entropy of the amplitude distribution of the clipped signal
    features.append(signal_entropy(clip))

    # Time-domain statistics: mean, standard deviation, skewness, kurtosis
    features += time_domain_features(clip)

    return features

# Function to calculate Shannon entropy of a signal
def signal_entropy(signal, bins=10):
    """Shannon entropy (in nats) of the amplitude histogram of ``signal``.

    The samples are sorted into ``bins`` equal-width bins spanning the range
    of the signal; the bin counts are normalised to probabilities ``p_i`` and
    the function returns ``-sum(p_i * log(p_i))`` over the non-empty bins.

    Parameters
    ----------
    signal : 1-D array of samples.
    bins : number of histogram bins (default 10).

    Returns
    -------
    float
        A value between 0 (all samples in one bin) and ``log(bins)`` (samples
        spread evenly). Returns 0.0 for an empty signal.
    """
    # Use raw counts, not density=True: densities are divided by the bin
    # width, do not sum to one, and would make the result depend on the
    # amplitude scale of the signal.
    counts, _ = np.histogram(signal, bins=bins)
    total = counts.sum()
    if total == 0:
        return 0.0
    probabilities = counts[counts > 0] / total  # drop empty bins: 0 * log(0) := 0
    return -np.sum(probabilities * np.log(probabilities))

# Time-domain statistical features (mean, std, skewness, kurtosis)
def time_domain_features(signal):
    """Summarise the amplitude distribution of ``signal`` with four moments.

    Returns
    -------
    list
        ``[mean, standard deviation, skewness, kurtosis]``, computed with
        ``np.mean``, ``np.std``, ``scipy.stats.skew`` and
        ``scipy.stats.kurtosis`` using their default arguments.
    """
    statistics = (np.mean, np.std, skew, kurtosis)
    return [statistic(signal) for statistic in statistics]

# Binned function for downsampling and applying a function over binned data
def binned(x, y, xend, nbins, func, fill_value=np.nan):
    """Downsample ``y`` by aggregating it over equal-width bins of ``x``.

    The interval ``[x[0], xend)`` is divided into ``nbins`` equal bins; every
    ``y`` value is assigned to the bin its ``x`` value falls into and ``func``
    reduces each bin to a single number. Points with ``x >= xend`` (or below
    ``x[0]``) fall outside all bins and are ignored.

    Parameters
    ----------
    x : 1-D array of positions; ``x[0]`` is used as the left edge.
    y : 1-D array of values, same length as ``x``.
    xend : right edge of the last bin.
    nbins : number of bins.
    func : reduction applied to the values of each bin (e.g. ``np.max``).
    fill_value : value reported for bins that contain no points
        (default NaN).

    Returns
    -------
    tuple
        ``(bin_centers, binned_values)``, both of length ``nbins``.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    edges = np.linspace(x[0], xend, nbins + 1)
    bin_index = np.digitize(x, edges) - 1  # 0-based bin of every point
    centers = (edges[1:] + edges[:-1]) / 2

    values = []
    for i in range(nbins):
        members = y[bin_index == i]
        # Short inputs leave some bins empty; reductions such as np.max raise
        # on an empty array, so empty bins get the fill value instead.
        values.append(func(members) if members.size else fill_value)

    return centers, np.array(values)

# MSMV function to compute statistical measures
def msmv(x):
    """Return ``[mean, std, median, variance]`` of ``x`` with NaNs ignored.

    Degenerate inputs are handled explicitly: with no non-NaN value left the
    result is four zeros; with exactly one value, that value is reported as
    both mean and median while std and variance are 0.
    """
    valid = x[~np.isnan(x)]
    count = len(valid)

    if count == 0:
        return [0, 0, 0, 0]
    if count == 1:
        only = valid[0]
        return [only, 0, only, 0]

    return [np.mean(valid), np.std(valid), np.median(valid), np.var(valid)]

def extract_features(X, inverse=False, n_jobs=None, chunksize=8):
    """Extract the feature vector of every signal in ``X`` in parallel.

    Parameters
    ----------
    X : sequence of raw ECG signals; each one is passed to
        ``_extract_features``.
    inverse : currently ignored. Kept only so existing calls that pass it
        keep working.
    n_jobs : number of worker processes; defaults to ``mp.cpu_count()``.
    chunksize : number of signals handed to a worker at a time. Values above
        1 reduce inter-process overhead; the progress bar then advances in
        steps of up to ``chunksize``.

    Returns
    -------
    numpy.ndarray
        One row per signal, in the same order as ``X``.
    """
    workers = n_jobs or mp.cpu_count()
    with mp.Pool(workers) as pool:
        # imap yields results in input order, so rows stay aligned with X.
        results = pool.imap(_extract_features, X, chunksize=chunksize)
        rows = list(tqdm(results, total=len(X)))
    return np.array(rows)

# Build the feature matrices for both splits from the raw signals.
# Each call runs the extraction over the full split using a process pool.
X_train_features = extract_features(X_train_raw)
X_test_features = extract_features(X_test_raw)

  0%|          | 0/5117 [00:00<?, ?it/s]

In [None]:
# Sanity check: dimensions of the training feature array (expected: one row per training signal)
X_train_features.shape

In [None]:
# Sanity check: dimensions of the test feature array (expected: one row per test signal)
X_test_features.shape

# Model training

In [None]:
# Classification pipeline: random oversampling, feature standardisation,
# then a histogram-based gradient boosting classifier.
# Both stochastic steps are seeded so that the cross-validation score and the
# final submission are reproducible across kernel restarts.
model = pipeline.make_pipeline(
    over_sampling.RandomOverSampler(random_state=42),
    preprocessing.StandardScaler(),
    ensemble.HistGradientBoostingClassifier(l2_regularization=0.2, random_state=42),
)

# 10-fold cross-validation with the estimator's default score, using all cores.
score = model_selection.cross_val_score(model, X_train_features, y_train_raw, cv=10, n_jobs=-1)
print(score.mean(), score.std())

In [None]:
def create_submission(model, X_train, y_train, X_test, path="submission.csv"):
    """Fit ``model`` on the training data and write test predictions to CSV.

    The output file has the header ``id,y`` followed by one line per test
    sample, where ``id`` counts up from 0 in the order of ``X_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and labels.
    X_test : test features, one row per sample.
    path : destination file (default ``"submission.csv"``).
    """
    model.fit(X_train, y_train)
    pred = np.asarray(model.predict(X_test))

    # Write through pandas so ids stay integers and predictions keep their own
    # dtype. Stacking both into one array and using np.savetxt would cast
    # everything to float and emit scientific notation such as
    # 1.000000000000000000e+00, and would fail for non-numeric labels.
    submission = pd.DataFrame({"id": np.arange(len(pred)), "y": pred})
    submission.to_csv(path, index=False)

In [None]:
# Refit the pipeline on the full training set and write the test predictions.
create_submission(
    model=model,
    X_train=X_train_features,
    y_train=y_train_raw,
    X_test=X_test_features,
)