In [1]:
import numpy as np
import biosppy
import scipy.io as sio
from scipy.io import savemat
import matplotlib.pyplot as plt
import pandas as pd
import random
import glob, os, shutil
import re
from tqdm import tqdm

In [2]:
def plot(signal, freq=300):
    """Plot a sampled signal against time.

    Parameters
    ----------
    signal : array-like
        One-dimensional sequence of samples (list, tuple or numpy array).
    freq : float, optional
        Sampling frequency used to turn sample indices into time values.
        Defaults to 300, the value used throughout this notebook
        (presumably Hz, which makes the x axis seconds).
    """
    # accept plain sequences as well as numpy arrays
    signal = np.asarray(signal)
    time = np.arange(signal.size) / freq
    plt.plot(time, signal)
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()

In [3]:
def split(list_a, chunk_size):
    """Lazily cut a sequence into consecutive chunks.

    Yields lists holding ``chunk_size`` consecutive items of ``list_a``;
    the final list is shorter when the length of ``list_a`` is not a
    multiple of ``chunk_size``.
    """
    starts = range(0, len(list_a), chunk_size)
    yield from (list(list_a[begin:begin + chunk_size]) for begin in starts)

In [4]:
def synth_signal(datadir, signal_id):
    """Create a synthetic record by shuffling segments of an existing one.

    The record ``<datadir>/<signal_id>.mat`` is loaded and segmented with
    biosppy's ``gamboa_segmenter``.  The detected peak positions are grouped
    into patches of up to 6 consecutive peaks, after a random offset of 0 to 5
    peaks (a non-zero offset produces a shorter leading patch).  The patches
    and the spacers between them are shuffled independently and interleaved
    again; the samples before the first peak and from the last peak onwards
    stay in place.  The result is a rearrangement of the original samples.

    Parameters
    ----------
    datadir : str
        Directory containing the ``.mat`` records.
    signal_id : str
        Record name, without the ``.mat`` extension.

    Returns
    -------
    numpy.ndarray
        One-dimensional synthetic signal.

    Raises
    ------
    ValueError
        If the segmenter does not return any peak for this record.
    """
    signal = sio.loadmat(datadir+'/'+signal_id+'.mat')["val"][0]
    S = biosppy.signals.ecg.gamboa_segmenter(signal=signal, sampling_rate=300.0, tol=0.002)
    # first output of the segmenter: presumably the R-peak sample indices
    rpeaks = S[0]
    if len(rpeaks) == 0:
        raise ValueError('no peaks detected in ' + signal_id + ': nothing to shuffle')

    # identify the 6-peaks patches and their spacers in between
    a = random.choice(range(6))
    # the patches must be built for every offset: with a == 0 there is simply
    # no leading partial patch to prepend
    peaks = [(i[0], i[-1]) for i in split(rpeaks[a:], 6)]
    if a > 0:
        leading = rpeaks[:a]
        peaks.insert(0, (leading[0], leading[-1]))
    spacers = [(peaks[i][-1], peaks[i+1][0]) for i in range(len(peaks)-1)]

    # identify head and tail (before shuffling, while peaks are still in order)
    head = (0, peaks[0][0])
    tail = (peaks[-1][-1], len(signal))

    # shuffle the order of peaks and spacers, independently of each other
    random.shuffle(peaks)
    random.shuffle(spacers)

    # alternate the shuffled peaks and spacers (there is one more patch than spacers)
    core = [None]*(len(peaks)+len(spacers))
    core[::2] = peaks
    core[1::2] = spacers

    # reconstruct the signal: head, shuffled core, tail
    pieces = [signal[head[0]:head[1]]]
    pieces.extend(signal[start:stop] for start, stop in core)
    pieces.append(signal[tail[0]:tail[1]])

    return np.concatenate(pieces)

In [5]:
# gamboa_segmenter raises an error on some signals: their ids are collected
# in `errors` so that those records can be skipped

errors = []

def populate_errors(datadir):
    """Record the ids of the signals in ``datadir`` that cannot be segmented.

    Reads ``<datadir>/REFERENCE.csv`` (two columns: record id and label),
    loads every listed ``.mat`` record and runs ``gamboa_segmenter`` on it.
    The id of each record on which the segmenter raises is appended to the
    module-level ``errors`` list.

    Parameters
    ----------
    datadir : str
        Directory containing ``REFERENCE.csv`` and the ``.mat`` records.
    """
    reference = pd.read_csv(datadir+'/REFERENCE.csv', names=['mat', 'label'])
    for signal_id in tqdm(reference['mat'], total=len(reference['mat'])):
        signal = sio.loadmat(datadir+'/'+signal_id+'.mat')["val"][0]
        try:
            # only whether the call succeeds matters; its result is not used
            biosppy.signals.ecg.gamboa_segmenter(signal=signal, sampling_rate=300.0, tol=0.002)
        except Exception:
            # `Exception` rather than a bare `except`, so that interrupting the
            # kernel (KeyboardInterrupt) stops the scan instead of being logged
            # as a faulty record
            errors.append(signal_id)

# scan both datasets, in this order, for records the segmenter cannot handle
for dataset_dir in ('training2017', 'validation2017'):
    populate_errors(dataset_dir)

100%|██████████| 8528/8528 [00:09<00:00, 934.58it/s]
100%|██████████| 300/300 [00:00<00:00, 927.05it/s]


In [9]:
datadir = 'validation2017'

# if the directory exists, overwrite it with a new one
outdir = 'balanced_'+datadir
if os.path.exists(outdir):
    shutil.rmtree(outdir)
os.makedirs(outdir)

# reference table, without the records listed in `errors`
df = pd.read_csv(datadir+'/REFERENCE.csv', names=['mat', 'label'])
df = df[~df['mat'].isin(errors)]

# for each label, how many synthetic records are needed to reach the size
# of the most frequent label
label_counts = df['label'].value_counts()
ref_counts = (label_counts.max() - label_counts).to_dict()

# every original record is kept: copy them once, before generating anything
# (doing it inside the per-label loop re-copied the files for every label)
for signal_id in df['mat']:
    shutil.copy(os.path.join(datadir, signal_id+'.mat'), os.path.join(outdir, signal_id+'.mat'))

# number of synthetic records already written for each original id; the
# n-th synthetic copy of <id> is saved as <id>.<n>.mat
n_synth = {}

for k,v in ref_counts.items():
    # re-seeded for every label so that the draw for one label does not
    # depend on the other labels
    random.seed(2022)
    lst = random.choices(df[df['label']==k]['mat'].to_list(),k=v)
    for signal_id in tqdm(lst, total=v):
        n_synth[signal_id] = n_synth.get(signal_id, 0) + 1
        tag = str(n_synth[signal_id])
        synth = synth_signal(datadir, signal_id)
        mdic = {"val":synth ,"label":k}
        savemat(os.path.join(outdir, signal_id + '.' + tag + '.mat'), mdic)

0it [00:00, ?it/s]
100%|██████████| 82/82 [00:00<00:00, 343.78it/s]
100%|██████████| 99/99 [00:00<00:00, 413.96it/s]
100%|██████████| 111/111 [00:00<00:00, 335.52it/s]


In [10]:
# label of every original record, keyed by record id
label_by_id = dict(zip(df['mat'], df['label']))

lst = []
# only the .mat records: a REFERENCE.csv left in outdir by a previous run of
# this cell must not be treated as a record
files = glob.glob(os.path.join(outdir, '*.mat'))
for i in tqdm(map(os.path.basename,files),total=len(files)):
    mat = os.path.splitext(i)[0]
    # synthetic records are named <id>.<tag>: they take the label of <id>
    label = label_by_id[mat.split('.')[0]]
    lst.append([mat,label])
df2 = pd.DataFrame.from_records(lst, columns=['mat','label'])

# df2 lists every file in outdir, originals included, so the originals would
# appear twice after the concatenation: keep a single row per record
ref = pd.concat([df,df2]).drop_duplicates(subset=['mat'])
ref = ref.sort_values(by=['mat'])
ref.to_csv(outdir+'/REFERENCE.csv',header=None,index=None)

100%|██████████| 584/584 [00:00<00:00, 3253.92it/s]
