# MIE 2021 - DATA PREPROCESSING

In [10]:
#--- IMPORT LIBRARIES ---
import scipy 
from scipy.io import loadmat
import numpy as np
import os
import pickle5 as pickle
import matplotlib.pyplot as plt
import math as maths
from ecgdetectors import Detectors
# Shared ECG R-peak detector collection used by the WESAD cell.
# 700 is presumably the ECG sampling rate in Hz (the WESAD cell windows the
# chest ECG in multiples of 700 samples) — TODO confirm against the library docs.
detectors = Detectors(700)

In [11]:
#--- DATASETS PATHS ---
# Root folder of each raw dataset, given relative to the working directory.
# Every value ends with "/" because the IEEE loading cells build file paths by
# plain string concatenation (e.g. IEEE_Train + filename).
IEEE_Train = "Data/IEEE 2015/competition_data/Training_data/"
IEEE_Test = "Data/IEEE 2015/TestData/"
PPGDaLi = "Data/PPGDaLia/"
WESAD = "Data/WESAD/"

In [21]:
#--- HELPER FUNCTIONS ---
#-----------------------------
#find Function: - Locates a file by name anywhere below a directory
def find(name, path):
    """Return the path of the first file called `name` found under `path`.

    INP: name - exact file name to look for (no wildcards)
         path - directory whose tree is walked with os.walk
    OUT: the joined path of the first match in walk order, or None when no
         directory below `path` contains a file with that name.
    """
    matches = (
        os.path.join(folder, name)
        for folder, _subdirs, filenames in os.walk(path)
        if name in filenames
    )
    # The generator is lazy, so the walk stops at the first hit.
    return next(matches, None)
#-----------------------------
#list_pkl_files Function: - Collects every .pkl file below a directory
def list_pkl_files(dir):
    """Return the paths of all files ending in ".pkl" under `dir`.

    INP: dir - directory whose tree is walked with os.walk
    OUT: list of joined paths in walk order (empty list when nothing matches)
    """
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(dir)
        for filename in filenames
        if filename.endswith(".pkl")
    ]
#-----------------------------
#windows Function: - splits a signal into sliding windows
def windows(iterable, n, m = 1):
    """Yield consecutive length-`n` windows of `iterable`, advancing `m` items each time.

    INP: iterable - the signal (any iterable; it is materialised into a list)
         n - length of each window, in samples
         m - slide between the starts of two consecutive windows (default 1)
    OUT: generator of lists, each holding exactly `n` items. Only full windows
         are produced, including the one that ends exactly on the last sample.
    RAISES: ValueError when `m` is 0 or negative. Because this is a generator,
            the error surfaces on the first iteration, not at call time.
    """
    if m == 0: # otherwise infinite loop
        raise ValueError("Parameter 'm' can't be 0")
    if m < 0: # a backwards slide would never reach the end of the signal
        raise ValueError("Parameter 'm' can't be negative")
    lst = list(iterable)
    i = 0
    # "<=" (not "<") so the final full window, whose end index equals
    # len(lst), is not silently dropped.
    while i + n <= len(lst):
        yield lst[i:i + n]
        i += m
#-----------------------------
#create_pickle: - pickles data into a file
def create_pickle(pickle_name, data):
    """Serialise `data` with pickle into the file `pickle_name`.

    INP: pickle_name - path of the output file (created or overwritten)
         data - any picklable object
    OUT: None; the pickle file is written as a side effect.
    """
    # The context manager closes the handle even if pickle.dump raises.
    with open(pickle_name, 'wb') as outfile:
        pickle.dump(data, outfile)

In [22]:
#--- INIT DATASETS LISTS ---
# One independent, initially empty accumulator per dataset; the loading cells
# append one dict per recording to these.
ds_IEEE_Train, ds_IEEE_Test = [], []
ds_DaLia, ds_Wesad = [], []

In [23]:
#--- GET SIGNALS FROM FILE ---
#-----------------------------
#--- IEEE_Train ---
# Builds one dict per training recording and pickles the whole list to the
# file "IEEE_Train" in the working directory.
# Every recording <name>.mat is paired with <name>_BPMtrace.mat, whose "BPM0"
# variable holds the reference heart-rate values.
ds_IEEE_Train = []  # start from scratch so re-running this cell never duplicates recordings
for filename in sorted(os.listdir(IEEE_Train)):  # sorted: os.listdir order is arbitrary
    # Skip anything that is not a recording, including the ground-truth traces.
    if not filename.endswith(".mat") or filename.endswith("BPMtrace.mat"):
        continue
    Truth = filename[:-4]+"_BPMtrace.mat"
    # Fixed offsets: presumably names look like DATA_01_TYPE01.mat, giving
    # subject "01" here and protocol "01" below — TODO confirm on the data.
    Subject_Num = filename[5:-11]
    x = loadmat(IEEE_Train+filename)['sig']
    # Load the trace once and flatten it to 1-D (previously the file was read
    # a second time just to obtain its length).
    truth_list = loadmat(IEEE_Train+Truth)["BPM0"].reshape(-1)
    subj_dict = {
        "Dataset": "IEEE_Train",
        "Subject": int(Subject_Num),
        "PPG_fs": 125,
        "ACC_fs": 125,
        "Protocol": filename[12:-4],
        # Rows 1-5 of 'sig' are used; row 0 is skipped (presumably the ECG
        # channel of the training files — TODO confirm).
        "Raw_PPG_1": x[1],
        "Raw_PPG_2": x[2],
        "Raw ACC_X": x[3],
        "Raw ACC_Y": x[4],
        "Raw ACC_Z": x[5],
        "truth_values": np.asarray(truth_list, dtype=np.float32)
    }
    ds_IEEE_Train.append(subj_dict)
create_pickle("IEEE_Train", ds_IEEE_Train)

In [24]:
#-----------------------------#-----------------------------
#--- IEEE_Test ---
# Builds one dict per test recording and pickles the whole list to the file
# "IEEE_Test" in the working directory.
# Every recording TEST<rest>.mat is paired with True<rest>.mat, whose "BPM0"
# variable holds the reference heart-rate values.
ds_IEEE_Test = []  # start from scratch so re-running this cell never duplicates recordings
for filename in sorted(os.listdir(IEEE_Test)):  # sorted: os.listdir order is arbitrary
    # Only the TEST* files are recordings; everything else is skipped.
    if not filename.startswith("TEST"):
        continue
    Truth = "True"+filename[4:]
    # Fixed offsets: presumably names look like TEST_S01_T01.mat, giving
    # subject "01" here and protocol "01" below — TODO confirm on the data.
    Subject_Num = filename[6:-8]
    x = loadmat(IEEE_Test+filename)['sig']
    # Load the trace once and flatten it to 1-D (previously the file was read
    # a second time just to obtain its length).
    truth_list = loadmat(IEEE_Test+Truth)["BPM0"].reshape(-1)
    subj_dict = {
        "Dataset": "IEEE_Test",
        "Subject": int(Subject_Num),
        "Protocol": filename[10:-4],
        "PPG_fs": 125,
        "ACC_fs": 125,
        # Unlike the training cell, the channels are read from row 0 onwards.
        "Raw_PPG_1": x[0],
        "Raw_PPG_2": x[1],
        "Raw ACC_X": x[2],
        "Raw ACC_Y": x[3],
        "Raw ACC_Z": x[4],
        "truth_values": np.asarray(truth_list, dtype=np.float32)
    }
    ds_IEEE_Test.append(subj_dict)
create_pickle("IEEE_Test", ds_IEEE_Test)

In [7]:
#-----------------------------
#--- PPGDaLi ---
# Builds one dict per subject pickle found below the PPGDaLi folder.
ds_DaLia = []  # start from scratch so re-running this cell never duplicates subjects
for file in sorted(list_pkl_files(PPGDaLi)):  # sorted: os.walk order is arbitrary
    with open(file, 'rb') as f:
        # SECURITY: pickle.load can execute arbitrary code — only point this at
        # trusted dataset files. encoding='bytes' leaves the keys as bytes,
        # hence the b'...' lookups below.
        data = pickle.load(f, encoding='bytes')
    acc_raw = np.asarray(data[b'signal'][b'wrist'][b'ACC'])
    ppg_raw = data[b'signal'][b'wrist'][b'BVP']
    label = data[b'label']
    ppg_raw_re = ppg_raw.reshape(len(ppg_raw))
    # Split the accelerometer columns with array slicing instead of a
    # per-sample Python loop; each axis becomes its own contiguous 1-D array.
    acc_x = np.ascontiguousarray(acc_raw[:, 0])
    acc_y = np.ascontiguousarray(acc_raw[:, 1])
    acc_z = np.ascontiguousarray(acc_raw[:, 2])
    # Subject id from the file name rather than fixed character offsets, which
    # depended on the exact length of the PPGDaLi path.
    # Assumes files are named S<id>.pkl (e.g. S1.pkl, S01.pkl, S15.pkl) — TODO confirm.
    subject_stem = os.path.splitext(os.path.basename(file))[0]
    subject_id = int(subject_stem.lstrip("Ss"))
    subj_dict = {
        "Dataset": "PPGDaLi",
        "Subject": subject_id,
        "Protocol": "N/A",
        "PPG_fs": 64,
        "ACC_fs": 32,
        "Raw PPG": ppg_raw_re,
        "Raw ACC_X": acc_x,
        "Raw ACC_Y": acc_y,
        "Raw ACC_Z": acc_z,
        "truth_values": np.asarray(label, dtype=np.float32)
    }
    ds_DaLia.append(subj_dict)

In [8]:
#-----------------------------
#--- WESAD ---
# Builds one dict per subject pickle found below the WESAD folder. The
# reference heart rate is derived here from the chest ECG.
ds_Wesad = []  # start from scratch so re-running this cell never duplicates subjects
ecg_fs = 700    # samples per second used to size the ECG windows
window_s = 8    # length of each heart-rate window, in seconds
shift_s = 2     # shift between consecutive windows, in seconds
for file in sorted(list_pkl_files(WESAD)):  # sorted: os.walk order is arbitrary
    with open(file, 'rb') as f:
        # SECURITY: pickle.load can execute arbitrary code — only point this at
        # trusted dataset files. encoding='bytes' leaves the keys as bytes,
        # hence the b'...' lookups below.
        data = pickle.load(f, encoding='bytes')
    #raw acc collected by 32Hz sampling rate
    acc_raw = np.asarray(data[b'signal'][b'wrist'][b'ACC'])
    #raw ppg collected by 64Hz sampling rate
    ppg_raw = data[b'signal'][b'wrist'][b'BVP']
    ECG = data[b'signal'][b'chest'][b'ECG']
    ECG = ECG.reshape(len(ECG))
    #label for 8s data with 2s shift: R-peaks counted in each window,
    #scaled to beats per minute
    TRUTH = []
    for ecg_window in windows(ECG, window_s*ecg_fs, shift_s*ecg_fs):
        r_peaks = detectors.two_average_detector(ecg_window)
        BPM = (len(r_peaks)/window_s)*60
        TRUTH.append(BPM)

    ppg_raw_re = ppg_raw.reshape(len(ppg_raw))
    #acc: split the columns with array slicing instead of a per-sample Python
    #loop; each axis becomes its own contiguous 1-D array
    acc_x = np.ascontiguousarray(acc_raw[:, 0])
    acc_y = np.ascontiguousarray(acc_raw[:, 1])
    acc_z = np.ascontiguousarray(acc_raw[:, 2])
    # Subject id from the file name rather than fixed character offsets, which
    # depended on the exact length of the WESAD path.
    # Assumes files are named S<id>.pkl (e.g. S2.pkl, S02.pkl, S17.pkl) — TODO confirm.
    subject_stem = os.path.splitext(os.path.basename(file))[0]
    subject_id = int(subject_stem.lstrip("Ss"))
    subj_dict = {
        "Dataset": "WESAD",
        "Subject": subject_id,
        "Protocol": "N/A",
        "PPG_fs": 64,
        "ACC_fs": 32,
        "Raw PPG": ppg_raw_re,
        "Raw ACC_X": acc_x,
        "Raw ACC_Y": acc_y,
        "Raw ACC_Z": acc_z,
        "truth_values": np.asarray(TRUTH, dtype=np.float32)
    }
    ds_Wesad.append(subj_dict)

KeyboardInterrupt: 