In [8]:
import pandas as pd
import numpy as np
import os
import pyedflib
import random
from matplotlib import pyplot as plt
from scipy.interpolate import interp1d
import xml.etree.ElementTree as ET
import sys
import scipy.stats as stats
from sklearn.metrics import roc_auc_score

sys.path.append('/home/linzenghui/ECG_code/HeartRateVariability_220217')
import FrequencyDomain as fd
import TimeDomain as td
import NonLinear as nl
from common import *
from Rpeaks import *

# File names found in the visit-1 EDF folder, in sorted order.
names = sorted(os.listdir('../polysomnography/edfs/visit1/'))

# Keep only the participant id column and the AHI column of the visit-1 dataset table.
data_frame = pd.read_csv('../datasets/numom-visit1-dataset-0.3.1.csv').loc[
    :, ['publicid', 'ahi_ap4nhp3x4n_f1t3']
]

In [9]:
def resample_interp(ts, fs_in, fs_out):
    """
    Resample a signal to a new sampling rate by linear interpolation.

    The input samples are placed on an evenly spaced grid over [0, 1] and the
    output is read off a second evenly spaced grid over the same interval, so
    the first and last output samples coincide with the first and last input
    samples.

    :param ts: single-lead signal, one-dimensional sequence of floats
    :param fs_in: original sampling rate (truncated to int for the equality check)
    :param fs_out: target sampling rate (truncated to int for the equality check)
    :return: ``np.array(ts)`` when both rates are equal after truncation,
        otherwise an array of ``int(len(ts) * fs_out / fs_in)`` interpolated samples
    """
    # Multiply before dividing. For integer-valued rates the product is exact,
    # whereas len(ts) / fs_in * fs_out can land just below the intended integer
    # (57 samples, 100 -> 200 Hz) and the truncation then drops one sample.
    n_out = int(len(ts) * fs_out / fs_in)
    fs_in, fs_out = int(fs_in), int(fs_out)
    if fs_out == fs_in:
        return np.array(ts)
    x_old = np.linspace(0, 1, num=len(ts), endpoint=True)
    x_new = np.linspace(0, 1, num=n_out, endpoint=True)
    f = interp1d(x_old, ts, kind='linear')
    return f(x_new)

In [12]:
def get_wake_time(anno_path):
    """
    Read the sleep-stage labels from an annotation XML file.

    The fifth child of the XML root must be the ``SleepStages`` element. The
    text of each ``SleepStage`` child is one label and is counted as 30 units
    (presumably seconds per scoring epoch, with '0' as the wake code - confirm
    against the annotation format).

    :param anno_path: source handed straight to ``ET.parse``
    :return: ``(leading, total)`` where ``leading`` is 30 times the number of
        consecutive '0' labels at the start of the list and ``total`` is 30
        times the number of labels
    """
    stages_node = ET.parse(anno_path).getroot()[4]
    assert stages_node.tag=='SleepStages'
    labels = [node.text for node in stages_node.findall('SleepStage')]
    # Position of the first label that is not '0'; all labels count when every one is '0'.
    leading_zero_count = next(
        (pos for pos, label in enumerate(labels) if label != '0'),
        len(labels),
    )
    return (leading_zero_count*30, len(labels)*30)

def get_annopath_from_name(name, anno_folder='/data/0shared/linzenghui/ECG_data/public_dataset/numom2b/polysomnography/annotations-events-profusion/visit1/'):
    """
    Build the annotation XML path that corresponds to a recording file name.

    :param name: recording file name; everything from its first '.' onwards is dropped
    :param anno_folder: folder holding the annotation files. Defaults to the
        location this notebook was written against; pass another folder to run
        on a different machine. A missing trailing '/' is added.
    :return: ``anno_folder`` + file stem + '-profusion.xml'
    """
    stem = name.split('.')[0]
    if anno_folder and not anno_folder.endswith('/'):
        anno_folder = anno_folder + '/'
    return anno_folder + stem + '-profusion.xml'

In [11]:
def getidx_from_name(name):
    """Return the id part of a file name: its third '-'-separated field, cut at that field's first '.'."""
    third_field = name.split('-', 3)[2]
    return third_field.partition('.')[0]

def find_ahi(data_frame,id):
    """
    Look up the AHI value recorded for one participant.

    :param data_frame: table with a 'publicid' column and an 'ahi_ap4nhp3x4n_f1t3' column
    :param id: participant id; both it and the 'publicid' values are compared as strings
    :return: the single matching 'ahi_ap4nhp3x4n_f1t3' value as a Python float
    :raises LookupError: no row carries this id
    :raises ValueError: more than one row carries this id
    """
    wanted = str(id)
    # Compare as strings on both sides so an integer-typed id column still matches.
    matches = data_frame.loc[
        data_frame['publicid'].astype(str) == wanted, 'ahi_ap4nhp3x4n_f1t3'
    ]
    if len(matches) == 0:
        raise LookupError('no row with publicid %r' % wanted)
    if len(matches) > 1:
        raise ValueError('%d rows share publicid %r' % (len(matches), wanted))
    # Convert the scalar, not the array: float() on an ndarray with ndim > 0 is deprecated in NumPy.
    return float(matches.iloc[0])

def my_random_split(list1,len,shuffle=False):
    """
    Take a leading and a trailing slice from a copy of a sequence.

    :param list1: sequence with a ``copy()`` method; it is never modified
    :param len: pair ``(head_count, tail_count)``. The name shadows the builtin
        ``len`` inside this function, so the builtin is not used here.
    :param shuffle: when true, the copy is shuffled first. The module-level
        ``random`` generator is reseeded with 100, so the order is the same on
        every call and the global random state changes as a side effect.
    :return: ``(first head_count items, last tail_count items)``
    """
    temp_list=list1.copy()
    if shuffle:
        random.seed(100)
        random.shuffle(temp_list)
    head_count, tail_count = len[0], len[1]
    head = temp_list[0:head_count]
    # temp_list[-0:] is the whole sequence, so a request for zero trailing
    # items needs its own empty slice.
    tail = temp_list[-tail_count:] if tail_count else temp_list[:0]
    return (head, tail)


def get_wake_ecg_from_edf(name_list, bag_path,datasetnumber=10):
    """
    Extract one 300-second ECG segment per recording into a fixed-width array.

    For each file the segment is the 300 s that end at the first value
    returned by ``get_wake_time`` for its annotation file. Signals stored in
    'uV' are divided by 1000; signals stored in 'mV' are used as read. Segments
    not sampled at 200 Hz are resampled to 200 Hz with ``resample_interp``.

    The AHI value is looked up in the module-level ``data_frame``.

    :param name_list: EDF file names located under ``bag_path``
    :param bag_path: folder prefix, concatenated directly with each file name
    :param datasetnumber: value written to column 0 of every row
    :return: array of shape ``(len(name_list), 3 + 5*60*200)``; column 0 is
        ``datasetnumber``, column 1 the AHI, column 2 is 1.0 when AHI >= 5 and
        0.0 otherwise, columns 3 onwards hold the ECG samples
    :raises ValueError: the file has no 'ECG' channel, or the channel's
        physical dimension is neither 'uV' nor 'mV'
    """
    segment_seconds = 5*60
    target_fs = 200
    ecg_database=np.zeros(shape=(len(name_list),3+segment_seconds*target_fs))
    ecg_database[:,0]=datasetnumber
    for idx,name in enumerate(name_list):
        wake_time=get_wake_time(get_annopath_from_name(name))[0]
        assert wake_time>=300
        subject_id=getidx_from_name(name)
        ahi=find_ahi(data_frame,subject_id)
        ecg_database[idx,1]=ahi
        ecg_database[idx,2]=float(ahi>=5)

        f = pyedflib.EdfReader(bag_path+name)
        try:
            labels=f.getSignalLabels()
            ind=labels.index('ECG')

            header=f.getSignalHeader(ind)
            unit=header['dimension'].strip()
            if unit=='uV':
                ecg_data=f.readSignal(ind)/1000
            elif unit=='mV':
                ecg_data=f.readSignal(ind)
            else:
                # Fail loudly: otherwise the samples of the previous file
                # would silently be reused for this row.
                raise ValueError('unsupported ECG dimension %r in %s' % (header['dimension'], name))

            fs=f.getSampleFrequency(ind)
        finally:
            # Release the file handle even when a lookup above fails.
            f.close()

        segment=ecg_data[int((wake_time-300)*fs):int(fs*wake_time)]
        if fs!=200:
            segment=resample_interp(segment,fs_in=fs,fs_out=200)
        ecg_database[idx,3:]=segment
    return ecg_database

def cut_data(data,window_size=30*200,step=30*200,datasetnumber=10):
    """
    Cut each row's signal into fixed-length windows.

    Each input row holds three leading metadata columns followed by the
    signal. Every output row holds ``datasetnumber``, the input row's columns
    1 and 2, and then one window of ``window_size`` samples. Windows start
    every ``step`` samples and only windows that fit entirely inside the
    signal are produced, so overlapping (``step < window_size``) and
    non-divisible lengths are handled.

    :param data: 2-D array, shape ``(records, 3 + signal_length)``
    :param window_size: samples per window
    :param step: distance in samples between consecutive window starts
    :param datasetnumber: value written to column 0 of every output row
    :return: array of shape ``(records * windows_per_record, 3 + window_size)``;
        it has zero rows when the signal is shorter than one window
    """
    n_records=data.shape[0]
    signal_len=data.shape[1]-3
    # Integer count of complete windows; the earlier float formula disagreed
    # with the loop whenever step != window_size or the length did not divide.
    windows_per_record=max(0,(signal_len-window_size)//step+1)
    database=np.zeros(shape=(n_records*windows_per_record,3+window_size))
    database[:,0]=datasetnumber
    count=0
    for idx in range(n_records):
        for k in range(windows_per_record):
            start=3+k*step
            database[count,1]=data[idx,1]
            database[count,2]=data[idx,2]
            database[count,3:]=data[idx,start:start+window_size]
            count+=1
    return database

def check_edf(edf):
    """
    Print a quick summary of the ECG channel of an EDF file.

    Prints the channel's signal header, then the file duration together with
    the number of ECG samples divided by that duration. The channel labelled
    'ECG' is used, falling back to 'ECG1' when 'ECG' is absent.

    :param edf: path of the EDF file
    :raises ValueError: the file has neither an 'ECG' nor an 'ECG1' channel
    """
    f = pyedflib.EdfReader(edf)
    try:
        labels=f.getSignalLabels()
        try:
            ind=labels.index('ECG')
        except ValueError:
            # Only a missing label triggers the fallback; other errors propagate.
            ind=labels.index('ECG1')
        header=f.getSignalHeader(ind)
        dura=f.getFileDuration()
        ecg=f.readSignal(ind)
        print(header)
        print(dura,len(ecg)/dura)
    finally:
        # Release the file handle even when a lookup or read fails.
        f.close()

In [13]:
def basic_screen(sig,fs=200):
    """
    Run basic quality checks on an ECG segment.

    R peaks come from ``simple_qrs_detector`` refined by ``R_Wave_finetune``.
    The checks run in this order and the first failure is returned:

    1. fewer than 150 R peaks;
    2. fewer than 5 samples whose absolute value exceeds 0.1;
    3. first R peak later than 5*fs samples in, or last R peak earlier than
       5*fs samples before the end;
    4. largest R-R interval more than 3 times the mean R-R interval.

    An amplitude check (peak-to-peak above 3) existed as commented-out code
    and is not applied.

    :param sig: ECG samples
    :param fs: sampling rate passed to the detector and used for the 5*fs margin
    :return: ``(passed, reason)``; ``reason`` is 'pass' on success
    """
    r_positions = R_Wave_finetune(sig, simple_qrs_detector(sig,fs=fs))
    if len(r_positions) < 150:
        return (False,'峰值太少')

    above_threshold = np.abs(sig) > 0.1
    if np.count_nonzero(above_threshold) < 5:
        return (False,'电压值过低')

    edge_margin = fs*5
    starts_late = r_positions[0] > edge_margin
    ends_early = r_positions[-1] < (len(sig)-edge_margin)
    if starts_late or ends_early:
        return (False,'前方或后方有空缺')

    rr = np.diff(r_positions)
    if np.max(rr) > np.mean(rr) * 3:
        return (False,'rri max值过大')
    return (True,'pass')

In [14]:
## baseline check
# Parse every annotation file once and reuse both returned values; reading the
# same XML separately for each list doubled the parsing work.
wake_info=[get_wake_time(get_annopath_from_name(name)) for name in names]
wake_list=[wake/60 for wake,_ in wake_info]
whole_len=[total/60 for _,total in wake_info]
df=pd.DataFrame({'name':names,'wake':wake_list,'whole':whole_len})
# Keep recordings whose 'wake' value is at least 5 (the value is get_wake_time's first result divided by 60).
name_list=list(df[df['wake']>=5]['name'])

wake_ecg=get_wake_ecg_from_edf(name_list,bag_path='../polysomnography/edfs/visit1/',datasetnumber=11)
sqi_list=[basic_screen(wake_ecg[idx,3:],fs=200)[0] for idx in range(len(wake_ecg))]
# Keep only the segments that pass the basic quality screen.
cleaned_wake_ecg=wake_ecg[sqi_list]

In [15]:
# Mode 'xb' creates the file exclusively: the cell raises FileExistsError
# instead of overwriting an existing numom2b1.npy.
with open('numom2b1.npy','xb') as npy_file:
    np.save(npy_file,cleaned_wake_ecg)

In [8]:
# One feature dict per screened segment: the attributes of the four HRV result
# objects merged together, plus the AHI value ('ahi') and the label ('anno').
dic_list=[]
for segment in cleaned_wake_ecg:
    ecg=segment[3:]
    fineturned_r=R_Wave_finetune(ecg,simple_qrs_detector(ecg,fs=200))
    td_data=td.hrv(fineturned_r,sampling_rate=200).__dict__
    fd_data=fd.frequencies(fineturned_r,200).__dict__
    poincare_data=nl.poincare(fineturned_r, 200).__dict__
    entropy_data=nl.entropy(fineturned_r, 200).__dict__
    # dict(**a, **b, ...) raises TypeError if two groups share a key.
    all_dict=dict(**td_data,**fd_data,**poincare_data,**entropy_data)
    all_dict.update(ahi=segment[1],anno=segment[2])
    dic_list.append(all_dict)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [9]:
# Column order for the HRV table: the two label columns first, then 36 feature names.
# NOTE: this name shadows the builtin map(); it is kept because a later cell reads it.
map = (
    'ahi anno '
    'sdnn sdann_1 sdnni_1 rmssd sdsd cvnn cvsd pNN20 pNN50 '
    'tri_index tinn_m tinn_n tinn '
    'vlf_power lf_power hf_power vhf_power total_power lf_n hf_n lf_hf '
    'sd1 sd2 sd1_sd2 s csi cvi csi_modified '
    'approximate sample shannon fuzzy cd hfd kfd lzc'
).split()

In [10]:
# Feature table with its columns in the order given by `map`.
df_hrv=pd.DataFrame(dic_list).loc[:, map]
# Split by the 'anno' label: 0 goes to normal_df, 1 goes to abnormal_df.
normal_df=df_hrv[df_hrv['anno'].eq(0)]
abnormal_df=df_hrv[df_hrv['anno'].eq(1)]

In [11]:
# Rank-sum test of every feature column between the two label groups.
# The first two columns ('ahi', 'anno') are labels rather than features, so
# everything from the third column on is tested; no fixed end index is needed.
feature_columns=normal_df.columns[2:]
p_value={
    column: stats.ranksums(normal_df[column],abnormal_df[column])[1]
    for column in feature_columns
}
# Row label corrected from 'numon2b2' so it matches the output file name.
index=['numom2b2']
pd.DataFrame(p_value,index=index).to_excel('numom2b2.xlsx')