In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import os
import sys
import shutil
import math
import random
import heapq 
import time
import itertools
from PIL import Image
from io import StringIO,BytesIO 
from scipy.spatial.distance import pdist
import cv2
from scipy.signal import butter, lfilter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_curve,accuracy_score,auc 
from functools import reduce
import wfdb#https://github.com/MIT-LCP/wfdb-python
from wfdb import processing
import faiss 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
# Select CUDA device 0 as the current device for this process, then print the
# index of the current device as a quick sanity check.
torch.cuda.set_device(0)
print (torch.cuda.current_device())

Loading faiss with AVX2 support.


0


In [9]:
#Beats generation,
#we defined a single ECG beat image by centering the Q-wave peak signal while
#excluding the first and the last 20 ECG signals from the previous and afterward Q-wave peak signal
#https://github.com/MIT-LCP/wfdb-python/blob/master/demo.ipynb
#https://archive.physionet.org/physiobank/database/html/mitdbdir/mitdbdir.htm
#http://www.tara.tcd.ie/bitstream/handle/2262/17623/automatic.pdf?sequence=1
def labeltotext(val):
    """Translate a beat annotation symbol into its integer class index.

    The classes are 0 (N), 1 (S), 2 (V), 3 (F) and 4 (Q). A symbol that
    belongs to none of the groups yields None.
    """
    symbol_groups = (
        (0, ('N', 'L', 'R', 'e', 'j')),  # N
        (1, ('A', 'a', 'J', 'S')),       # S
        (2, ('V', 'E')),                 # V
        (3, ('F',)),                     # F
        (4, ('/', 'f', 'Q')),            # Q
    )
    for class_index, symbols in symbol_groups:
        if val in symbols:
            return class_index
    return None
    
# ---- configuration ----------------------------------------------------------
rootdir = '/data/fjsdata/physionet/MIT-BIH/mitdb/'
right_len = 180 #number of samples kept to the right of the annotated beat position
left_len = 180 #number of samples kept to the left of the annotated beat position
# Annotation symbols that labeltotext() maps onto the five classes N/S/V/F/Q.
BEAT_SYMBOLS = frozenset(['N','L','R','e','j','A','a','J','S','V','E','F','/','f','Q'])
# Fixed class order for the confusion matrix: 0=N, 1=S, 2=V, 3=F, 4=Q.
CLASS_LABELS = [0, 1, 2, 3, 4]
TRAIN_RECORDS = [101,106,108,109,112,114,115,116,118,119,122,124,201,203,205,207,208,209,215,220,223,230] #22 records for train
TEST_RECORDS = [100,103,105,111,113,117,121,123,200,202,210,212,213,214,219,221,222,228,231,232,233,234] #22 records for test


def load_beats(data_dir, record_names, left=None, right=None):
    """Cut fixed-length beat windows out of the first lead of each record.

    Parameters
    ----------
    data_dir : str
        Directory holding the WFDB record and 'atr' annotation files.
    record_names : iterable
        Record names (without extension); each is converted with str().
    left, right : int, optional
        Samples kept before / after each annotated position. Default to the
        module-level left_len / right_len.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        X has one row of left+right samples per kept beat, y holds the
        matching class indices produced by labeltotext().

    Records that cannot be read are skipped (best effort), but the failure is
    reported on stderr instead of being swallowed silently.
    """
    left = left_len if left is None else left
    right = right_len if right is None else right
    windows, classes = [], []
    for name in record_names:
        path = os.path.join(data_dir, str(name))
        try:
            annotation = wfdb.rdann(path, 'atr')
            record = wfdb.rdrecord(path)
            positions = annotation.sample #numpy.ndarray of sample indices
            symbols = annotation.symbol #list of annotation symbols
            lead = record.p_signal[:, 0] #only one lead
            n_samples = record.sig_len #length of the record in samples
        except Exception as err:
            print('Skipping record %s: %r' % (name, err), file=sys.stderr)
            continue
        for pos, sym in zip(positions, symbols):
            if sym not in BEAT_SYMBOLS:
                continue
            start, stop = pos - left, pos + right
            # Drop beats whose window would run past either end of the record.
            if start < 0 or stop > n_samples:
                continue
            windows.append(lead[start:stop])
            classes.append(labeltotext(sym))
    if windows:
        beats = np.vstack(windows)
    else:
        beats = np.empty((0, left + right))
    return beats, np.asarray(classes, dtype=np.int64)


#get trainset
train_beats, train_classes = load_beats(rootdir, TRAIN_RECORDS)
if len(train_classes) == 0:
    raise RuntimeError('No training beats could be loaded from %s' % rootdir)
# Columns are built separately: np.array() on ragged [window, label] rows
# raises ValueError on recent NumPy versions.
trData = pd.DataFrame({0: list(train_beats), 1: train_classes}) #[QRS value, label]
X_train = pd.DataFrame(train_beats)
y_train = trData[1]
print('The shape of trainset is: (%d,%d)'%(X_train.shape[0],X_train.shape[1]))
print(trData[1].value_counts())

#get testset
test_beats, test_classes = load_beats(rootdir, TEST_RECORDS)
if len(test_classes) == 0:
    raise RuntimeError('No test beats could be loaded from %s' % rootdir)
teData = pd.DataFrame({0: list(test_beats), 1: test_classes}) #[QRS value, label]
X_test = pd.DataFrame(test_beats)
y_test = teData[1]
print('The shape of testset is: (%d,%d)'%(X_test.shape[0],X_test.shape[1]))
print(teData[1].value_counts())

#model: faiss+index
# building index of trainset
tstart = time.time()
cpu_index = faiss.IndexFlatL2(right_len+left_len) #exact L2 index, one dimension per sample
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index) #make all gpu usable
gpu_index.add(np.ascontiguousarray(X_train, dtype=np.float32)) #add data(must be float32) to index
elapsed = time.time() - tstart
print('Completed buliding index in %d seconds' % int(elapsed))

#performance
X_test = np.ascontiguousarray(X_test, dtype=np.float32)
scores, neighbors = gpu_index.search(X_test, k=1) #return top1
# Convert the training labels once and look up every top-1 neighbour at once.
train_label_array = np.asarray(y_train)
y_pred = train_label_array[neighbors.ravel()].tolist() #label of top1
print ( 'Accuracy: %.6f'%accuracy_score(y_test.tolist(), y_pred))

#confusion matrix
# Use the fixed class list so row/column k always means class k, even when a
# class is never predicted.
labels = CLASS_LABELS
cm = confusion_matrix(y_test.tolist(), y_pred, labels=labels ) #rows/cols ordered N,S,V,F,Q
print (cm)
# Each metric is the diagonal entry divided by its row total (per-class recall);
# the value reported as "Specificity" is that ratio for class N.
metric_names = ['Specificity', 'Sensitivity of S', 'Sensitivity of V', 'Sensitivity of F', 'Sensitivity of Q']
for class_index, metric_name in enumerate(metric_names):
    row_total = cm[class_index].sum()
    # A class with no true samples has no defined recall; report nan without dividing by zero.
    metric_value = float(cm[class_index][class_index] / row_total) if row_total else float('nan')
    print ('%s: %.6f'%(metric_name, metric_value))

The shape of trainset is: (50995,360)
0    45841
2     3788
1      944
3      414
4        8
Name: 1, dtype: int64
The shape of testset is: (49687,360)
0    44235
2     3220
1     1837
3      388
4        7
Name: 1, dtype: int64
Completed buliding index in 1 seconds
Accuracy: 0.885423
[[41528  1207   666   809    25]
 [ 1728    88    11     6     4]
 [  649    44  2373   154     0]
 [  347     3    33     5     0]
 [    5     0     2     0     0]]
Specificity: 0.938804
Sensitivity of S: 0.047904
Sensitivity of V: 0.736957
Sensitivity of F: 0.012887
Sensitivity of Q: 0.000000


In [10]:
#AHA2MIT
#https://archive.physionet.org/physiobank/annotations.shtml

def labeltotext(val):
    """Return the class index (0=N, 1=S, 2=V, 3=F, 4=Q) for an annotation symbol.

    The position of each symbol group in the list below is the class index.
    Symbols found in no group produce None.
    """
    groups_by_class = [
        ['N', 'L', 'R', 'e', 'j'],  # 0: N
        ['A', 'a', 'J', 'S'],       # 1: S
        ['V', 'E'],                 # 2: V
        ['F'],                      # 3: F
        ['/', 'f', 'Q'],            # 4: Q
    ]
    matches = [idx for idx, members in enumerate(groups_by_class) if val in members]
    return matches[0] if matches else None
rootdir = '/data/fjsdata/ECG/AHA2MIT'
# Every distinct file stem in the directory is tried as a record name; sorting
# makes the processing order reproducible between runs.
filename = sorted(set(os.path.splitext(base)[0] for base in os.listdir(rootdir)))
# Annotation symbols that labeltotext() maps onto the five classes N/S/V/F/Q.
aha_beat_symbols = frozenset(['N','L','R','e','j','A','a','J','S','V','E','F','/','f','Q'])
aha_windows = [] #one window of left_len+right_len samples per kept beat
aha_classes = [] #class index of each kept beat
for bt in filename: #all records found in the AHA2MIT directory
    record_path = os.path.join(rootdir,str(bt))
    try:
        annotation = wfdb.rdann(record_path, 'atr')
        qrs_spl = annotation.sample #numpy.ndarray
        qrs_sym = annotation.symbol #list
        record = wfdb.rdrecord(record_path)
        lead_signal = record.p_signal[:,0] #only one lead
        max_len = record.sig_len #length of samples
    except Exception as err:
        # Best effort: stems that are not readable records are skipped, but
        # the reason is reported instead of being silently discarded.
        print('Skipping %s: %r' % (bt, err), file=sys.stderr)
        continue
    for pos, sym in zip(qrs_spl, qrs_sym):
        if sym not in aha_beat_symbols: #five classes of beats
            continue
        min_idx = pos-left_len
        max_idx = pos+right_len
        # Drop beats whose window would run past either end of the record.
        if min_idx < 0 or max_idx > max_len:
            continue
        aha_windows.append(lead_signal[min_idx:max_idx])
        aha_classes.append(labeltotext(sym))

if not aha_windows:
    raise RuntimeError('No test beats could be loaded from %s' % rootdir)
# Columns are built separately: np.array() on ragged [window, label] rows
# raises ValueError on recent NumPy versions.
teData = pd.DataFrame({0: aha_windows, 1: aha_classes}) #[QRS value, label]
X_test = pd.DataFrame(np.vstack(aha_windows)).fillna(0)
y_test = teData[1]
print('The shape of testset is: (%d,%d)'%(X_test.shape[0],X_test.shape[1]))
print(teData[1].value_counts())

# building index of trainset
tstart = time.time()
cpu_index = faiss.IndexFlatL2(right_len+left_len) #exact L2 index, one dimension per sample
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index) #make all gpu usable
gpu_index.add(np.ascontiguousarray(X_train, dtype=np.float32)) #add data(must be float32) to index
elapsed = time.time() - tstart
print('Completed buliding index in %d seconds' % int(elapsed))
#performance
X_test = np.ascontiguousarray(X_test, dtype=np.float32)
scores, neighbors = gpu_index.search(X_test, k=1) #return top1
# Convert the training labels once and look up every top-1 neighbour at once.
train_label_array = np.asarray(y_train)
y_pred = train_label_array[neighbors.ravel()].tolist() #label of top1
print ( 'Accuracy: %.6f'%accuracy_score(y_test.tolist(), y_pred))
#confusion matrix
# Use a fixed class list (0=N, 1=S, 2=V, 3=F, 4=Q) so row/column k always
# means class k, even when a class is never predicted.
labels = [0, 1, 2, 3, 4]
cm = confusion_matrix(y_test.tolist(), y_pred, labels=labels ) #rows/cols ordered N,S,V,F,Q
print (cm)
# Each metric is the diagonal entry divided by its row total (per-class recall);
# the value reported as "Specificity" is that ratio for class N.
metric_names = ['Specificity', 'Sensitivity of S', 'Sensitivity of V', 'Sensitivity of F', 'Sensitivity of Q']
for class_index, metric_name in enumerate(metric_names):
    row_total = cm[class_index].sum()
    # A class with no true samples has no defined recall; report nan without dividing by zero.
    metric_value = float(cm[class_index][class_index] / row_total) if row_total else float('nan')
    print ('%s: %.6f'%(metric_name, metric_value))

Completed buliding index in 1 seconds
Accuracy: 0.739895
[[244570  37574  25312   8600    436]
 [     0      0      0      0      0]
 [ 11079   3269  16322    498     16]
 [   607    334    273     50      2]
 [  2036    372   1314     13      8]]
Specificity: 0.772753
Sensitivity of S: nan
Sensitivity of V: 0.523409
Sensitivity of F: 0.039494
Sensitivity of Q: 0.002137


