In [1]:
from os import listdir
from itertools import combinations
import pandas as pd
import numpy as np
import subprocess
import tensorflow as tf
import utils as utl
#from collections import Counter

import sys
sys.stdin.encoding

'UTF-8'

### Load wearable sensor-data

In [2]:
base_dir = 'data/'
remove_files = ('.ipynb_checkpoints', 'code_tester.ipynb','.idea', '.cache')

# sensor data files
datalist = listdir(base_dir + 'sensors') 
datalist = [x for x in datalist if x not in remove_files] #포함시키지 않을 경로명 제외하고 리스트 생성
#print(datalist)

pid_list = [x.split('_')[0] for x in datalist]
pid_list = list(set(pid_list))
pid_list.sort()

In [3]:
PList = []
for pid in pid_list:

    activity = '_activity_intraday.csv'
    HR = '_HR_intraday.csv'
    PList.append({'pid': pid[4:6], 'HR': pid+HR, 'act': pid+activity})


### Load medical diagnosis data (Blood test)

In [4]:
diagnosis = pd.read_csv(base_dir + 'label/diagnosis.csv')
diagnosis.head()

Unnamed: 0,Serial#,Group,DOB,age,gender,visit,Ht,Lab_date,freeT4
0,1,1,1973-01-03,43,2,1,162.0,2016-11-10,3.43
1,1,1,1973-01-03,43,2,2,162.0,2016-12-16,1.8
2,1,1,1973-01-03,43,2,3,162.0,2017-01-06,1.61
3,1,1,1973-01-03,43,2,4,162.0,2017-02-03,1.49
4,2,1,1976-12-11,39,2,1,166.0,2016-11-18,1.67


## Pre-processing

creating each input as 
* pre 5-day Heart Rate sequence by second
* pre 5-day Activity sequence by second
* interpolation to eliminate data loss
* age, gender, Height


## Dataset Class: hr_data

In [5]:
'''
Helper functions to trim sequence data using lab date (blood test date)
5 days * 24 hours * 60 mins = 7200 sequences ordered by timestamp
HRseq = [66, 67, 76, 64, 90, 114, ...]
Actseq = [0.667, 0.667, ..., 3.252, 6.554, ...] # calories
'''

#for x in range(1,2): # embedded number of id
def getDiagByPid(diagnosis, pid):
    ret = []
    idx = diagnosis['Serial#']==pid
    selected = diagnosis[:][idx]
    for index, row in selected.iterrows():
        item = {}
        for key in row.keys():
            #print(key)
            item[key] = row[key]
        ret.append(item)
    return ret    
def getSeqByDate(seq, lab_date):
    # (Lab date - 5 DAY) 보다 sequence date 기록이 적은 경우, labdate 후 5일 seq 반환
    
    
    end_date = pd.Timestamp(lab_date)
    start_date = end_date - pd.Timedelta('10 day')
    
    try:
        seq_tmp = seq[start_date:end_date]
    except:
        return 'null', None
    
    if len(seq_tmp) == 0:
        start_date = end_date
        end_date = start_date + pd.Timedelta('10 day')
        seq_tmp = seq[start_date:end_date]
        if len(seq_tmp) == 0:
            return 'null', None
        else:
            return 'before', seq_tmp
    else:
        return 'after', seq_tmp
    
def getSeqByPid(basedir, seqlist, pid):
    print(pid)
    '''
    Return HR, Activity Sequences by Pid
    ({'Timestamp': timestamp, 'Value': value})
    '''
    #find HR file name
    candidate = [x for x in seqlist if x['pid']==pid]
        
    hrfile = basedir + 'sensors/' + candidate[0]['HR']
    actfile = basedir + 'sensors/' + candidate[0]['act']
    
    # HR: 5 seconds -> 1 minute average aggregation
    hrseq = pd.read_csv(hrfile, header=0, parse_dates=['HEART RATE DATE/TIME'], index_col = 'HEART RATE DATE/TIME', usecols = ['HEART RATE DATE/TIME', 'VALUE'])
    hrseq = hrseq.resample('1T').mean().sort_index()
    actseq = pd.read_csv(actfile, header=0, quotechar ='\"', parse_dates=["ACTIVITY DATE/TIME"],index_col="ACTIVITY DATE/TIME", usecols=["ACTIVITY DATE/TIME", "CALORIES"])
    actseq = actseq.sort_index()
    return hrseq, actseq
    
    
class hr_data:
    def __init__(self, seq, diagnosis):
        self.seq_list = seq
        self.diag = diagnosis
        
        # pid_list: 1, 2, 3, 5, 6, 7, ...
        pid_list = [x['pid'] for x in self.seq_list]
        
        self.dataset = []
        # 환자별로 시퀀스 생성
        for pid in pid_list:
            pid_int = int(pid)
            for medical_record in getDiagByPid(self.diag, pid_int):
                HR_pd, ACT_pd = getSeqByPid('data/', self.seq_list, pid)
                
                # Input Data
                date = medical_record['Lab_date']
                age = medical_record['age']
                gender = medical_record['gender']
                height = medical_record['Ht']
                s_type, HRseq = getSeqByDate(HR_pd, date)
                s_type2, ACTseq = getSeqByDate(ACT_pd, date)
                
                if s_type != s_type2 or s_type == 'null' or s_type2 == 'null':
                    continue
                
                # Label
                freeT4 = medical_record['freeT4']
                categorical = [age, gender, height, s_type]
                
                x1 = HRseq[:-1]
                x2 = ACTseq[:-1]
                if len(x1) == 7200 and len(x2) == 7200:
                    self.dataset.append({'categorical':categorical, 'HR': x1, 'ACT': x2, 'freeT4':freeT4})
                else:
                    continue
    def fillna(self):
        for record in self.dataset:
            record['HR'] = record['HR'].fillna(0)
            record['ACT'] = record['ACT'].fillna(0)
    

In [6]:
# Dataset instance

dataset = hr_data(PList, diagnosis)
dataset.fillna()

01
01
01
01
02
02
02
04
04
04
04
05
05
05
05
06
06
07
07
07
07
07
08
08
08
09
09
09
09
10
10
10
10
10
11
11
11
11
11
11
13
13
13
13
13
14
14
14
14
15
15
15
15
15
16
16
16
16
17
17
17
18
18
18
18
19
19
19
19
20
20
20
20
21
21
21
21
22
22
22
22
23
23
23
23
24
24
24
24
25
25
25
25
26
26
26
27
27
27
27
28
28
28
28
29
29
29
29
30
30
30
31
31
31
32
32
32
33
33
33
34
34
34
35
35
35
36
36
36
37
37
37
38
38
38
39
39
39
40
40
40


# Input

<img src="img/input.png" style="height:80px">

Save this data into files "X.dat, Y.dat" to load fast in training step

In [5]:
X = [ (x['HR'].values, x['ACT'].values) for x in dataset.dataset]
X = np.array(X).reshape([96, 14400])
Y = [ x['freeT4'] for x in dataset.dataset]
Y = np.array(Y)

NameError: name 'dataset' is not defined

In [16]:
X.tofile('X.dat')
Y.tofile('Y.dat')

## Test to load pre-processed data

In [5]:
X = np.fromfile('X.dat', dtype=float).reshape([96,14400])
Y = np.fromfile('Y.dat', dtype=float)

In [6]:
train_x, val_x, test_x, train_y, val_y, test_y = utl.train_val_test_split(X, Y, split_frac=0.80)
print("Data Set Size")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

Data Set Size
Train set: 		(76, 14400) 
Validation set: 	(10, 14400) 
Test set: 		(10, 14400)
