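# txt 파일에 string으로 기록되어 있는 ground truth stage를 숫자로 변환
- Wake --> S0 --> 5
- REM --> REM --> 4
- N1 --> S1 --> 3
- N2 --> S2 --> 2
- N3 --> S3 --> 1

> **Note:** the `string2num` function in this notebook currently maps S0 → 0, S1 → 1, S2 → 2, S3 → 3, REM → 4, which does not match the table above. Confirm which mapping is intended before using the exported CSV files.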

In [1]:
import os 
import numpy as np
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Folder whose entries are the ground-truth staging text files
# (the listing printed later shows names such as 'LE004-Events.txt').
stage_path = 'D:\\USC\\Sleep dataset\\Samsung_data\\GROUND_TRUTH_STAGING'
# Folder whose entries are per-subject sub-folders; choose_prepared_data
# lists each sub-folder and looks for an entry containing 'edf'.
edf_path = 'D:\\USC\\Sleep dataset\\Samsung_data\\REGULAR_EDF'

# Choose the prepared data

## 1. Check whether each subject really has an edf file
- Some subjects do not have an edf file, so we need to exclude them.

## 2. Select subjects who have both ground_truth_hypnogram and edf 
- stage와 edf가 동시에 존재하는 파일만 선택해야 함
- 모든 stage file과 모든 edf 폴더의 이름을 비교
- 일치하는 글자 수가 5개 이상인 것만 선택


In [3]:
def choose_prepared_data(stage_names, edf_names, edf_dir=None):
    """Select the staging files whose subject also has an EDF recording.

    A subject folder counts as "having an EDF" when at least one entry
    inside it contains the substring 'edf'. A staging file is kept when
    the first six characters of its name agree with the first six
    characters of such a folder name in at least five positions.

    Args:
        stage_names: Staging file names (e.g. 'LE004-Events.txt').
        edf_names: Names of the subject folders located in ``edf_dir``.
            When None, ``edf_dir`` is listed here instead.
        edf_dir: Directory that contains the subject folders. Defaults
            to the module-level ``edf_path``.

    Returns:
        The matching staging file names, in the order they appear in
        ``stage_names``; each occurrence is reported at most once.
    """
    if edf_dir is None:
        edf_dir = edf_path
    if edf_names is None:
        edf_names = os.listdir(edf_dir)

    # 1. Collect the six-character prefix of every subject folder that
    #    really holds an EDF file. any() stops at the first hit, so a
    #    folder with several EDF files is recorded once, not once per file.
    edf_prefixes = []
    for edf_subject in edf_names:
        subject_dir = os.path.join(edf_dir, edf_subject)
        if not os.path.isdir(subject_dir):
            continue  # ignore stray files lying next to the subject folders
        if any('edf' in name for name in os.listdir(subject_dir)):
            prefix = edf_subject[0:6]
            if prefix not in edf_prefixes:
                edf_prefixes.append(prefix)

    # 2. Keep the staging files whose prefix matches one of those folders.
    choose_sub = []
    for stage_name in stage_names:
        stage_prefix = stage_name[0:6]
        for edf_prefix in edf_prefixes:
            # zip() stops at the shorter prefix, so names with fewer than
            # six characters are compared safely instead of raising.
            matches = sum(a == b for a, b in zip(stage_prefix, edf_prefix))
            if matches >= 5:
                choose_sub.append(stage_name)
                break  # one matching folder is enough; avoids duplicates

    return choose_sub

In [4]:
# Directory listings: staging text files first, then the per-subject EDF folders
stage_names, edf_names = os.listdir(stage_path), os.listdir(edf_path)

# Keep only the staging files that choose_prepared_data pairs with an EDF folder
choose_sub = choose_prepared_data(stage_names, edf_names)
print(choose_sub)

['LE004-Events.txt', 'LE010-Events.txt', 'LE011-Events.txt', 'LE014-Events.txt', 'LE019-Events.txt', 'LE020-Events.txt', 'LE021-Events.txt', 'LE022-Events.txt', 'LE023-Events.txt']


# Read the staging files in txt and extract only staging results

- Read the txt files listed in `choose_sub`, i.e. the prepared data

In [5]:
def ReadTxtFiles(choose_sub, stage_path, header_lines=18):
    """Read staging text files and return the stage label of every record.

    The first ``header_lines`` lines of each file are skipped. Every
    following non-blank line is stripped and split on tabs; the first
    field is then split on '-' and its second part is taken as the stage
    label (the notebook output shows labels such as 'S0' and 'REM').

    Args:
        choose_sub: Names of the staging files to read.
        stage_path: Directory that contains those files.
        header_lines: Number of leading lines to discard in each file.

    Returns:
        A list with one entry per file, each a list of stage-label
        strings in file order.

    Raises:
        IndexError: If a non-blank data line has no '-' in its first
            tab-separated field.
    """
    datas = []
    for stage_name in choose_sub:
        with open(os.path.join(stage_path, stage_name), 'r') as f:
            lines = f.readlines()[header_lines:]

        stages = []
        for line in lines:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing empty line) carry no stage
                # and would otherwise fail on the [1] lookup below.
                continue
            first_field = record.split('\t')[0]
            stages.append(first_field.split('-')[1])

        datas.append(stages)
    return datas

In [6]:
# Parse every selected staging file into a list of stage-label strings
# (one inner list per file in choose_sub) and show the result.
hypnos_string = ReadTxtFiles(choose_sub, stage_path)
print(hypnos_string)

[['S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S0', 'S1', 'S1', 'S1', 'S1', 'S1', 'S1', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S2', 'S3', 'S2', 'S2', 'S3', 'S2', 'S2', 'S3', 'S3', 'S2', 'S2', 'S3', 'S3', 'S3', 'S3', 'S3', 'S2', 'S1', 'S1', 'S3', 'S2', 'S3', 'S2', 'S1', 'S3', 'S2', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S3', 'S2', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 

In [7]:
def string2num(hypno_string):
    """Translate one hypnogram of stage labels into integer codes.

    The codes are S0 -> 0, S1 -> 1, S2 -> 2, S3 -> 3 and REM -> 4.
    A label outside this set raises KeyError.

    Args:
        hypno_string: Iterable of stage labels for a single hypnogram.

    Returns:
        A list with the integer code of each label, in input order.
    """
    # lookup table from stage label to integer code
    code_of = {"S0": 0, "REM": 4, "S1": 1, "S2": 2, "S3": 3}
    return [code_of[label] for label in hypno_string]

def S2N(hypno_string):
    """Rename the stage labels of one hypnogram to W / R / N1 / N2 / N3.

    S0 becomes 'W', REM becomes 'R', and S1, S2, S3 become 'N1', 'N2',
    'N3'. A label outside this set raises KeyError.

    Args:
        hypno_string: Iterable of stage labels for a single hypnogram.

    Returns:
        A list with the renamed label of each input label, in order.
    """
    renamed = {'S0':'W', 'REM':'R', 'S1':'N1', 'S2':'N2', 'S3':'N3'}
    return [renamed[label] for label in hypno_string]

In [8]:
# Turn every string hypnogram into its numeric form, one list per subject
hypnos_num = [string2num(stage_labels) for stage_labels in hypnos_string]

print("{} hypnograms are included".format(len(hypnos_num)))

9 hypnograms are included


In [9]:
# Convert each hypnogram into pd.DataFrame and save it as csv file
import pandas as pd

# Write each numeric hypnogram to its own CSV file with a single 'stages'
# column. The file name is the first five characters of the staging file
# name (e.g. 'LE004' from 'LE004-Events.txt').
output_dir = 'D:\\USC\\test_data\\Prepared_InNum_Hypnos\\'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders

# zip keeps every hypnogram paired with the staging file it was read from,
# replacing the manual index counter into choose_sub.
for stage_name, hypno_num in zip(choose_sub, hypnos_num):
    temp_df = pd.DataFrame(hypno_num, columns=['stages'])
    path_csv = output_dir + stage_name[0:5] + '.csv'
    # index=False is the documented way to omit the row index
    temp_df.to_csv(path_or_buf=path_csv, index=False)