In [6]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

# 지정된 폴더 경로
folder_path = r"/home/guno/guno/label_data"

# XML 파일 목록을 가져옴
xml_files = [f for f in os.listdir(folder_path) if f.endswith('.xml')]

# 데이터를 저장할 리스트
data = []

# 각 XML 파일을 열고 <Duration> 값을 추출
for xml_file in xml_files:
    file_path = os.path.join(folder_path, xml_file)
    
    # XML 파싱
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    # <PatientInfo> 내의 <Duration> 태그 찾기
    duration = root.findtext('PatientInfo/Duration')
    
    # 파일명과 Duration 값을 리스트에 추가
    data.append({
        'Filename': xml_file,
        'Duration': duration
    })

# DataFrame으로 변환
df = pd.DataFrame(data)

# Filename을 기준으로 DataFrame 정렬
df = df.sort_values(by='Filename').reset_index(drop=True)
df

Unnamed: 0,Filename,Duration
0,155_10_74915608.xml,23:59:00
1,155_1_74123360.xml,23:40:00
2,155_2_74773385.xml,23:59:00
3,155_3_74895083.xml,23:59:00
4,155_4_71282493.xml,23:59:00
5,155_5_71072418.xml,23:59:00
6,155_6_73892199.xml,23:59:00
7,155_7_73455754.xml,22:40:00
8,155_8_74003505.xml,23:56:00
9,155_9_74003505.xml,23:59:00


In [9]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
import pickle
import xml.etree.ElementTree as ET

# Directory containing the CSV files and XML files
directory_path = r'/home/guno/guno/label_data'

# XML 파일 목록을 가져옴
xml_files = [f for f in os.listdir(directory_path) if f.endswith('.xml')]

# XML에서 Duration 데이터를 추출하고 딕셔너리에 저장
duration_dict = {}

for xml_file in xml_files:
    file_path = os.path.join(directory_path, xml_file)
    
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    # <PatientInfo> 내의 <Duration> 태그 찾기
    duration = root.findtext('PatientInfo/Duration')
    
    if duration:
        # Duration을 시, 분, 초로 변환
        h, m, s = map(int, duration.split(':'))
        duration_timedelta = timedelta(hours=h, minutes=m, seconds=s)
        
        # 파일명에서 앞부분 추출 (예: '155_10')
        file_prefix = '_'.join(xml_file.split('_')[:2])
        
        # 딕셔너리에 저장
        duration_dict[file_prefix] = duration_timedelta

# Function to process each file and generate labels
def process_file(file_path, start_datetime, duration):
    # Load the CSV file
    data = pd.read_csv(file_path)
    
    # Parse the date and time into a single datetime column
    data['datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])
    
    # Calculate the total duration of the data from the XML duration
    end_datetime = start_datetime + duration
    total_duration = end_datetime - start_datetime
    
    # Estimate the number of segments
    segment_duration = timedelta(seconds=10)
    total_segments = int(total_duration / segment_duration)
    
    # Create the segments of 10 seconds each
    segments = []
    labels = []

    current_start_time = start_datetime

    # Segment the data with tqdm progress bar
    for _ in range(total_segments):
        current_end_time = current_start_time + segment_duration
        segment_data = data[(data['datetime'] >= current_start_time) & (data['datetime'] < current_end_time)]
        
        # Determine the label for the segment
        if not segment_data.empty:
            event_labels = segment_data['Event'].unique()
            if len(event_labels) > 1:
                segment_label = ','.join(map(str, sorted(event_labels)))
            else:
                segment_label = event_labels[0]
        else:
            segment_label = 'normal'
        
        segments.append([current_start_time, current_end_time])
        labels.append(segment_label)
        
        current_start_time = current_end_time  # Update the start time for the next segment

    # Convert to ndarray
    labels_ndarray = np.array(labels)
    
    return labels_ndarray

# List to store labels and filenames
all_labels_with_filenames = []

# Iterate over each file in the directory
for i, file_name in enumerate(tqdm(os.listdir(directory_path))):
    if file_name.endswith('.csv'):
        file_path = os.path.join(directory_path, file_name)
        
        # 파일명에서 앞부분 추출 (예: '155_10')
        file_prefix = '_'.join(file_name.split('_')[:2])
        
        if file_prefix in duration_dict:
            # 파일 이름에서 확장자 제거
            file_name_without_extension = file_name.split('.')[0]
            
            # CSV 파일 이름에서 날짜 및 시간 부분 추출
            date_str = file_name_without_extension.split('_')[3]  # Extract the date part
            start_time_str = file_name_without_extension.split('_')[4]  # Extract the time part

            # start_datetime 생성
            start_datetime = datetime.strptime(date_str + start_time_str, '%Y%m%d%H%M%S')

            # XML에서 추출한 duration을 사용
            duration = duration_dict[file_prefix]

            # Labels 생성
            labels_ndarray = process_file(file_path, start_datetime, duration)

            # Append a dictionary containing filename, number of segments, and labels
            all_labels_with_filenames.append({
                "filename": file_name,
                "num_segments": len(labels_ndarray),
                "labels": labels_ndarray.tolist()  # Convert ndarray to list for easier viewing
            })

# Save the combined labels with filenames to a pickle file
output_pickle_path = r'/home/guno/guno/combined_labels.pkl'
with open(output_pickle_path, 'wb') as f:
    pickle.dump(all_labels_with_filenames, f)

print(f"Labels from all files have been saved to {output_pickle_path}")


100%|██████████| 20/20 [00:36<00:00,  1.85s/it]


In [14]:
### pickle file load 해서 확인하기

import pickle

# Path to the pickle file
pickle_file_path = r'/home/guno/guno/combined_labels.pkl'

# Code to load and display the pickle file in the desired format
with open(output_pickle_path, 'rb') as f:
    all_labels_with_filenames = pickle.load(f)

# Display the contents in the desired format
for i, file_info in enumerate(all_labels_with_filenames):
    print(f"File {i+1}: {{'filename': '{file_info['filename']}', 'num_segments': {file_info['num_segments']}}}")
    print(f"Labels: {file_info['labels']}\n")

File 1: {'filename': '155_2_74773385_20070302_100500.csv', 'num_segments': 8634}
Labels: ['New Shape,Tachycardia', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'norm

In [13]:
### file 1 데이터만 따로 확인하기

import pickle

# Path to the pickle file
pickle_file_path = r'/home/guno/guno/combined_labels.pkl'

# Code to load the pickle file
with open(pickle_file_path, 'rb') as f:
    all_labels_with_filenames = pickle.load(f)

# Display the contents for File 1
file_index = 0  # Index for File 1 (0-based index, so 0 corresponds to File 1)
file_info = all_labels_with_filenames[file_index]

print(f"File {file_index + 1}: {{'filename': '{file_info['filename']}', 'num_segments': {file_info['num_segments']}}}")
print(f"Labels: {file_info['labels']}\n")


File 1: {'filename': '155_2_74773385_20070302_100500.csv', 'num_segments': 8634}
Labels: ['New Shape,Tachycardia', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'norm