In [5]:
import os
import warnings
from datetime import timedelta
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df_nurse_record = pd.read_csv("/Volumes/Seagate/new_nursing_0909.csv")
df_nurse_record = df_nurse_record[df_nurse_record['RecordUnit'] == 'ICUC']


unique_numbers_list = set(df_nurse_record['AlsUnitNo'].unique().tolist())
pkl_list = set([int(i.replace('.pkl', '')) for i in os.listdir('/Volumes/Seagate/pkl/') if i.endswith('.pkl')])

patient_id_list = list(pkl_list & unique_numbers_list)

In [None]:
# v1
# 
import pickle as pkl
import pandas as pd
from datetime import timedelta

def preprocess_data(patient_id):
    pickle = pkl.load(open(f'/Volumes/Seagate/pkl/{patient_id}.pkl', 'rb'))
    icuc = pickle[pickle['NURSING_RecordUnit']=='ICUC']

    # 필요한 컬럼 리스트
    required_columns = [
        'AlsUnitNo', 'TimeStamp', 'DatSeq',
        'Label', 'Severity',
        'ABP_WAVEFORM', 'ECG_WAVEFORM', 'PPG_WAVEFORM', 'RESP_WAVEFORM',
        'ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec',
        'SpO2_numeric', 'Pulse_numeric', 'ST_numeric', 'Tskin_numeric', 
        'ABP_numeric', 'NBP_numeric', 'HR_numeric', 'RR_numeric',
        'SpO2_numeric_time_diff_sec', 'Pulse_numeric_time_diff_sec', 
        'ST_numeric_time_diff_sec', 'Tskin_numeric_time_diff_sec', 
        'ABP_numeric_time_diff_sec', 'NBP_numeric_time_diff_sec', 
        'HR_numeric_time_diff_sec', 'RR_numeric_time_diff_sec',
    ]
    
    # 모든 필요한 컬럼을 순서대로 생성
    filtered = pd.DataFrame()
    for col in required_columns:
        if col in icuc.columns:
            filtered[col] = icuc[col].copy()
        else:
            filtered[col] = pd.NA
            print(f"Warning: Missing column: {col}")
    
    # Severity 처리
    filtered['Severity'] = filtered['Severity'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)
    filtered.insert(
        filtered.columns.get_loc('Severity')+1, 
        'SeverityColor', 
        filtered['Severity'].map({0: "Red", 1:"Yellow", 2:"ShortYellow", 3:"SevereCyan", 4:"Cyan", 5:"SilentCyan", 6:"White"})
    )

    filtered_nursing = df_nurse_record[df_nurse_record['AlsUnitNo'] == patient_id][
        ['TimeStamp', 'AssessmentNm', 'ImplementationNm', 'AttributeNm', 'AttributeDetail', 'AttributeDetailValue', 'AttributeDetailValue2', 'InDateTime', 'OutDateTime']
    ].sort_values('TimeStamp')

    # TimeStamp를 datetime으로 변환
    if filtered['TimeStamp'].dtype == 'object':
        filtered['TimeStamp'] = pd.to_datetime(filtered['TimeStamp'])
    if filtered_nursing['TimeStamp'].dtype == 'object':
        filtered_nursing['TimeStamp'] = pd.to_datetime(filtered_nursing['TimeStamp'])

    # 컬럼명 매핑
    column_mapping = {
        'TimeStamp': '시행일시',
        'AssessmentNm': '간호진단프로토콜(코드명)',
        'ImplementationNm': '간호활동(코드명)',
        'AttributeNm': '간호속성코드(코드명)',
        'AttributeDetail': '간호속성명칭',
        'AttributeDetailValue': '속성',
        'AttributeDetailValue2': '속성Text',
        'InDateTime': '입원시간',
        'OutDateTime': '퇴원기간'
    }

    # 앞 뒤로 둘 다 하나씩은 있어야함
    def get_nursing_records_optimized():
        result = []
        for idx, row in filtered.iterrows():
            timestamp = row['TimeStamp']
            start_time = timestamp - timedelta(minutes=30)
            end_time = timestamp + timedelta(minutes=30)
            
            # 이전 30분 레코드 확인
            before_mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] < timestamp)
            before_records = filtered_nursing[before_mask]
            
            # 이후 30분 레코드 확인
            after_mask = (filtered_nursing['TimeStamp'] > timestamp) & (filtered_nursing['TimeStamp'] <= end_time)
            after_records = filtered_nursing[after_mask]
            
            # 한 쪽이라도 비어있으면 []
            if before_records.empty or after_records.empty:
                result.append([])
            else:
                # 전체 범위 레코드 가져오기
                mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] <= end_time)
                records = filtered_nursing[mask]
                result.append(records.rename(columns=column_mapping).to_dict('records'))
        
        return result


    filtered['NursingRecords_ba30'] = get_nursing_records_optimized()

    # 필터링 조건 적용 - 존재하는 컬럼만 체크
    waveform_cols = ['ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec']
    existing_waveform_cols = [col for col in waveform_cols if col in filtered.columns and filtered[col].notna().any()]
    
    # 조건 생성
    conditions = []
    
    # Waveform 컬럼 조건 (존재하는 컬럼만)
    if existing_waveform_cols:
        conditions.append((filtered[existing_waveform_cols] <= 60).all(axis=1))
    
    # NursingRecords 조건
    conditions.append(filtered['NursingRecords_ba30'].apply(lambda x: len(x) > 0))
    
    # 모든 조건 적용
    if conditions:
        final_mask = conditions[0]
        for condition in conditions[1:]:
            final_mask = final_mask & condition
        filtered = filtered[final_mask].copy()
    
    
    # 라벨 특수문자 제거
    filtered['Label'] = filtered['Label'].apply(lambda x: [item.replace('  ', ' ').replace('?', '').replace('!', '').strip() for item in x] if isinstance(x, list) else x)
    filtered['isView'] = True # 뷰어에선 안 보이고, False로 처리하고싶은 애들 (김정민 교수님 필터 참고해서 수정해야함.)
    
    # -------
    filtered['isSelected'] = False # 뷰어에서 T/F 가 선택되었나요? 의 여부
    filtered['isAlarm'] = False
    filtered['Comment'] = ''

    return filtered

In [13]:
# v2
import pickle as pkl
import pandas as pd
from datetime import timedelta

def preprocess_data(patient_id):
    pickle = pkl.load(open(f'/Volumes/Seagate/pkl/{patient_id}.pkl', 'rb'))
    icuc = pickle[pickle['NURSING_RecordUnit']=='ICUC']

    # 필요한 컬럼 리스트
    required_columns = [
        'AlsUnitNo', 'TimeStamp',
        'Label', 'Severity',
        'ABP_WAVEFORM', 'ECG_WAVEFORM', 'PPG_WAVEFORM', 'RESP_WAVEFORM',
        'ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec',
        'SpO2_numeric', 'Pulse_numeric', 'ST_numeric', 'Tskin_numeric', 
        'ABP_numeric', 'NBP_numeric', 'HR_numeric', 'RR_numeric',
        'SpO2_numeric_time_diff_sec', 'Pulse_numeric_time_diff_sec', 
        'ST_numeric_time_diff_sec', 'Tskin_numeric_time_diff_sec', 
        'ABP_numeric_time_diff_sec', 'NBP_numeric_time_diff_sec', 
        'HR_numeric_time_diff_sec', 'RR_numeric_time_diff_sec',
    ]
    
    # 모든 필요한 컬럼을 순서대로 생성
    filtered = pd.DataFrame()
    for col in required_columns:
        if col in icuc.columns:
            filtered[col] = icuc[col].copy()
        else:
            filtered[col] = pd.NA
            print(f"Warning: Missing column: {col}")
    
    # Severity 처리
    filtered['Severity'] = filtered['Severity'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)
    filtered.insert(
        filtered.columns.get_loc('Severity')+1, 
        'SeverityColor', 
        filtered['Severity'].map({0: "Red", 1:"Yellow", 2:"ShortYellow", 3:"SevereCyan", 4:"Cyan", 5:"SilentCyan", 6:"White"})
    )

    filtered_nursing = df_nurse_record[df_nurse_record['AlsUnitNo'] == patient_id][
        ['TimeStamp', 'AssessmentNm', 'ImplementationNm', 'AttributeNm', 'AttributeDetail', 'AttributeDetailValue', 'AttributeDetailValue2', 'InDateTime', 'OutDateTime']
    ].sort_values('TimeStamp')

    # TimeStamp를 datetime으로 변환
    if filtered['TimeStamp'].dtype == 'object':
        filtered['TimeStamp'] = pd.to_datetime(filtered['TimeStamp'])
    if filtered_nursing['TimeStamp'].dtype == 'object':
        filtered_nursing['TimeStamp'] = pd.to_datetime(filtered_nursing['TimeStamp'])

    # 컬럼명 매핑 (InDateTime, OutDateTime 제외)
    column_mapping = {
        'TimeStamp': '시행일시',
        'AssessmentNm': '간호진단프로토콜(코드명)',
        'ImplementationNm': '간호활동(코드명)',
        'AttributeNm': '간호속성코드(코드명)',
        'AttributeDetail': '간호속성명칭',
        'AttributeDetailValue': '속성',
        'AttributeDetailValue2': '속성Text'
    }

    # 앞 뒤로 둘 다 하나씩은 있어야함
    def get_nursing_records_optimized():
        result = []
        admission_in_list = []
        admission_out_list = []
        
        for idx, row in filtered.iterrows():
            timestamp = row['TimeStamp']
            start_time = timestamp - timedelta(minutes=30)
            end_time = timestamp + timedelta(minutes=30)
            
            # 이전 30분 레코드 확인
            before_mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] < timestamp)
            before_records = filtered_nursing[before_mask]
            
            # 이후 30분 레코드 확인
            after_mask = (filtered_nursing['TimeStamp'] > timestamp) & (filtered_nursing['TimeStamp'] <= end_time)
            after_records = filtered_nursing[after_mask]
            
            # 한 쪽이라도 비어있으면 []
            if before_records.empty or after_records.empty:
                result.append([])
                admission_in_list.append('')
                admission_out_list.append('')
            else:
                # 전체 범위 레코드 가져오기
                mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] <= end_time)
                records = filtered_nursing[mask]
                
                # 첫 번째 레코드의 InDateTime, OutDateTime 값 (단일 값)
                in_value = records['InDateTime'].iloc[0] if len(records) > 0 and pd.notna(records['InDateTime'].iloc[0]) else ''
                out_value = records['OutDateTime'].iloc[0] if len(records) > 0 and pd.notna(records['OutDateTime'].iloc[0]) else ''
                
                admission_in_list.append(in_value)
                admission_out_list.append(out_value)
                
                # NursingRecords에는 InDateTime, OutDateTime 제외하고 넣기
                records_without_admission = records.drop(columns=['InDateTime', 'OutDateTime'])
                result.append(records_without_admission.rename(columns=column_mapping).to_dict('records'))
        
        return result, admission_in_list, admission_out_list

    # 함수 호출 및 할당
    nursing_records, admission_ins, admission_outs = get_nursing_records_optimized()
    filtered['NursingRecords_ba30'] = nursing_records
    filtered['AdmissionIn'] = admission_ins
    filtered['AdmissionOut'] = admission_outs

    # 필터링 조건 적용 - 존재하는 컬럼만 체크
    waveform_cols = ['ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec']
    existing_waveform_cols = [col for col in waveform_cols if col in filtered.columns and filtered[col].notna().any()]
    
    # 조건 생성
    conditions = []
    
    # Waveform 컬럼 조건 (존재하는 컬럼만)
    if existing_waveform_cols:
        conditions.append((filtered[existing_waveform_cols] <= 60).all(axis=1))
    
    # NursingRecords 조건
    conditions.append(filtered['NursingRecords_ba30'].apply(lambda x: len(x) > 0))
    
    # 모든 조건 적용
    if conditions:
        final_mask = conditions[0]
        for condition in conditions[1:]:
            final_mask = final_mask & condition
        filtered = filtered[final_mask].copy()
    
    # 라벨 특수문자 제거
    filtered['Label'] = filtered['Label'].apply(lambda x: [item.replace('  ', ' ').replace('?', '').replace('!', '').strip() for item in x] if isinstance(x, list) else x)
    
    # 나머지 컬럼들
    filtered['isView'] = True # 뷰어에선 안 보이고, False로 처리하고싶은 애들 (김정민 교수님 필터 참고해서 수정해야함.)
    filtered['isSelected'] = False # 뷰어에서 T/F 가 선택되었나요? 의 여부
    filtered['Classification'] = False
    filtered['Comment'] = ''

    return filtered

In [20]:
# v3
import pickle as pkl
import pandas as pd
from datetime import timedelta
from collections import Counter

# 기술적 알람 목록 로드
def load_technical_alarms(file_path="Filtered_AlarmLabelList.txt"):
    """기술적 알람 목록을 파일에서 로드하여 정규화"""
    technical_alarms = set()
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        
        for line in lines:
            line = line.strip()
            if not line:  # 빈 줄 건너뛰기
                continue
            
            # 슬래시로 구분된 여러 라벨 처리
            if "/" in line:
                labels = [label.strip() for label in line.split("/")]
            else:
                labels = [line]
            
            # 각 라벨을 정규화하여 저장
            for label in labels:
                if label:  # 빈 문자열이 아닌 경우만
                    normalized_label = label.lower().strip().replace(" ", "")
                    if normalized_label:
                        technical_alarms.add(normalized_label)
        
        print(f"기술적 알람 목록 로드 완료: {len(technical_alarms)}개 라벨")
        
    except Exception as e:
        print(f"기술적 알람 목록 로드 오류: {e}")
    
    return technical_alarms

def normalize_alarm_label(label):
    """알람 라벨을 비교를 위해 정규화"""
    if not label:
        return ""
    return str(label).lower().replace(" ", "").strip()

def is_only_technical_alarms(label_list):
    """Label 리스트가 기술적 알람만 포함하는지 확인"""
    if not label_list or not isinstance(label_list, list):
        return False
    
    # 빈 리스트나 None만 있는 경우
    valid_labels = [label for label in label_list if label and str(label).strip() not in ["None", "[]", ""]]
    if not valid_labels:
        return False
    
    # 모든 라벨이 기술적 알람인지 확인
    for label in valid_labels:
        normalized_label = normalize_alarm_label(label)
        if normalized_label not in TECHNICAL_ALARMS:
            return False  # 하나라도 임상적 알람이 있으면 False
    
    return True  # 모두 기술적 알람인 경우만 True

# 전역 변수로 기술적 알람 목록 로드
TECHNICAL_ALARMS = load_technical_alarms()

def preprocess_data(patient_id):
    pickle = pkl.load(open(f'/Volumes/Seagate/pkl/{patient_id}.pkl', 'rb'))
    icuc = pickle[pickle['NURSING_RecordUnit']=='ICUC']

    # 필요한 컬럼 리스트
    required_columns = [
        'AlsUnitNo', 'TimeStamp',
        'Label', 'Severity',
        'ABP_WAVEFORM', 'ECG_WAVEFORM', 'PPG_WAVEFORM', 'RESP_WAVEFORM',
        'ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec',
        'SpO2_numeric', 'Pulse_numeric', 'ST_numeric', 'Tskin_numeric', 
        'ABP_numeric', 'NBP_numeric', 'HR_numeric', 'RR_numeric',
        'SpO2_numeric_time_diff_sec', 'Pulse_numeric_time_diff_sec', 
        'ST_numeric_time_diff_sec', 'Tskin_numeric_time_diff_sec', 
        'ABP_numeric_time_diff_sec', 'NBP_numeric_time_diff_sec', 
        'HR_numeric_time_diff_sec', 'RR_numeric_time_diff_sec',
    ]
    
    # 모든 필요한 컬럼을 순서대로 생성
    filtered = pd.DataFrame()
    for col in required_columns:
        if col in icuc.columns:
            filtered[col] = icuc[col].copy()
        else:
            filtered[col] = pd.NA
            print(f"Warning: Missing column: {col}")
    
    # Severity 처리
    filtered['Severity'] = filtered['Severity'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)
    filtered.insert(
        filtered.columns.get_loc('Severity')+1, 
        'SeverityColor', 
        filtered['Severity'].map({0: "Red", 1:"Yellow", 2:"ShortYellow", 3:"SevereCyan", 4:"Cyan", 5:"SilentCyan", 6:"White"})
    )

    filtered_nursing = df_nurse_record[df_nurse_record['AlsUnitNo'] == patient_id][
        ['TimeStamp', 'AssessmentNm', 'ImplementationNm', 'AttributeNm', 'AttributeDetail', 'AttributeDetailValue', 'AttributeDetailValue2', 'InDateTime', 'OutDateTime']
    ].sort_values('TimeStamp')

    # TimeStamp를 datetime으로 변환
    if filtered['TimeStamp'].dtype == 'object':
        filtered['TimeStamp'] = pd.to_datetime(filtered['TimeStamp'])
    if filtered_nursing['TimeStamp'].dtype == 'object':
        filtered_nursing['TimeStamp'] = pd.to_datetime(filtered_nursing['TimeStamp'])

    # 컬럼명 매핑 (InDateTime, OutDateTime 제외)
    column_mapping = {
        'TimeStamp': '시행일시',
        'AssessmentNm': '간호진단프로토콜(코드명)',
        'ImplementationNm': '간호활동(코드명)',
        'AttributeNm': '간호속성코드(코드명)',
        'AttributeDetail': '간호속성명칭',
        'AttributeDetailValue': '속성',
        'AttributeDetailValue2': '속성Text'
    }

    # 앞 뒤로 둘 다 하나씩은 있어야함
    def get_nursing_records_optimized():
        result = []
        admission_in_list = []
        admission_out_list = []
        
        for idx, row in filtered.iterrows():
            timestamp = row['TimeStamp']
            start_time = timestamp - timedelta(minutes=30)
            end_time = timestamp + timedelta(minutes=30)
            
            # 이전 30분 레코드 확인
            before_mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] < timestamp)
            before_records = filtered_nursing[before_mask]
            
            # 이후 30분 레코드 확인
            after_mask = (filtered_nursing['TimeStamp'] > timestamp) & (filtered_nursing['TimeStamp'] <= end_time)
            after_records = filtered_nursing[after_mask]
            
            # 한 쪽이라도 비어있으면 []
            if before_records.empty or after_records.empty:
                result.append([])
                admission_in_list.append('')
                admission_out_list.append('')
            else:
                # 전체 범위 레코드 가져오기
                mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] <= end_time)
                records = filtered_nursing[mask]
                
                # 첫 번째 레코드의 InDateTime, OutDateTime 값 (단일 값)
                in_value = records['InDateTime'].iloc[0] if len(records) > 0 and pd.notna(records['InDateTime'].iloc[0]) else ''
                out_value = records['OutDateTime'].iloc[0] if len(records) > 0 and pd.notna(records['OutDateTime'].iloc[0]) else ''
                
                admission_in_list.append(in_value)
                admission_out_list.append(out_value)
                
                # NursingRecords에는 InDateTime, OutDateTime 제외하고 넣기
                records_without_admission = records.drop(columns=['InDateTime', 'OutDateTime'])
                result.append(records_without_admission.rename(columns=column_mapping).to_dict('records'))
        
        return result, admission_in_list, admission_out_list

    # 함수 호출 및 할당
    nursing_records, admission_ins, admission_outs = get_nursing_records_optimized()
    filtered['NursingRecords_ba30'] = nursing_records
    filtered['AdmissionIn'] = admission_ins
    filtered['AdmissionOut'] = admission_outs

    # 필터링 조건 적용 - 존재하는 컬럼만 체크
    waveform_cols = ['ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec']
    waveform_cols += ['SpO2_numeric_time_diff_sec', 'Pulse_numeric_time_diff_sec', 
        'ST_numeric_time_diff_sec', 'Tskin_numeric_time_diff_sec', 
        'ABP_numeric_time_diff_sec', 'NBP_numeric_time_diff_sec', 
        'HR_numeric_time_diff_sec', 'RR_numeric_time_diff_sec',]
    existing_waveform_cols = [col for col in waveform_cols if col in filtered.columns and filtered[col].notna().any()]
    
    # 조건 생성
    conditions = []
    
    # Waveform 컬럼 조건 (존재하는 컬럼만)
    if existing_waveform_cols:
        conditions.append((filtered[existing_waveform_cols] <= 60).all(axis=1))
    
    # NursingRecords 조건
    conditions.append(filtered['NursingRecords_ba30'].apply(lambda x: len(x) > 0))
    
    # 모든 조건 적용
    if conditions:
        final_mask = conditions[0]
        for condition in conditions[1:]:
            final_mask = final_mask & condition
        filtered = filtered[final_mask].copy()
    
    # 라벨 특수문자 제거
    filtered['Label'] = filtered['Label'].apply(lambda x: [item.replace('  ', ' ').replace('?', '').replace('!', '').strip() for item in x] if isinstance(x, list) else x)
    
    # 기술적 알람 필터링 전 통계 수집
    before_count = len(filtered)
    technical_alarm_counter = Counter()
    
    # 제거될 기술적 알람들 카운팅
    for idx, row in filtered.iterrows():
        if is_only_technical_alarms(row['Label']):
            if isinstance(row['Label'], list):
                for label in row['Label']:
                    if label and str(label).strip() not in ["None", "[]", ""]:
                        technical_alarm_counter[label] += 1
    
    # 기술적 알람만 있는 행 필터링
    filtered = filtered[~filtered['Label'].apply(is_only_technical_alarms)].copy()
    
    # 필터링 후 통계 출력
    after_count = len(filtered)
    removed_count = before_count - after_count
    
    print(f"\n=== 기술적 알람 필터링 결과 ===")
    print(f"필터링 전 데이터: {before_count}개")
    print(f"필터링 후 데이터: {after_count}개")
    print(f"제거된 데이터: {removed_count}개")
    
    if technical_alarm_counter:
        print(f"\n제거된 기술적 알람 종류별 개수:")
        for alarm, count in technical_alarm_counter.most_common():
            print(f"  - {alarm}: {count}개")
    
    # 나머지 컬럼들
    filtered['isView'] = True # 뷰어에선 안 보이고, False로 처리하고싶은 애들 (김정민 교수님 필터 참고해서 수정해야함.)
    filtered['isSelected'] = False # 뷰어에서 T/F 가 선택되었나요? 의 여부
    filtered['Classification'] = False
    filtered['Comment'] = ''

    return filtered

기술적 알람 목록 로드 완료: 55개 라벨


In [None]:
# v4
import pickle as pkl
import pandas as pd
from datetime import timedelta
from collections import Counter

# 기술적 알람 목록 로드
def load_technical_alarms(file_path="Filtered_AlarmLabelList.txt"):
    """기술적 알람 목록을 파일에서 로드하여 정규화"""
    technical_alarms = set()
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        
        for line in lines:
            line = line.strip()
            if not line:  # 빈 줄 건너뛰기
                continue
            
            # 슬래시로 구분된 여러 라벨 처리
            if "/" in line:
                labels = [label.strip() for label in line.split("/")]
            else:
                labels = [line]
            
            # 각 라벨을 정규화하여 저장
            for label in labels:
                if label:  # 빈 문자열이 아닌 경우만
                    normalized_label = label.lower().strip().replace(" ", "")
                    if normalized_label:
                        technical_alarms.add(normalized_label)
        
        print(f"기술적 알람 목록 로드 완료: {len(technical_alarms)}개 라벨")
        
    except Exception as e:
        print(f"기술적 알람 목록 로드 오류: {e}")
    
    return technical_alarms

def normalize_alarm_label(label):
    """알람 라벨을 비교를 위해 정규화"""
    if not label:
        return ""
    return str(label).lower().replace(" ", "").strip()

def is_only_technical_alarms(label_list):
    """Label 리스트가 기술적 알람만 포함하는지 확인"""
    if not label_list or not isinstance(label_list, list):
        return False
    
    # 빈 리스트나 None만 있는 경우
    valid_labels = [label for label in label_list if label and str(label).strip() not in ["None", "[]", ""]]
    if not valid_labels:
        return False
    
    # 모든 라벨이 기술적 알람인지 확인
    for label in valid_labels:
        normalized_label = normalize_alarm_label(label)
        if normalized_label not in TECHNICAL_ALARMS:
            return False  # 하나라도 임상적 알람이 있으면 False
    
    return True  # 모두 기술적 알람인 경우만 True

# 전역 변수로 기술적 알람 목록 로드
TECHNICAL_ALARMS = load_technical_alarms()

def preprocess_data(patient_id):
    pickle = pkl.load(open(f'/Volumes/Seagate/pkl/{patient_id}.pkl', 'rb'))
    icuc = pickle[pickle['NURSING_RecordUnit']=='ICUC']

    # 필요한 컬럼 리스트
    required_columns = [
        'AlsUnitNo', 'TimeStamp',
        'Label', 'Severity',
        'ABP_WAVEFORM', 'ECG_WAVEFORM', 'PPG_WAVEFORM', 'RESP_WAVEFORM',
        'ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec',
        'SpO2_numeric', 'Pulse_numeric', 'ST_numeric', 'Tskin_numeric', 
        'ABP_numeric', 'NBP_numeric', 'HR_numeric', 'RR_numeric',
        'SpO2_numeric_time_diff_sec', 'Pulse_numeric_time_diff_sec', 
        'ST_numeric_time_diff_sec', 'Tskin_numeric_time_diff_sec', 
        'ABP_numeric_time_diff_sec', 'NBP_numeric_time_diff_sec', 
        'HR_numeric_time_diff_sec', 'RR_numeric_time_diff_sec',
    ]
    
    # 모든 필요한 컬럼을 순서대로 생성
    filtered = pd.DataFrame()
    for col in required_columns:
        if col in icuc.columns:
            filtered[col] = icuc[col].copy()
        else:
            filtered[col] = pd.NA
            print(f"Warning: Missing column: {col}")
    
    # Severity 처리
    filtered['Severity'] = filtered['Severity'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)
    filtered.insert(
        filtered.columns.get_loc('Severity')+1, 
        'SeverityColor', 
        filtered['Severity'].map({0: "Red", 1:"Yellow", 2:"ShortYellow", 3:"SevereCyan", 4:"Cyan", 5:"SilentCyan", 6:"White"})
    )

    filtered_nursing = df_nurse_record[df_nurse_record['AlsUnitNo'] == patient_id][
        ['TimeStamp', 'AssessmentNm', 'ImplementationNm', 'AttributeNm', 'AttributeDetail', 'AttributeDetailValue', 'AttributeDetailValue2', 'InDateTime', 'OutDateTime']
    ].sort_values('TimeStamp')

    # TimeStamp를 datetime으로 변환
    if filtered['TimeStamp'].dtype == 'object':
        filtered['TimeStamp'] = pd.to_datetime(filtered['TimeStamp'])
    if filtered_nursing['TimeStamp'].dtype == 'object':
        filtered_nursing['TimeStamp'] = pd.to_datetime(filtered_nursing['TimeStamp'])

    # 컬럼명 매핑 (InDateTime, OutDateTime 제외)
    column_mapping = {
        'TimeStamp': '시행일시',
        'AssessmentNm': '간호진단프로토콜(코드명)',
        'ImplementationNm': '간호활동(코드명)',
        'AttributeNm': '간호속성코드(코드명)',
        'AttributeDetail': '간호속성명칭',
        'AttributeDetailValue': '속성',
        'AttributeDetailValue2': '속성Text'
    }

    # 앞 뒤로 둘 다 하나씩은 있어야함
    def get_nursing_records_optimized():
        result = []
        admission_in_list = []
        admission_out_list = []
        in_out_consistency_list = []  # In/Out 시간 일관성 체크 리스트 추가
        
        for idx, row in filtered.iterrows():
            timestamp = row['TimeStamp']
            start_time = timestamp - timedelta(minutes=30)
            end_time = timestamp + timedelta(minutes=30)
            
            # 이전 30분 레코드 확인
            before_mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] < timestamp)
            before_records = filtered_nursing[before_mask]
            
            # 이후 30분 레코드 확인
            after_mask = (filtered_nursing['TimeStamp'] > timestamp) & (filtered_nursing['TimeStamp'] <= end_time)
            after_records = filtered_nursing[after_mask]
            
            # 한 쪽이라도 비어있으면 []
            if before_records.empty or after_records.empty:
                result.append([])
                admission_in_list.append('')
                admission_out_list.append('')
                in_out_consistency_list.append(False)  # 데이터가 없으므로 일관성도 False
            else:
                # 전체 범위 레코드 가져오기
                mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] <= end_time)
                records = filtered_nursing[mask]
                
                # In/Out 시간 일관성 체크
                unique_in_times = records['InDateTime'].dropna().unique()
                unique_out_times = records['OutDateTime'].dropna().unique()
                
                # 모든 간호기록의 InDateTime과 OutDateTime이 각각 동일한지 확인
                # NaN이 아닌 값들 중에서 unique 값이 1개만 있어야 일관성 있음
                is_consistent = (len(unique_in_times) <= 1) and (len(unique_out_times) <= 1)
                in_out_consistency_list.append(is_consistent)
                
                # 첫 번째 레코드의 InDateTime, OutDateTime 값 (단일 값)
                in_value = records['InDateTime'].iloc[0] if len(records) > 0 and pd.notna(records['InDateTime'].iloc[0]) else ''
                out_value = records['OutDateTime'].iloc[0] if len(records) > 0 and pd.notna(records['OutDateTime'].iloc[0]) else ''
                
                admission_in_list.append(in_value)
                admission_out_list.append(out_value)
                
                # NursingRecords에는 InDateTime, OutDateTime 제외하고 넣기
                records_without_admission = records.drop(columns=['InDateTime', 'OutDateTime'])
                result.append(records_without_admission.rename(columns=column_mapping).to_dict('records'))
        
        return result, admission_in_list, admission_out_list, in_out_consistency_list

    # 함수 호출 및 할당
    nursing_records, admission_ins, admission_outs, in_out_consistency = get_nursing_records_optimized()
    filtered['NursingRecords_ba30'] = nursing_records
    filtered['AdmissionIn'] = admission_ins
    filtered['AdmissionOut'] = admission_outs
    filtered['InOutConsistent'] = in_out_consistency  # 일관성 플래그 추가

    # In/Out 시간 일관성 필터링 전 통계
    inconsistent_count = (~filtered['InOutConsistent']).sum()
    print(f"\n=== In/Out 시간 일관성 체크 ===")
    print(f"In/Out 시간이 일관되지 않은 알람 개수: {inconsistent_count}개")

    # 필터링 조건 적용 - 존재하는 컬럼만 체크
    waveform_cols = ['ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec']
    waveform_cols += ['SpO2_numeric_time_diff_sec', 'Pulse_numeric_time_diff_sec', 
        'ST_numeric_time_diff_sec', 'Tskin_numeric_time_diff_sec', 
        'ABP_numeric_time_diff_sec', 'NBP_numeric_time_diff_sec', 
        'HR_numeric_time_diff_sec', 'RR_numeric_time_diff_sec',]
    existing_waveform_cols = [col for col in waveform_cols if col in filtered.columns and filtered[col].notna().any()]
    
    # 조건 생성
    conditions = []
    
    # Waveform 컬럼 조건 (존재하는 컬럼만)
    if existing_waveform_cols:
        conditions.append((filtered[existing_waveform_cols] <= 60).all(axis=1))
    
    # NursingRecords 조건
    conditions.append(filtered['NursingRecords_ba30'].apply(lambda x: len(x) > 0))
    
    # In/Out 시간 일관성 조건 추가
    conditions.append(filtered['InOutConsistent'])
    
    # 모든 조건 적용
    if conditions:
        final_mask = conditions[0]
        for condition in conditions[1:]:
            final_mask = final_mask & condition
        filtered = filtered[final_mask].copy()
    
    # InOutConsistent 컬럼 제거 (최종 데이터에는 필요 없음)
    filtered = filtered.drop(columns=['InOutConsistent'])
    
    # 라벨 특수문자 제거
    filtered['Label'] = filtered['Label'].apply(lambda x: [item.replace('  ', ' ').replace('?', '').replace('!', '').strip() for item in x] if isinstance(x, list) else x)
    
    # # 기술적 알람 필터링 전 통계 수집
    # before_count = len(filtered)
    # technical_alarm_counter = Counter()
    
    # # 제거될 기술적 알람들 카운팅
    # for idx, row in filtered.iterrows():
    #     if is_only_technical_alarms(row['Label']):
    #         if isinstance(row['Label'], list):
    #             for label in row['Label']:
    #                 if label and str(label).strip() not in ["None", "[]", ""]:
    #                     technical_alarm_counter[label] += 1
    
    # # 기술적 알람만 있는 행 필터링
    # filtered = filtered[~filtered['Label'].apply(is_only_technical_alarms)].copy()
    
    # # 필터링 후 통계 출력
    # after_count = len(filtered)
    # removed_count = before_count - after_count
    
    # print(f"\n=== 기술적 알람 필터링 결과 ===")
    # print(f"필터링 전 데이터: {before_count}개")
    # print(f"필터링 후 데이터: {after_count}개")
    # print(f"제거된 데이터: {removed_count}개")
    
    # if technical_alarm_counter:
    #     print(f"\n제거된 기술적 알람 종류별 개수:")
    #     for alarm, count in technical_alarm_counter.most_common():
    #         print(f"  - {alarm}: {count}개")
    
    # 나머지 컬럼들
    filtered['isView'] = True # 뷰어에선 안 보이고, False로 처리하고싶은 애들 (김정민 교수님 필터 참고해서 수정해야함.)
    filtered['isSelected'] = False # 뷰어에서 T/F 가 선택되었나요? 의 여부
    filtered['Classification'] = False
    filtered['Comment'] = ''

    return filtered

기술적 알람 목록 로드 완료: 55개 라벨


In [7]:
import pickle as pkl
import os

def save_processed_data(patient_id, output_dir='/Volumes/Seagate/pre_processed1/'):
    filtered = preprocess_data(patient_id)
    
    if len(filtered) > 0:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        output_path = os.path.join(output_dir, f'{patient_id}_processed.pkl')
        with open(output_path, 'wb') as f:
            pkl.dump(filtered, f)
        print(f"{patient_id}: {len(filtered)} rows saved")
    else:
        print(f"{patient_id}: No rows, skipped")

def batch_process(patient_ids):
    for patient_id in patient_ids:
        try:
            save_processed_data(patient_id)
        except Exception as e:
            print(f"{patient_id}: Error - {e}")


for i in patient_id_list:
    try:
        filtered = preprocess_data(i)
        if len(filtered) > 0:
            with open(f'/Volumes/Seagate/pre_processed1/{i}.pkl', 'wb') as f:
                pkl.dump(filtered, f)
            print(f"{i}: {len(filtered)} rows saved")
        else:
            print(f"{i}: No rows, skipped")
    except Exception as e:
        print(f"{i}: Error - {e}")


=== In/Out 시간 일관성 체크 ===
In/Out 시간이 일관되지 않은 알람 개수: 0.0개

=== 기술적 알람 필터링 결과 ===
필터링 전 데이터: 0개
필터링 후 데이터: 0개
제거된 데이터: 0개
8660993: No rows, skipped

=== In/Out 시간 일관성 체크 ===
In/Out 시간이 일관되지 않은 알람 개수: 44개

=== 기술적 알람 필터링 결과 ===
필터링 전 데이터: 8개
필터링 후 데이터: 6개
제거된 데이터: 2개

제거된 기술적 알람 종류별 개수:
  - PPV bad ABP Signal: 2개
11526147: 6 rows saved

=== In/Out 시간 일관성 체크 ===
In/Out 시간이 일관되지 않은 알람 개수: 0.0개

=== 기술적 알람 필터링 결과 ===
필터링 전 데이터: 0개
필터링 후 데이터: 0개
제거된 데이터: 0개
11466763: No rows, skipped

=== In/Out 시간 일관성 체크 ===
In/Out 시간이 일관되지 않은 알람 개수: 0.0개

=== 기술적 알람 필터링 결과 ===
필터링 전 데이터: 0개
필터링 후 데이터: 0개
제거된 데이터: 0개
1949715: No rows, skipped

=== In/Out 시간 일관성 체크 ===
In/Out 시간이 일관되지 않은 알람 개수: 66개

=== 기술적 알람 필터링 결과 ===
필터링 전 데이터: 0개
필터링 후 데이터: 0개
제거된 데이터: 0개
11319320: No rows, skipped

=== In/Out 시간 일관성 체크 ===
In/Out 시간이 일관되지 않은 알람 개수: 0.0개

=== 기술적 알람 필터링 결과 ===
필터링 전 데이터: 0개
필터링 후 데이터: 0개
제거된 데이터: 0개
11448345: No rows, skipped

=== In/Out 시간 일관성 체크 ===
In/Out 시간이 일관되지 않은 알람 개수: 2168개

=== 기술적 알람 필터링 결과 ==

In [8]:
import os
import pickle as pkl
import pandas as pd
from IPython.display import display

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

def load_processed_data(patient_id, data_dir='/Volumes/Seagate/pre_processed1/'):
    file_path = os.path.join(data_dir, f'{patient_id}.pkl')
    with open(file_path, 'rb') as f:
        data = pkl.load(f)
    return data

a = load_processed_data(10138773, data_dir='DATA/')
display(a)

Unnamed: 0,AlsUnitNo,TimeStamp,Label,Severity,SeverityColor,ABP_WAVEFORM,ECG_WAVEFORM,PPG_WAVEFORM,RESP_WAVEFORM,ABP_time_diff_sec,II_time_diff_sec,Pleth_time_diff_sec,Resp_time_diff_sec,SpO2_numeric,Pulse_numeric,ST_numeric,Tskin_numeric,ABP_numeric,NBP_numeric,HR_numeric,RR_numeric,SpO2_numeric_time_diff_sec,Pulse_numeric_time_diff_sec,ST_numeric_time_diff_sec,Tskin_numeric_time_diff_sec,ABP_numeric_time_diff_sec,NBP_numeric_time_diff_sec,HR_numeric_time_diff_sec,RR_numeric_time_diff_sec,NursingRecords_ba30,AdmissionIn,AdmissionOut,isView,isSelected,Classification,Comment
42,10138773,2024-06-19 19:37:00.757,"[HR 111 >100, NBPs 71 <90]",2,ShortYellow,"[-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50...","[0.55, 0.59, 0.585, 0.525, 0.465, 0.505, 0.54,...","[0.3232421875, 0.3193359375, 0.31298828125, 0....","[1.7501527183872938, 1.7336591325595603, 1.686...",30.72,0.0,0.0,0.0,99.9,106.0,-0.2,,,49.0,104.0,18.0,0.19,0.19,0.19,0.19,32.48,36.757,0.19,0.19,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,
43,10138773,2024-06-19 19:37:05.877,"[NBPs 71 <90, Cannot Analyze QT]",1,Yellow,"[-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50...","[0.43, 0.43, 0.42, 0.435, 0.455, 0.455, 0.44, ...","[0.59814453125, 0.58935546875, 0.58154296875, ...","[-0.7513744654856445, -0.7513744654856445, -0....",25.6,0.0,0.0,0.0,99.9,106.0,-0.2,,,49.0,107.0,18.0,0.19,0.19,0.19,0.19,27.36,41.877,0.19,0.19,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,
44,10138773,2024-06-19 19:37:10.997,"[HR 111 >100, Tskin NoTransducer, Temp NoTra...",2,ShortYellow,"[-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50...","[0.43, 0.45, 0.435, 0.435, 0.425, 0.44, 0.435,...","[-0.13623046875, -0.140625, -0.1455078125, -0....","[0.27550397067806964, 0.2180818570555895, 0.16...",20.48,0.0,0.0,0.0,99.9,106.0,0.0,,,49.0,106.0,16.0,0.19,0.19,0.19,0.19,22.24,46.997,0.19,0.19,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,
45,10138773,2024-06-19 19:37:16.117,[NBPs 68 <90],1,Yellow,"[-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50...","[0.53, 0.5, 0.49, 0.54, 0.56, 0.52, 0.485, 0.5...","[-0.48828125, -0.494140625, -0.4970703125, -0....","[-0.11606597434331094, -0.11973121563836286, -...",15.36,0.0,0.0,0.0,99.2,106.0,0.0,,,50.0,105.0,18.0,0.19,0.19,0.19,0.19,17.12,48.883,0.19,0.19,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,
46,10138773,2024-06-19 19:37:21.237,[Temp Deactivated],4,Cyan,"[-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50...","[0.66, 0.595, 0.625, 0.73, 0.665, 0.57, 0.6, 0...","[-0.0185546875, -0.03369140625, -0.04443359375...","[-0.39706780696395844, -0.4129505192425168, -0...",10.24,0.0,0.0,0.0,98.1,106.0,0.0,,,50.0,104.0,16.0,0.48,0.48,0.48,1.214,12.0,43.763,0.48,0.48,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,
47,10138773,2024-06-19 19:37:26.357,[Cannot Analyze QT],5,SilentCyan,"[-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50...","[0.59, 0.56, 0.525, 0.565, 0.62, 0.6, 0.535, 0...","[0.18505859375, 0.18896484375, 0.1943359375, 0...","[0.8338423946243128, 0.8356750152718387, 0.840...",5.12,0.0,0.0,0.0,98.4,104.0,0.0,,,50.0,106.0,16.0,0.48,0.48,0.48,6.334,6.88,38.643,0.48,0.48,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,
48,10138773,2024-06-19 19:37:31.477,[ABP Zero+Check Cal],5,SilentCyan,"[-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50...","[0.525, 0.515, 0.45, 0.48, 0.515, 0.475, 0.41,...","[0.05908203125, 0.06201171875, 0.06640625, 0.0...","[-0.31093463653023823, -0.3213194868662187, -0...",0.0,0.0,0.0,0.0,99.0,105.0,0.0,,,50.0,106.0,16.0,0.48,0.48,0.48,11.454,1.76,33.523,0.48,0.48,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,
53,10138773,2024-06-19 19:38:07.317,[HR 111 >100],2,ShortYellow,"[16.4375, 16.25, 16.0625, 15.875, 15.8125, 15....","[0.695, 0.705, 0.65, 0.645, 0.68, 0.68, 0.665,...","[0.1171875, 0.12451171875, 0.130859375, 0.1357...","[-0.3335369578497251, -0.3323152107513745, -0....",0.0,0.0,0.0,0.0,98.9,105.0,-0.1,,,50.0,107.0,17.0,0.32,0.32,0.32,25.28,0.32,2.317,0.32,0.32,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,
59,10138773,2024-06-19 19:38:48.277,[Cannot Analyze QT],5,SilentCyan,"[24.3125, 24.1875, 24.0625, 24.0, 23.9375, 23....","[0.45, 0.435, 0.38, 0.395, 0.41, 0.4, 0.385, 0...","[-0.0498046875, -0.05029296875, -0.04931640625...","[0.9517409896151496, 0.9804520464263897, 1.012...",0.0,0.0,0.0,0.0,98.6,103.0,0.0,,,50.0,104.0,17.0,0.32,0.32,0.32,0.32,0.32,43.277,0.32,0.32,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,
61,10138773,2024-06-19 19:38:58.517,[Cannot Analyze QT],5,SilentCyan,"[22.4375, 22.625, 22.75, 22.875, 23.0, 23.0, 2...","[0.65, 0.65, 0.715, 0.735, 0.64, 0.625, 0.725,...","[0.1806640625, 0.1376953125, 0.09912109375, 0....","[1.4599877825290164, 1.4557116676847892, 1.445...",0.0,0.0,0.0,0.0,98.0,105.0,0.1,,,50.0,104.0,18.0,0.32,0.32,0.32,0.32,0.32,53.517,0.32,0.32,"[{'시행일시': 2024-06-19 19:20:00, '간호진단프로토콜(코드명)'...",2024-06-19 19:16:08,2024-06-22 15:10:52,True,False,False,


2