In [2]:
import os
import warnings
from datetime import timedelta
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

drive_location = 'E:'

df_nurse_record = pd.read_csv(f"{drive_location}/new_nursing_0909.csv")
df_nurse_record = df_nurse_record[df_nurse_record['RecordUnit'] == 'ICUC']


unique_numbers_list = set(df_nurse_record['AlsUnitNo'].unique().tolist())
pkl_list = set([int(i.replace('.pkl', '')) for i in os.listdir(f'{drive_location}/pkl/') if i.endswith('.pkl')])

patient_id_list = list(pkl_list & unique_numbers_list)

In [3]:
# v4
import pickle as pkl
import pandas as pd
from datetime import timedelta
from collections import Counter
import re

def label_normalization(text: str = ''):
    text = text.replace('?', '').replace('!', '')
    text = text.replace('>', ' > ').replace('<', ' < ')
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text

# Í∏∞Ïà†Ï†Å ÏïåÎûå Î™©Î°ù Î°úÎìú
def load_technical_alarms(file_path="Filtered_AlarmLabelList.txt"):
    """Í∏∞Ïà†Ï†Å ÏïåÎûå Î™©Î°ùÏùÑ ÌååÏùºÏóêÏÑú Î°úÎìúÌïòÏó¨ Ï†ïÍ∑úÌôî"""
    technical_alarms = set()
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        
        for line in lines:
            line = line.strip()
            if not line:  # Îπà Ï§Ñ Í±¥ÎÑàÎõ∞Í∏∞
                continue
            
            # Ïä¨ÎûòÏãúÎ°ú Íµ¨Î∂ÑÎêú Ïó¨Îü¨ ÎùºÎ≤® Ï≤òÎ¶¨
            if "/" in line:
                labels = [label.strip() for label in line.split("/")]
            else:
                labels = [line]
            
            # Í∞Å ÎùºÎ≤®ÏùÑ Ï†ïÍ∑úÌôîÌïòÏó¨ Ï†ÄÏû•
            for label in labels:
                if label:  # Îπà Î¨∏ÏûêÏó¥Ïù¥ ÏïÑÎãå Í≤ΩÏö∞Îßå
                    normalized_label = label.lower().strip().replace(" ", "")
                    if normalized_label:
                        technical_alarms.add(normalized_label)
        
        print(f"Í∏∞Ïà†Ï†Å ÏïåÎûå Î™©Î°ù Î°úÎìú ÏôÑÎ£å: {len(technical_alarms)}Í∞ú ÎùºÎ≤®")
        
    except Exception as e:
        print(f"Í∏∞Ïà†Ï†Å ÏïåÎûå Î™©Î°ù Î°úÎìú Ïò§Î•ò: {e}")
    
    return technical_alarms

def normalize_alarm_label(label):
    """ÏïåÎûå ÎùºÎ≤®ÏùÑ ÎπÑÍµêÎ•º ÏúÑÌï¥ Ï†ïÍ∑úÌôî"""
    if not label:
        return ""
    return str(label).lower().replace(" ", "").strip()

def is_only_technical_alarms(label_list):
    """Label Î¶¨Ïä§Ìä∏Í∞Ä Í∏∞Ïà†Ï†Å ÏïåÎûåÎßå Ìè¨Ìï®ÌïòÎäîÏßÄ ÌôïÏù∏"""
    if not label_list or not isinstance(label_list, list):
        return False
    
    # Îπà Î¶¨Ïä§Ìä∏ÎÇò NoneÎßå ÏûàÎäî Í≤ΩÏö∞
    valid_labels = [label for label in label_list if label and str(label).strip() not in ["None", "[]", ""]]
    if not valid_labels:
        return False
    
    # Î™®Îì† ÎùºÎ≤®Ïù¥ Í∏∞Ïà†Ï†Å ÏïåÎûåÏù∏ÏßÄ ÌôïÏù∏
    for label in valid_labels:
        normalized_label = normalize_alarm_label(label)
        if normalized_label not in TECHNICAL_ALARMS:
            return False  # ÌïòÎÇòÎùºÎèÑ ÏûÑÏÉÅÏ†Å ÏïåÎûåÏù¥ ÏûàÏúºÎ©¥ False
    
    return True  # Î™®Îëê Í∏∞Ïà†Ï†Å ÏïåÎûåÏù∏ Í≤ΩÏö∞Îßå True

# Ï†ÑÏó≠ Î≥ÄÏàòÎ°ú Í∏∞Ïà†Ï†Å ÏïåÎûå Î™©Î°ù Î°úÎìú
TECHNICAL_ALARMS = load_technical_alarms()

def preprocess_data(patient_id):
    pickle = pkl.load(open(f'{drive_location}/pkl/{patient_id}.pkl', 'rb'))
    icuc = pickle[pickle['NURSING_RecordUnit']=='ICUC']

    # ÌïÑÏöîÌïú Ïª¨Îüº Î¶¨Ïä§Ìä∏
    required_columns = [
        'AlsUnitNo', 'TimeStamp',
        'Label', 'Severity',
        'ABP_WAVEFORM', 'ECG_WAVEFORM', 'PPG_WAVEFORM', 'RESP_WAVEFORM',
        'ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec',
        'SpO2_numeric', 'Pulse_numeric', 'ST_numeric', 'Tskin_numeric', 
        'ABP_numeric', 'NBP_numeric', 'HR_numeric', 'RR_numeric',
        'SpO2_numeric_time_diff_sec', 'Pulse_numeric_time_diff_sec', 
        'ST_numeric_time_diff_sec', 'Tskin_numeric_time_diff_sec', 
        'ABP_numeric_time_diff_sec', 'NBP_numeric_time_diff_sec', 
        'HR_numeric_time_diff_sec', 'RR_numeric_time_diff_sec',
    ]
    
    # Î™®Îì† ÌïÑÏöîÌïú Ïª¨ÎüºÏùÑ ÏàúÏÑúÎåÄÎ°ú ÏÉùÏÑ±
    filtered = pd.DataFrame()
    for col in required_columns:
        if col in icuc.columns:
            filtered[col] = icuc[col].copy()
        else:
            filtered[col] = pd.NA
            print(f"Warning: Missing column: {col}")
    
    # Severity Ï≤òÎ¶¨
    filtered['Severity'] = filtered['Severity'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)
    filtered.insert(
        filtered.columns.get_loc('Severity')+1, 
        'SeverityColor', 
        filtered['Severity'].map({0: "Red", 1:"Yellow", 2:"ShortYellow", 3:"SevereCyan", 4:"Cyan", 5:"SilentCyan", 6:"White"})
    )

    filtered_nursing = df_nurse_record[df_nurse_record['AlsUnitNo'] == patient_id][
        ['TimeStamp', 'AssessmentNm', 'ImplementationNm', 'AttributeNm', 'AttributeDetail', 'AttributeDetailValue', 'AttributeDetailValue2', 'InDateTime', 'OutDateTime']
    ].sort_values('TimeStamp')

    # TimeStampÎ•º datetimeÏúºÎ°ú Î≥ÄÌôò
    if filtered['TimeStamp'].dtype == 'object':
        filtered['TimeStamp'] = pd.to_datetime(filtered['TimeStamp'])
    if filtered_nursing['TimeStamp'].dtype == 'object':
        filtered_nursing['TimeStamp'] = pd.to_datetime(filtered_nursing['TimeStamp'])

    # Ïª¨ÎüºÎ™Ö Îß§Ìïë (InDateTime, OutDateTime Ï†úÏô∏)
    column_mapping = {
        'TimeStamp': 'ÏãúÌñâÏùºÏãú',
        'AssessmentNm': 'Í∞ÑÌò∏ÏßÑÎã®ÌîÑÎ°úÌÜ†ÏΩú(ÏΩîÎìúÎ™Ö)',
        'ImplementationNm': 'Í∞ÑÌò∏ÌôúÎèô(ÏΩîÎìúÎ™Ö)',
        'AttributeNm': 'Í∞ÑÌò∏ÏÜçÏÑ±ÏΩîÎìú(ÏΩîÎìúÎ™Ö)',
        'AttributeDetail': 'Í∞ÑÌò∏ÏÜçÏÑ±Î™ÖÏπ≠',
        'AttributeDetailValue': 'ÏÜçÏÑ±',
        'AttributeDetailValue2': 'ÏÜçÏÑ±Text'
    }

    # Ïïû Îí§Î°ú Îëò Îã§ ÌïòÎÇòÏî©ÏùÄ ÏûàÏñ¥ÏïºÌï®
    def get_nursing_records_optimized():
        result = []
        admission_in_list = []
        admission_out_list = []
        in_out_consistency_list = []  # In/Out ÏãúÍ∞Ñ ÏùºÍ¥ÄÏÑ± Ï≤¥ÌÅ¨ Î¶¨Ïä§Ìä∏ Ï∂îÍ∞Ä
        
        for idx, row in filtered.iterrows():
            timestamp = row['TimeStamp']
            start_time = timestamp - timedelta(minutes=30)
            end_time = timestamp + timedelta(minutes=30)
            
            # Ïù¥Ï†Ñ 30Î∂Ñ Î†àÏΩîÎìú ÌôïÏù∏
            before_mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] < timestamp)
            before_records = filtered_nursing[before_mask]
            
            # Ïù¥ÌõÑ 30Î∂Ñ Î†àÏΩîÎìú ÌôïÏù∏
            after_mask = (filtered_nursing['TimeStamp'] > timestamp) & (filtered_nursing['TimeStamp'] <= end_time)
            after_records = filtered_nursing[after_mask]
            
            # Ìïú Ï™ΩÏù¥ÎùºÎèÑ ÎπÑÏñ¥ÏûàÏúºÎ©¥ []
            if before_records.empty or after_records.empty:
                result.append([])
                admission_in_list.append('')
                admission_out_list.append('')
                in_out_consistency_list.append(False)  # Îç∞Ïù¥ÌÑ∞Í∞Ä ÏóÜÏúºÎØÄÎ°ú ÏùºÍ¥ÄÏÑ±ÎèÑ False
            else:
                # Ï†ÑÏ≤¥ Î≤îÏúÑ Î†àÏΩîÎìú Í∞ÄÏ†∏Ïò§Í∏∞
                mask = (filtered_nursing['TimeStamp'] >= start_time) & (filtered_nursing['TimeStamp'] <= end_time)
                records = filtered_nursing[mask]
                
                # In/Out ÏãúÍ∞Ñ ÏùºÍ¥ÄÏÑ± Ï≤¥ÌÅ¨
                unique_in_times = records['InDateTime'].dropna().unique()
                unique_out_times = records['OutDateTime'].dropna().unique()
                
                # Î™®Îì† Í∞ÑÌò∏Í∏∞Î°ùÏùò InDateTimeÍ≥º OutDateTimeÏù¥ Í∞ÅÍ∞Å ÎèôÏùºÌïúÏßÄ ÌôïÏù∏
                # NaNÏù¥ ÏïÑÎãå Í∞íÎì§ Ï§ëÏóêÏÑú unique Í∞íÏù¥ 1Í∞úÎßå ÏûàÏñ¥Ïïº ÏùºÍ¥ÄÏÑ± ÏûàÏùå
                is_consistent = (len(unique_in_times) <= 1) and (len(unique_out_times) <= 1)
                in_out_consistency_list.append(is_consistent)
                
                # Ï≤´ Î≤àÏß∏ Î†àÏΩîÎìúÏùò InDateTime, OutDateTime Í∞í (Îã®Ïùº Í∞í)
                in_value = records['InDateTime'].iloc[0] if len(records) > 0 and pd.notna(records['InDateTime'].iloc[0]) else ''
                out_value = records['OutDateTime'].iloc[0] if len(records) > 0 and pd.notna(records['OutDateTime'].iloc[0]) else ''
                
                admission_in_list.append(in_value)
                admission_out_list.append(out_value)
                
                # NursingRecordsÏóêÎäî InDateTime, OutDateTime Ï†úÏô∏ÌïòÍ≥† ÎÑ£Í∏∞
                records_without_admission = records.drop(columns=['InDateTime', 'OutDateTime'])
                result.append(records_without_admission.rename(columns=column_mapping).to_dict('records'))
        
        return result, admission_in_list, admission_out_list, in_out_consistency_list

    # Ìï®Ïàò Ìò∏Ï∂ú Î∞è Ìï†Îãπ
    nursing_records, admission_ins, admission_outs, in_out_consistency = get_nursing_records_optimized()
    filtered['NursingRecords_ba30'] = nursing_records
    filtered['AdmissionIn'] = admission_ins
    filtered['AdmissionOut'] = admission_outs
    filtered['InOutConsistent'] = in_out_consistency  # ÏùºÍ¥ÄÏÑ± ÌîåÎûòÍ∑∏ Ï∂îÍ∞Ä

    # ÌïÑÌÑ∞ÎßÅ Ï°∞Í±¥ Ï†ÅÏö© - Ï°¥Ïû¨ÌïòÎäî Ïª¨ÎüºÎßå Ï≤¥ÌÅ¨
    waveform_cols = ['ABP_time_diff_sec', 'II_time_diff_sec', 'Pleth_time_diff_sec', 'Resp_time_diff_sec']
    waveform_cols += ['SpO2_numeric_time_diff_sec', 'Pulse_numeric_time_diff_sec', 
        'ST_numeric_time_diff_sec', 'Tskin_numeric_time_diff_sec', 
        'ABP_numeric_time_diff_sec', 'NBP_numeric_time_diff_sec', 
        'HR_numeric_time_diff_sec', 'RR_numeric_time_diff_sec',]
    existing_waveform_cols = [col for col in waveform_cols if col in filtered.columns and filtered[col].notna().any()]
    
    # Ï°∞Í±¥ ÏÉùÏÑ±
    conditions = []
    
    # Waveform Ïª¨Îüº Ï°∞Í±¥ (Ï°¥Ïû¨ÌïòÎäî Ïª¨ÎüºÎßå)
    if existing_waveform_cols:
        conditions.append((filtered[existing_waveform_cols] <= 60).all(axis=1))
    
    # NursingRecords Ï°∞Í±¥
    conditions.append(filtered['NursingRecords_ba30'].apply(lambda x: len(x) > 0))
    
    # In/Out ÏãúÍ∞Ñ ÏùºÍ¥ÄÏÑ± Ï°∞Í±¥ Ï∂îÍ∞Ä
    conditions.append(filtered['InOutConsistent'])
    
    # Î™®Îì† Ï°∞Í±¥ Ï†ÅÏö©
    if conditions:
        final_mask = conditions[0]
        for condition in conditions[1:]:
            final_mask = final_mask & condition
        filtered = filtered[final_mask].copy()
    
    # InOutConsistent Ïª¨Îüº Ï†úÍ±∞ (ÏµúÏ¢Ö Îç∞Ïù¥ÌÑ∞ÏóêÎäî ÌïÑÏöî ÏóÜÏùå)
    filtered = filtered.drop(columns=['InOutConsistent'])
    
    # ÎùºÎ≤® ÌäπÏàòÎ¨∏Ïûê Ï†úÍ±∞
    filtered['Label'] = filtered['Label'].apply(
        lambda x: [label_normalization(item) for item in x] if isinstance(x, list) else x
    )
    
    # Technical Alarm Ïù∏ Í≤ΩÏö∞ isView=FalseÎ°ú Ï≤òÎ¶¨Ìï¥Î≤ÑÎ¶¨Í∏∞
    filtered['isView'] = filtered['Label'].apply(
        lambda labels: not is_only_technical_alarms(labels)
    )
    
    filtered['isSelected'] = False # Î∑∞Ïñ¥ÏóêÏÑú T/F Í∞Ä ÏÑ†ÌÉùÎêòÏóàÎÇòÏöî? Ïùò Ïó¨Î∂Ä
    filtered['Classification'] = False
    filtered['Comment'] = ''

    # Numeric Data Í∞Ä Î™®Îëê Ï†ïÏÉÅÏù∏ Í≤ΩÏö∞
    numeric_cols = [
        'SpO2_numeric', 'Pulse_numeric', 'ST_numeric', 'Tskin_numeric',
        'ABP_numeric', 'NBP_numeric', 'HR_numeric', 'RR_numeric'
    ]
    filtered[numeric_cols] = filtered[numeric_cols].apply(
        pd.to_numeric, errors='coerce'
    )
    filtered = filtered.dropna(subset=numeric_cols, how='any')  # allÎ°ú ÌïòÎ©¥ Ï†ÑÎ∂Ä NaNÏùº ÎïåÎßå Ï†úÍ±∞


    return filtered

Í∏∞Ïà†Ï†Å ÏïåÎûå Î™©Î°ù Î°úÎìú ÏôÑÎ£å: 58Í∞ú ÎùºÎ≤®


In [None]:
import pickle as pkl
import os

def save_processed_data(patient_id, output_dir=f'{drive_location}/pre_processed1/'):
    filtered = preprocess_data(patient_id)
    
    if len(filtered) > 0:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        output_path = os.path.join(output_dir, f'{patient_id}_processed.pkl')
        with open(output_path, 'wb') as f:
            pkl.dump(filtered, f)
        print(f"{patient_id}: {len(filtered)} rows saved")
    else:
        print(f"{patient_id}: No rows, skipped")

def batch_process(patient_ids):
    for patient_id in patient_ids:
        try:
            save_processed_data(patient_id)
        except Exception as e:
            print(f"{patient_id}: Error - {e}")


# !rm -rf DATA/
# !mkdir DATA
# !rm -rf sicu_alarms.db
for i in patient_id_list:
    try:
        filtered = preprocess_data(i)
        if len(filtered) > 0:
            with open(f'DATA/{i}.pkl', 'wb') as f:
                pkl.dump(filtered, f)
            print(f"{i}: {len(filtered)} rows saved")
        else:
            print(f"{i}: No rows, skipped")
    except Exception as e:
        print(f"{i}: Error - {e}")




8660993: No rows, skipped
11526147: No rows, skipped
11466763: No rows, skipped
1949715: No rows, skipped
11319320: No rows, skipped
11448345: No rows, skipped
10532891: No rows, skipped
11059230: No rows, skipped
11618341: No rows, skipped
10811448: 25 rows saved
2586684: No rows, skipped
11456575: No rows, skipped
11452483: No rows, skipped
11460687: No rows, skipped
11417682: No rows, skipped
9648217: No rows, skipped
10264681: No rows, skipped
4526187: No rows, skipped
11405419: No rows, skipped
4776045: No rows, skipped
9879666: No rows, skipped
11688056: 84 rows saved
10645625: No rows, skipped
11417722: No rows, skipped
11350145: No rows, skipped
11079810: No rows, skipped
11393161: No rows, skipped
4685968: No rows, skipped
3723418: No rows, skipped
9971876: No rows, skipped
11559076: No rows, skipped
11421863: No rows, skipped
2937002: No rows, skipped
10842283: No rows, skipped
11542704: No rows, skipped
11520179: No rows, skipped
11407540: No rows, skipped
11505853: No rows,

KeyboardInterrupt: 

In [6]:
!python pkl_to_sqlite.py

Creating SQLite database: sicu_alarms.db
Found 57 PKL files to convert

[1/57] Converting 10052298.pkl...
  - Loaded 1 rows, 36 columns
  - Created table with 36 columns
  - Column 'Label' contains array/list data
  - Column 'ABP_WAVEFORM' contains array/list data
  - Column 'ECG_WAVEFORM' contains array/list data
  - Column 'PPG_WAVEFORM' contains array/list data
  - Column 'RESP_WAVEFORM' contains array/list data
  - Column 'NursingRecords_ba30' contains array/list data
  - Successfully inserted 1 rows

[2/57] Converting 10114394.pkl...
  - Loaded 9 rows, 36 columns
  - Created table with 36 columns
  - Column 'Label' contains array/list data
  - Column 'ABP_WAVEFORM' contains array/list data
  - Column 'ECG_WAVEFORM' contains array/list data
  - Column 'PPG_WAVEFORM' contains array/list data
  - Column 'RESP_WAVEFORM' contains array/list data
  - Column 'NursingRecords_ba30' contains array/list data
  - Successfully inserted 9 rows

[3/57] Converting 10138773.pkl...
  - Loaded 2 row