# 1. 准备工作

## 1.1 导包

In [1]:
import pandas as pd
import numpy as np
import csv
import os
import sys
import math
import nltk
import re
import torch
import csv
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Doog\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1.2 设置常量

In [2]:
# Set parameter as constant 
DATA_PATH_stays= "../data/ICUSTAYS.csv"
DATA_PATH_notes="../data/NOTEEVENTS.csv" 
DATA_PATH_admission = "../data/ADMISSIONS.csv"

DATA_PATH_train ="../data/AKI/train_listfile.csv" 
DATA_PATH_test = "../data/AKI/test_listfile.csv" 
DATA_PATH_validation = "../data/AKI/val_listfile.csv" 

OUTPUT_PATH_train = "../data/train"
OUTPUT_PATH_test = "../data/test"

SEPARATOR=","
CATEGORY_LIST = ['pad', 'Respiratory', 'ECG','Radiology','Nursing/other','Rehab Services','Nutrition','Pharmacy','Social Work','Case Management',
            'Physician','General','Nursing','Echo','Consult']
category_id = {cate: idx for idx, cate in enumerate(CATEGORY_LIST)}

MAX_TIME = 48.0 # set max num of hours, in our case is 35(days) x 24 =  

## 1.3 固定随机种子，保证实验的可重复性

In [3]:
# 固定随机种子, 保证可复现性
def same_seed(seed = 42): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
same_seed(seed = 42)

## 1.4 定义全局功能函数

In [None]:
# Some functions used later
def read_icustays_table(DATA_PATH_stays):
    stays = pd.read_csv(DATA_PATH_stays, sep = SEPARATOR)
    stays.INTIME = pd.to_datetime(stays.INTIME)
    stays.OUTTIME = pd.to_datetime(stays.OUTTIME)
    # drop the column that are not used in the future
    stays = stays.drop(['ROW_ID', 'DBSOURCE', 'FIRST_CAREUNIT', 'LAST_CAREUNIT', 'FIRST_WARDID', 'LAST_WARDID'], axis=1)
    return stays

def read_notes_table(DATA_PATH_notes, keep_discharge=False):
    # 读取 NOTEEVENTS.csv，跳过格式错误行
    df_note = pd.read_csv(
        DATA_PATH_notes,
        sep=SEPARATOR,
        engine='python',
        encoding='utf-8',
        on_bad_lines='skip',  # 替代已弃用的 error_bad_lines
        quoting=csv.QUOTE_ALL
    )

    # 可选：去除 discharge summary 类别
    if not keep_discharge:
        df_note = df_note[df_note['CATEGORY'] != 'Discharge summary']
    # 去除被标记为错误的笔记
    df_note = df_note[df_note['ISERROR'] != 1]
    # 删除无用列（若存在）
    drop_cols = ['ROW_ID', 'STORETIME', 'DESCRIPTION', 'CGID', 'ISERROR']
    df_note = df_note.drop(columns=drop_cols, errors='ignore')
    return df_note

def merge_on_subject_admission(table1):
    admission = pd.read_csv(DATA_PATH_admission, sep = SEPARATOR)
    # drop the column that are not used in the future
    admission = admission.drop(['ROW_ID', "ADMITTIME","DEATHTIME","ADMISSION_TYPE","ADMISSION_LOCATION", "DISCHARGE_LOCATION",
                        "INSURANCE","LANGUAGE","RELIGION", "MARITAL_STATUS","ETHNICITY","EDREGTIME","EDOUTTIME","HOSPITAL_EXPIRE_FLAG","HAS_CHARTEVENTS_DATA"], axis=1)
    return table1.merge(admission, how='inner', left_on=['SUBJECT_ID', 'HADM_ID'], right_on=['SUBJECT_ID', 'HADM_ID'])


def filter_notes_on_stays(notes, stays):
    return pd.merge(notes, stays.drop_duplicates(), left_on=['SUBJECT_ID', 'HADM_ID'], right_on=['SUBJECT_ID', 'HADM_ID'], how='inner')
    
def create_train_val_test_notes(notes, subject_id_list, max_time=240):
    """
    根据 subject_id_list 从 notes 中筛选相关笔记，计算相对 ICU 入院时间（HOURS），
    并筛选指定时间窗内的记录，返回处理后的 notes。

    参数：
        notes : pd.DataFrame 包含 NOTE、INTIME、CHARTTIME 等字段
        subject_id_list : list 要保留的 SUBJECT_ID
        max_time : float 最大保留的时间跨度（单位：小时）

    返回：
        pd.DataFrame 处理后 notes 子集
    """
    # 确保时间字段为 datetime 类型
    notes['DISCHTIME'] = pd.to_datetime(notes['DISCHTIME'], errors='coerce')
    notes['CHARTDATE'] = pd.to_datetime(notes['CHARTDATE'], errors='coerce')
    notes['CHARTTIME'] = pd.to_datetime(notes['CHARTTIME'], errors='coerce')

    # 根据天数截断 DISCHTIME → DISCHDATE（虽然本函数没用）
    notes['DISCHDATE'] = notes['DISCHTIME'].values.astype('<M8[D]')

    # 只保留目标 subject_id
    notes = notes[notes['SUBJECT_ID'].isin(subject_id_list)].copy()
    notes.sort_values(by=['SUBJECT_ID', 'CHARTTIME'], inplace=True, ignore_index=True)

    # 计算相对时间差（单位：小时）
    notes['HOURST'] = (notes['CHARTTIME'] - notes['INTIME']) / np.timedelta64(1, 'h')
    notes['HOURSD'] = (notes['CHARTDATE'] - notes['INTIME']) / np.timedelta64(1, 'h')

    # 优先使用 CHARTTIME 计算的小时数，若为空则用 CHARTDATE 计算的
    notes['HOURS'] = notes['HOURST'].fillna(notes['HOURSD'])

    # 筛选时间窗 [0, max_time]
    notes = notes[(notes['HOURS'] >= 0) & (notes['HOURS'] <= max_time)]

    # 清除中间计算列
    notes.drop(columns=['HOURST', 'HOURSD', 'HOURS'], inplace=True, errors='ignore')

    return notes

def split_doc(d):
    """Split sentences in a document and saved the sentences to a list.

    Args:
        d: a document
        final_d: a list of sentences
    """

    d = d.strip().split(".") #d = d.strip().split(".") # split document by "." to sentences
    final_d = []
    for s in d:
        if s != "":  # ignore if the sentence is empty
            final_d.append(s.strip())
    return final_d  # Now the sentences are splitted from documents and saved to a list

#clean the sentence with some regex to delete some of the strange annotation like ---- **** etc.
def _preprocess1(x):
    y=re.sub('\[(.*?)\]','',x) #remove de-identified brackets
    y=re.sub('[0-9]+.','',y) #remove 1.2. since the segmenter segments based on this
    y=re.sub('dr.','doctor',y)
    y=re.sub('m.d.','md',y)
    y=re.sub('admission date:','',y)
    y=re.sub('discharge date:','',y)
    y=re.sub('--|__|==','',y)
    return y

def tokenize(sent, mimic3_embedding=None, nlp=None):
    """Tokenize the sentences according to the existing word from embedding.

    Args:
        sent: input a sentence
        mimic3_embedding: find the existing word in embedding files
        cleaned_tokens: the tokens are cleaned and mapped to the mimic embedding
    """

    #tokenizer = re.compile('\w+|\*\*|[^\s\w]')
    tokens = nltk.word_tokenize(sent.lower())
    #tokens = tokenizer.findall(sent.lower())
    cleaned_tokens = []
    for tok in tokens:
        tok = _clean_token(tok)
        if mimic3_embedding:
            if tok in mimic3_embedding:
                cleaned_tokens.append(tok)
            else:
                cleaned_tokens.append('UNK')
        else:
            cleaned_tokens.append(tok)
    return cleaned_tokens

def _clean_token(s):
    """If the token is digit, then round the actual value into the nearest 10 times value.
    Args:
        s: original digit, 65 -> 60
        """
    if len(s) > 1:
        if s.isdigit():
            l = len(s)
            s = str(int(s)//(10**(l-1)) * 10**(l-1))
    return s.lower()

def break_up_notes_by_subject(notes, output_path, subjects=None, verbose=1):
    subjects = notes.SUBJECT_ID.unique() if subjects is None else subjects
    nb_subjects = subjects.shape[0]
    notes = notes.drop_duplicates()
    
    #convert to Date time objects
    notes['DISCHTIME'] = pd.to_datetime(notes['DISCHTIME'])   
    notes['CHARTDATE'] = pd.to_datetime(notes['CHARTDATE'])
    notes['CHARTTIME'] = pd.to_datetime(notes['CHARTTIME'])
    notes['DISCHDATE'] = notes['DISCHTIME'].values.astype('<M8[D]')
    
    verbose = 1
    output_path = output_path

    for i, subject_id in enumerate(subjects):
            if verbose:
                sys.stdout.write('\rSUBJECT {0} of {1}...'.format(i+1, nb_subjects))
            dn = os.path.join(output_path, str(subject_id))
            try:
                os.makedirs(dn)
            except:
                pass
            patient_note = notes.loc[notes.SUBJECT_ID == subject_id].sort_values(by=['CHARTDATE', 'CHARTTIME'])#.to_csv(os.path.join(dn, 'diagnoses.csv'), index=False)
            intime = patient_note.INTIME

            patient_note['HOURST'] = (patient_note.CHARTTIME - intime).apply(lambda s: s / np.timedelta64(1, 's')) / 60./60
            patient_note['HOURSD'] = (patient_note.CHARTDATE - intime).apply(lambda s: s / np.timedelta64(1, 's')) / 60./60

            note_filename = str(subject_id) + '_note.txt' 
            f = open(os.path.join(dn, note_filename), 'w')
            j = 0
            for doc, cat, c_date, c_time, ht, hd, icd_id, h_id in zip(patient_note['TEXT'], patient_note['CATEGORY'], patient_note['CHARTDATE'], patient_note['CHARTTIME'], patient_note['HOURST'], patient_note['HOURSD'], patient_note['ICUSTAY_ID'], patient_note['HADM_ID']): #'ICUSTAY_ID' 
                category = cat.strip() #d[0].strip()
                category_index = category_id[category]
                sentences = split_doc(doc)
                #print(sentences)
                #print(len(sentences))

                
                #filter notes written after 48.0 hours
                #print(ht, hd)
                if math.isnan(ht):
                    h = hd
                else:
                    h = ht
                
                '''
                if h >= 0. or h <= MAX_TIME:
                    continue
                '''
                    
                for sent in sentences:
                #k=0
                #if k < len(sentences):
                    #sent = sentences[k]
                    sent = _preprocess1(sent)
                    #print(sent)
                    cleaned_tokens = tokenize(sent)
                    #print(cleaned_tokens)
                    if len(cleaned_tokens) > 0:
                        sent_head = '%s,%s,%s,%s,%s,%s,%s'%(str(j), str(category_index), c_date, c_time, str(h), str(icd_id), str(h_id))
                        #print(sent_head + "\n")
                        f.write(sent_head + "\n")
                        for t in cleaned_tokens:
                            #print(t + "\n")
                            f.write(t + "\n")
                        f.write("\n")
                    #k+=1
                j += 1
            f.close()

            
    if verbose:
        sys.stdout.write('DONE!\n')


# 2. 读取文件

In [5]:
#read csv file
stays = read_icustays_table(DATA_PATH_stays)
notes = read_notes_table(DATA_PATH_notes)
notes = merge_on_subject_admission(notes)
notes = filter_notes_on_stays(notes, stays)

train_id = pd.read_csv(DATA_PATH_train, sep= SEPARATOR)
test_id = pd.read_csv(DATA_PATH_test, sep= SEPARATOR) 
validation_id = pd.read_csv(DATA_PATH_validation, sep= SEPARATOR)

# 3. 构造note的train/test

## 3.1 从 listfile（train/test/val）中提取 subject_id 列表，
就和`preprocessing.ipynb`的subject_id一样

In [16]:
def extract_subject_ids(listfile_df):
    return (
        listfile_df['notes']
        .apply(lambda x: int(x.split("_")[0]))
        .drop_duplicates()
        .sort_values()
        .tolist()
    )

train_list = extract_subject_ids(train_id)
test_list = extract_subject_ids(test_id)
val_list = extract_subject_ids(validation_id)

## 3.2 从 notes 表中筛选指定 subject_id 的患者笔记记录
统一标准化其时间戳字段，并计算相对于 ICU 入院时间（INTIME）的时间差（HOURS）用于建模。

In [18]:
notes_train = create_train_val_test_notes(notes, train_list)
notes_test = create_train_val_test_notes(notes, test_list)
notes_validation = create_train_val_test_notes(notes, val_list)

## 3.2 具有文本的 subject_id 少于具有特征的 subject_id，需要为模型重新创建新的 list.csv 文件。

In [28]:
#new subject_id list
new_train_list = notes_train.SUBJECT_ID.unique().tolist()
print("new train: " + str(len(new_train_list)) + " subjects, old train: " + str(len(train_list)) + " subjects")
new_test_list = notes_test.SUBJECT_ID.unique().tolist()
print("new test: " + str(len(new_test_list)) + " subjects, old test: " + str(len(test_list)) + " subjects")
new_val_list = notes_validation.SUBJECT_ID.unique().tolist()
print("new val: " + str(len(new_val_list)) + " subjects, old val: " + str(len(val_list)) + " subjects")

new train: 26865 subjects, old train: 27427 subjects
new test: 3345 subjects, old test: 3428 subjects
new val: 3363 subjects, old val: 3425 subjects


## 3.3 构建新的train/val/test lifelist.csv

In [34]:
try:
    os.makedirs("../data/train")
    os.makedirs("../data/test")
except:
    pass

In [37]:
def update_listfile_by_subject_ids(listfile_df, valid_subject_ids, output_path):
    """
    从 listfile 中提取 subject_id, 并筛选出存在于 valid_subject_ids 的记录，保存为新的 listfile.csv

    参数：
        listfile_df : pd.DataFrame, 原始的 listfile DataFrame, 必须包含 'notes' 列
        valid_subject_ids : list, 有效的 subject_id 列表
        output_path : str, 要保存的新 listfile 文件路径
    """
    # 提取 notes 文件名前缀作为 SUBJECT_ID
    listfile_df['SUBJECT_ID'] = listfile_df['notes'].apply(lambda x: int(x.split("_")[0]))

    # 筛选在有效列表中的记录
    filtered_df = (
        listfile_df[listfile_df['SUBJECT_ID'].isin(valid_subject_ids)]
        .sort_values(by='SUBJECT_ID')
        .drop(columns='SUBJECT_ID')
        .reset_index(drop=True)
    )

    # 保存为 CSV
    filtered_df.to_csv(output_path, index=False)
    print(f"已保存新 listfile 至: {output_path}（剩余 {len(filtered_df)} 条记录）")


In [38]:
update_listfile_by_subject_ids(train_id, new_train_list, "../data/train/train_listfile.csv")
update_listfile_by_subject_ids(test_id, new_test_list, "../data/test/test_listfile.csv")
update_listfile_by_subject_ids(validation_id, new_val_list, "../data/train/val_listfile.csv")


已保存新 listfile 至: ../data/train/train_listfile.csv（剩余 37150 条记录）
已保存新 listfile 至: ../data/test/test_listfile.csv（剩余 4689 条记录）
已保存新 listfile 至: ../data/train/val_listfile.csv（剩余 4656 条记录）


## 3.4 将每个 subject_id 的笔记分配到对应的 train/test 文件夹中。

In [42]:
#make a dir for each patient
break_up_notes_by_subject(notes_train, OUTPUT_PATH_train)
break_up_notes_by_subject(notes_test, OUTPUT_PATH_test)
break_up_notes_by_subject(notes_validation, OUTPUT_PATH_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['DISCHTIME'] = pd.to_datetime(notes['DISCHTIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['CHARTDATE'] = pd.to_datetime(notes['CHARTDATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['CHARTTIME'] = pd.to_datetime(notes['CHARTTIME'])
A value is trying to be set on a copy

SUBJECT 26865 of 26865...DONE!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['DISCHTIME'] = pd.to_datetime(notes['DISCHTIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['CHARTDATE'] = pd.to_datetime(notes['CHARTDATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['CHARTTIME'] = pd.to_datetime(notes['CHARTTIME'])
A value is trying to be set on a copy

SUBJECT 3345 of 3345...DONE!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['DISCHTIME'] = pd.to_datetime(notes['DISCHTIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['CHARTDATE'] = pd.to_datetime(notes['CHARTDATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes['CHARTTIME'] = pd.to_datetime(notes['CHARTTIME'])
A value is trying to be set on a copy

SUBJECT 3363 of 3363...DONE!


# 4. 生成用于词嵌入的总笔记文本

In [43]:
joinedlist = new_train_list + new_val_list

In [44]:
joinedlist = sorted(joinedlist, reverse=False)

In [45]:
notes_word_embeddings = create_train_val_test_notes(notes, joinedlist)

In [46]:
dump_note = ''.join(notes_word_embeddings['TEXT'].dropna().astype(str).tolist())


In [47]:
sentences = split_doc(dump_note)

In [50]:
try:
    os.makedirs("../data/dump_notes")
except:
    pass

dump_note_filename = 'dump_notes.txt' 

In [52]:
f = open(os.path.join("../data/dump_notes", dump_note_filename), 'w')
for sent in sentences:
    #tokenize and remove strange token with regex as per the separated function shared for mimi3csv.py
    sent = _preprocess1(sent)
    cleaned_tokens = tokenize(sent)
    #output format: one sentence per line
    if len(cleaned_tokens) > 0:
        for t in cleaned_tokens:
            f.write(t + " ")
        f.write("\n")
f.close()