In [1]:
import pandas as pd
import re, json 
import argparse
import os

# px, dxprx

# Output directory for every artefact produced by this notebook; created on
# first run and left untouched when it already exists.
ROOT_PATH = 'result-dxprx'
os.makedirs(ROOT_PATH, exist_ok=True)

# Input: raw MIMIC-III NOTEEVENTS table (absolute, machine-specific path).
LOAD_FILE_PATH = '/home/caoyu/project/GraphCLHealth/data/mimiciii/NOTEEVENTS.csv'
# Output of the first stage: one row per note with its JSON-encoded sections.
SAVE_FILE_PATH = f'{ROOT_PATH}/sections.csv'


'''
Table --> Sections

1. load NOTEEVENTS.csv

2. get discharge summary notes
    a) NOTEEVENTS.CATEGORY = 'Discharge summary'
    b) NOTEEVENTS.DESCRIPTION = 'Report'
    c) eliminate short notes (100 characters or fewer)

3. preprocess discharge summary notes
    a) clean text
    b) split sections by headers
    
4. save csv file
    a) PK: NOTEEVENTS.ROW_ID
    b) TEXT: JSON string of the two-element list [headers, sections]
    
'''

def load_noteevents(file_path):
    """Read the NOTEEVENTS CSV and convert its three time columns to datetimes.

    Args:
        file_path: location of the CSV file to read.

    Returns:
        The loaded DataFrame with CHARTDATE, CHARTTIME and STORETIME parsed by
        ``pd.to_datetime``.

    Raises:
        ValueError: if CHARTDATE or CHARTTIME holds a value that does not fit
            its expected format (both are parsed with ``errors='raise'``).
    """
    notes = pd.read_csv(file_path)

    # CHARTDATE and CHARTTIME have a fixed layout, so parse them strictly.
    notes['CHARTDATE'] = pd.to_datetime(notes['CHARTDATE'], format='%Y-%m-%d', errors='raise')
    notes['CHARTTIME'] = pd.to_datetime(notes['CHARTTIME'], format='%Y-%m-%d %H:%M:%S', errors='raise')
    # STORETIME is parsed without an explicit format.
    notes['STORETIME'] = pd.to_datetime(notes['STORETIME'])

    return notes


def save_csv_file(csv_data, file_path):
    """Write ``csv_data`` to ``file_path`` as CSV (without the index) and report success.

    Args:
        csv_data: object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        file_path: destination of the CSV file.
    """
    csv_data.to_csv(path_or_buf=file_path, index=False)
    print('save successfully!')


def get_discharge_summary(df_notevents, min_length=100):
    """Select the discharge-summary reports that are long enough to be useful.

    Args:
        df_notevents: DataFrame with at least the columns CATEGORY,
            DESCRIPTION, ROW_ID and TEXT.
        min_length: notes whose TEXT has this many characters or fewer are
            dropped (default 100, the threshold originally hard-coded here).

    Returns:
        A DataFrame with the columns ROW_ID and TEXT, restricted to rows where
        CATEGORY == 'Discharge summary', DESCRIPTION == 'Report' and TEXT is a
        string longer than ``min_length``.
    """
    is_report = (
        (df_notevents['CATEGORY'] == 'Discharge summary')
        & (df_notevents['DESCRIPTION'] == 'Report')
    )
    summaries = df_notevents.loc[is_report, ['ROW_ID', 'TEXT']]

    # Eliminate short notes. The isinstance guard drops missing TEXT values
    # instead of raising on len(NaN); the explicit bool cast keeps the mask a
    # genuine boolean indexer even when no rows are left.
    long_enough = summaries['TEXT'].map(
        lambda note: isinstance(note, str) and len(note) > min_length
    ).astype(bool)

    return summaries[long_enough]


def pattern_repl(matchobj):
    """Build the replacement for a regex match: spaces as wide as the match.

    Keeping the width unchanged means character offsets in the surrounding
    text are preserved. An empty match still yields a single space.
    """
    width = len(matchobj.group(0))
    return ' ' * max(width, 1)


def clean_text(text):
    """Blank out bracketed ``[** ... **]`` spans and underscores in a note.

    Each ``[** ... **]`` span is handed to ``pattern_repl``, which substitutes
    spaces of the same width; every underscore then becomes a single space.

    Args:
        text: raw note text.

    Returns:
        The cleaned text.
    """
    # 1. Replace [**Patterns**] with spaces.
    without_placeholders = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)

    # 2. Replace `_` with spaces.
    return without_placeholders.replace('_', ' ')


# A header is either a run of letters/digits/spaces at the start of a line that
# ends in a colon (but not the prescription fields "Sig:" / "disp:"), or one of
# a few known keywords that may also appear in the middle of a line.
# The character class is spelled A-Za-z on purpose: the range A-z would also
# admit the punctuation characters that sit between 'Z' and 'a' in ASCII.
_SECTION_HEADER_RE = re.compile(
    r"^(?!(Sig:)|(disp:))([A-Za-z0-9 ]+)(:)"
    r"|(Discharge Date:)|(Sex:)|(JOB#:)|(Unit No:)|(FOLLOW-UP PLANS:)",
    re.I | re.M,
)


def split_section(text):
    """Split a note into parallel lists of section headers and section bodies.

    Args:
        text: note text (normally already passed through ``clean_text``).

    Returns:
        A tuple ``(headers, sections)`` of equal-length lists.
        ``headers[i]`` is the lower-cased matched header text, including its
        trailing colon; ``sections[i]`` is the stripped text between that
        header and the next one (or the end of the note).
        Any text in front of the first header is discarded, and a note with no
        header at all yields ``([], [])``.
    """
    matches = list(_SECTION_HEADER_RE.finditer(text))

    headers, sections = [], []
    for idx, match in enumerate(matches):
        # A section runs from the end of its own header to the start of the
        # following header; the last section runs to the end of the text.
        if idx + 1 < len(matches):
            body_end = matches[idx + 1].start()
        else:
            body_end = len(text)

        headers.append(match.group(0).lower())
        sections.append(text[match.end():body_end].strip())

    return headers, sections


def clean_header(header):
    """Normalise a header: drop every comma and colon, then trim whitespace."""
    for punctuation in (',', ':'):
        header = header.replace(punctuation, '')
    return header.strip()


def clean_section(section):
    """Collapse every run of whitespace in ``section`` to one space and trim the ends."""
    words = section.split()
    return ' '.join(words)


def preprocess_discharge_summary(text):
    """Clean one note and split it into normalised headers and sections.

    The result is a two-element list ``[headers, sections]`` rather than a
    dict, because the same header can occur more than once in a note and a
    dict would keep only one entry per header.

    Args:
        text: raw note text.

    Returns:
        ``[headers, sections]``: headers passed through ``clean_header`` and
        the section at the same index passed through ``clean_section``.
    """
    raw_headers, raw_sections = split_section(clean_text(text))

    tidy_headers = [clean_header(raw_header) for raw_header in raw_headers]
    # Indexed by header position so each section stays paired with its header.
    tidy_sections = [clean_section(raw_sections[i]) for i in range(len(raw_headers))]

    return [tidy_headers, tidy_sections]




def main():
    """Run the first stage: NOTEEVENTS -> discharge summaries -> sections CSV.

    Reads LOAD_FILE_PATH, keeps the discharge-summary reports, converts each
    note into a JSON string of ``[headers, sections]`` and writes ROW_ID plus
    that string to SAVE_FILE_PATH.
    """
    print('Load NOTEEVENTS start!')
    noteevents = load_noteevents(file_path=LOAD_FILE_PATH)
    print('Load NOTEEVENTS successfully!')

    summaries = get_discharge_summary(noteevents)
    print('Get discharge summary successfully!')

    def to_json_sections(note_text):
        # JSON keeps the nested list structure recoverable from the CSV cell.
        return json.dumps(preprocess_discharge_summary(note_text))

    serialized = summaries.TEXT.apply(to_json_sections)
    print('Preprocess notes successfully!')

    sections_table = pd.concat([summaries.ROW_ID, serialized], axis=1)
    save_csv_file(csv_data=sections_table, file_path=SAVE_FILE_PATH)


if __name__ == '__main__':
    main()

Load NOTEEVENTS start!




Load NOTEEVENTS successfully!
Get discharge summary successfully!
Preprocess notes successfully!
save successfully!


###  ----- for debug start -----
### 查看 note headers 有哪些类型？

In [2]:
import pandas as pd
import re, json 
import os
import argparse


# Paths for the section-extraction step: input is the sections file written
# under ROOT_PATH, output goes next to it. ROOT_PATH is not assigned in this
# cell; it must already exist in the kernel from the first cell.
# Note that these rebind the LOAD_FILE_PATH / SAVE_FILE_PATH names used earlier.
LOAD_FILE_PATH = os.path.join(ROOT_PATH,'sections.csv')
SAVE_FILE_PATH = os.path.join(ROOT_PATH,'p_sections.csv')


In [3]:
def load_csv_file(file_path):
    """Return the CSV at ``file_path`` as a DataFrame, using pandas' default parsing."""
    return pd.read_csv(file_path)

def extract_dx_and_prx_section(text):
    """Debug variant: print a note's headers, then pull out its dx and prx text.

    Args:
        text: JSON string encoding ``[headers, sections]`` (two parallel lists).

    Returns:
        ``{'dx': ..., 'prx': ...}`` where

        * ``dx`` is the space-joined text of the sections from
          'discharge diagnosis' up to (not including) 'discharge condition'.
          It is only filled in when the four discharge headers are all present
          and appear in the order medications < disposition < diagnosis <
          condition; otherwise it is ``""``.
        * ``prx`` is the section under 'major surgical or invasive procedure',
          or ``""`` when that header is absent.
    """
    parsed = json.loads(text)
    headers, sections = parsed[0], parsed[1]

    # Debug aid: show which headers this note contains.
    print(headers)

    ordered_headers = (
        'discharge medications',
        'discharge disposition',
        'discharge diagnosis',
        'discharge condition',
    )

    dx_section = ""
    # Require every header explicitly; a numeric "missing" sentinel can slip
    # through the ordering comparison below.
    if all(name in headers for name in ordered_headers):
        pos1, pos2, pos3, pos4 = (headers.index(name) for name in ordered_headers)
        if pos1 < pos2 < pos3 < pos4:  # well organized
            dx_section = ' '.join(sections[pos3:pos4])

    query = 'major surgical or invasive procedure'
    # Membership test instead of truthiness of the index: position 0 is valid.
    prx_section = sections[headers.index(query)] if query in headers else ""

    return {'dx': dx_section, 'prx': prx_section}

In [4]:
# Debug: load the sections file at LOAD_FILE_PATH for inspection.
data       = load_csv_file(file_path=LOAD_FILE_PATH)

In [5]:
# Debug: number of rows loaded.
len(data)

55176

In [6]:
# Debug: peek at the first two rows.
data[0:2]

Unnamed: 0,ROW_ID,TEXT
0,174,"[[""admission date"", ""discharge date"", ""service..."
1,175,"[[""admission date"", ""discharge date"", ""date of..."


In [7]:
# Debug: run the extractor on the second and third rows only, to see the
# headers it prints for them.
notes      = data.TEXT.iloc[1:3].apply(extract_dx_and_prx_section)

['admission date', 'discharge date', 'date of birth', 'sex', 'service', 'history of present illness', 'review of systems is negative for the following', 'past medical history', 'medications on admission', 'allergies', 'family history', 'social history', 'physical exam at time of admission', 'laboratory studies', 'brief summary of hospital course', 'discharge condition', 'discharge status', 'discharge medications', 'follow-up plans', 'final diagnoses', 'dictated by', 'd', 't', 'job#']
['admission date', 'discharge date', 'service', 'allergies', 'attending', 'chief complaint', 'major surgical or invasive procedure', 'history of present illness', 'past medical history', 'pmh', 'social history', 'social history', 'family history', 'family history', 'physical exam', 'brief hospital course', 'medications on admission', 'discharge medications', 'discharge disposition', 'facility', 'discharge diagnosis', 'discharge condition', 'discharge instructions', 'completed by']


###  ----- for debug end -----

## 继续下一步

In [8]:
import pandas as pd
import re, json 
import os
import argparse


# Paths for the section-extraction step: input is the sections file written
# under ROOT_PATH, output goes next to it. ROOT_PATH is not assigned in this
# cell; it must already exist in the kernel from the first cell.
LOAD_FILE_PATH = os.path.join(ROOT_PATH,'sections.csv')
SAVE_FILE_PATH = os.path.join(ROOT_PATH,'p_sections.csv')


'''
preprocessing for mimic discharge summary note

1. load NOTEEVENTS.csv

2. get discharge summary notes
    a) NOTEEVENTS.CATEGORY = 'Discharge summary'
    b) NOTEEVENTS.DESCRIPTION = 'Report'
    c) eliminate short notes

3. preprocess discharge summary notes
    a) clean text
    b) split sections by headers
    
4. save csv file
    a) PK: NOTEEVENTS.ROW_ID
    b) TEXT: string(doubled-list)
    
'''

def load_csv_file(file_path):
    """Read a CSV file into a DataFrame.

    Args:
        file_path: location of the CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(file_path)


def save_csv_file(csv_data, file_path):
    """Write ``csv_data`` to ``file_path`` as CSV without the index column.

    Prints a confirmation message afterwards and returns ``None``.
    """
    csv_data.to_csv(path_or_buf=file_path, index=False)
    print('save successfully!')
    return None


def extract_px_section(text):
    """Extract the text between 'discharge medications' and 'discharge disposition'.

    Args:
        text: JSON string encoding ``[headers, sections]`` (two parallel lists).

    Returns:
        ``{'px': ...}``. The value is the space-joined text of the sections
        from 'discharge medications' up to (not including) 'discharge
        disposition'. It is ``''`` unless the four discharge headers are all
        present and appear in the order medications < disposition < diagnosis
        < condition.
    """
    parsed = json.loads(text)
    headers, sections = parsed[0], parsed[1]

    ordered_headers = (
        'discharge medications',
        'discharge disposition',
        'discharge diagnosis',
        'discharge condition',
    )

    # Require every header explicitly. A negative "missing" sentinel would
    # satisfy the ordering test below and then be used as a slice bound.
    if not all(name in headers for name in ordered_headers):
        return {'px': ''}

    pos1, pos2, pos3, pos4 = (headers.index(name) for name in ordered_headers)
    if pos1 < pos2 < pos3 < pos4:  # well organized
        return {'px': ' '.join(sections[pos1:pos2])}
    return {'px': ''}

def extract_prx_section(text):
    """Extract the 'major surgical or invasive procedure' section of a note.

    Args:
        text: JSON string encoding ``[headers, sections]`` (two parallel lists).

    Returns:
        ``{'prx': ...}`` holding the section stored at the position of the
        first 'major surgical or invasive procedure' header, or ``''`` when the
        note has no such header (an empty string, matching what
        ``extract_dx_and_prx_section`` returns for a missing section).
    """
    parsed = json.loads(text)
    headers, sections = parsed[0], parsed[1]

    query = 'major surgical or invasive procedure'
    # Membership test instead of truthiness of the index: position 0 is valid.
    if query not in headers:
        return {'prx': ''}

    return {'prx': sections[headers.index(query)]}

def extract_dx_and_prx_section(text):
    """Pull the diagnosis (dx) and procedure (prx) text out of one note.

    Args:
        text: JSON string encoding ``[headers, sections]`` (two parallel lists).

    Returns:
        ``{'dx': ..., 'prx': ...}`` where

        * ``dx`` is the space-joined text of the sections from
          'discharge diagnosis' up to (not including) 'discharge condition'.
          It is only filled in when the four discharge headers are all present
          and appear in the order medications < disposition < diagnosis <
          condition; otherwise it is ``""``.
        * ``prx`` is the section under 'major surgical or invasive procedure',
          or ``""`` when that header is absent.
    """
    parsed = json.loads(text)
    headers, sections = parsed[0], parsed[1]

    ordered_headers = (
        'discharge medications',
        'discharge disposition',
        'discharge diagnosis',
        'discharge condition',
    )

    dx_section = ""
    # Require every header explicitly; a numeric "missing" sentinel can slip
    # through the ordering comparison below.
    if all(name in headers for name in ordered_headers):
        pos1, pos2, pos3, pos4 = (headers.index(name) for name in ordered_headers)
        if pos1 < pos2 < pos3 < pos4:  # well organized
            dx_section = ' '.join(sections[pos3:pos4])

    query = 'major surgical or invasive procedure'
    # Membership test instead of truthiness of the index: position 0 is valid.
    prx_section = sections[headers.index(query)] if query in headers else ""

    return {'dx': dx_section, 'prx': prx_section}

# def main():
# Load the per-note [headers, sections] JSON written by the first stage.
data       = load_csv_file(file_path=LOAD_FILE_PATH)

# One {'dx': ..., 'prx': ...} dict per note.
notes      = data.TEXT.apply(extract_dx_and_prx_section)
print('extract section from notes successfully!')

# ROW_ID next to the extracted dicts; `new_data` is reused by the next cell.
new_data   = pd.concat(objs=[data.ROW_ID, notes], axis=1)

save_csv_file(csv_data=new_data, file_path=SAVE_FILE_PATH)

# if __name__ == '__main__':
#     main()

extract section from notes successfully!
save successfully!


In [9]:
import pandas as pd
import re, json
import argparse
import torch
import os

import spacy, scispacy


# Raw NOTEEVENTS table (absolute, machine-specific path); re-read in main() to
# recover SUBJECT_ID / HADM_ID for each ROW_ID.
NOTE_PATH = os.path.join('/home/caoyu/project/GraphCLHealth/data/mimiciii','NOTEEVENTS.csv')
# Extracted-sections CSV under ROOT_PATH (ROOT_PATH comes from the first cell).
# Only referenced by a commented-out line in main() below.
LOAD_FILE_PATH = os.path.join(ROOT_PATH,'p_sections.csv')
# NOTE(review): neither *_SAVE_FILE_PATH decides where output lands in this
# cell. SEC_SAVE_FILE_PATH is only referenced by a commented-out call, and
# save_file() ignores the file_path argument that ADM_SAVE_FILE_PATH is passed as.
SEC_SAVE_FILE_PATH = os.path.join(ROOT_PATH,'p_sections.txt')
ADM_SAVE_FILE_PATH = os.path.join(ROOT_PATH,'p_hadm_ids.txt')

def load_csv_file(file_path):
    """Load the CSV stored at ``file_path`` and hand back the resulting DataFrame."""
    return pd.read_csv(file_path)


def save_file(data, file_path, txt=False):
    """Serialise (hadm_id, text) pairs with ``torch.save``.

    Args:
        data: indexable pair; ``data[0]`` and ``data[1]`` are iterated in
            lockstep and stored as a list of 2-tuples.
        file_path: accepted but not used. The output always goes to
            ``ROOT_PATH/p_sections``.
        txt: accepted but not used.

    Returns:
        None. A confirmation message is printed after saving.
    """
    hadm_ids, texts = data[0], data[1]
    records = list(zip(hadm_ids, texts))
    destination = os.path.join(ROOT_PATH, 'p_sections')
    torch.save(records, destination)
    print('save successfully!')
    return None


def preprocess_scispacy(nlp, section_text):
    """Tokenise ``section_text`` with ``nlp`` and re-join the tokens with single spaces.

    Args:
        nlp: callable that takes a string and returns an iterable of objects
            exposing a ``.text`` attribute.
        section_text: text to tokenise.

    Returns:
        The token texts joined by one space each.
    """
    token_texts = (token.text for token in nlp(section_text))
    return ' '.join(token_texts)


def main():
    """Filter the extracted sections, attach HADM_ID and save the pairs.

    Steps:
        1. take the in-memory ``new_data`` frame (ROW_ID + TEXT dicts),
        2. drop rows with missing TEXT and rows whose sections hold 5 words
           or fewer in total,
        3. look up HADM_ID for each remaining ROW_ID in NOTEEVENTS,
        4. hand the (hadm_id, TEXT) pairs to ``save_file``.

    NOTE: this reads the global ``new_data`` built by the previous cell, so
    that cell must have been run in the same kernel. The commented-out
    alternative of reloading the CSV would presumably return TEXT as strings
    rather than dicts, which the ``.values()`` call below cannot handle
    (TODO confirm before switching).
    """
#     data       = load_csv_file(file_path=LOAD_FILE_PATH)
    data = new_data

    # not na
    data = data[data.TEXT.notna()]

    # keep notes whose extracted sections total more than 5 whitespace-separated words
    data = data[data.TEXT.apply(lambda x: sum(len(elem.split()) for elem in x.values()) > 5)]
    print('preprocess successfully!')

    # recover the admission id (HADM_ID) of every remaining note
    noteevents = load_csv_file(file_path=NOTE_PATH)
    noteevents = noteevents[['ROW_ID', 'SUBJECT_ID', 'HADM_ID']]

    # Join on ROW_ID so that every TEXT is paired with the HADM_ID of its own
    # row, instead of relying on two separately filtered frames happening to
    # share the same row order. An inner merge keeps the order of `data`.
    data1 = data.merge(noteevents, on='ROW_ID', how='inner')

    print(f'data len: {len(data)}')
    hadm_id = data1.HADM_ID.astype(int).astype(str)
    print(f'data1 len: {len(hadm_id)}')
    save_file(data=(hadm_id, data1.TEXT), file_path=ADM_SAVE_FILE_PATH, txt=True)


if __name__ == '__main__':
    main()

preprocess successfully!




data len: {} 32919
data1 len: {} 32919
save successfully!
