# Post-processing CLAMP Output files

We took the outputs from CLAMP (Clinical Language Annotation, Modeling, and Processing) and post-processed the output by 1) cleaning the data, 2) processing the different section headings, 3) formatting it for input into our models

In [1]:
from google.cloud import storage
import pandas as pd
import numpy as np
import re

## Extract pid/nid from title and create column

We are extracting the patient ID (pid) and note ID (nid) to concatenate the notes per patient.

In [2]:
def extract_ids(file_name, df):
    pid = re.search("(?<=pid)[0-9]*(?=-nid)", file_name).group()
    nid = re.search("(?<=nid)[0-9]*(?=\.txt)", file_name).group()
    # Add column
    df.insert(loc = 0, column = 'PID', value = pid)
    df.insert(loc = 1, column = 'NID', value = nid)
    return df

## Processing Section Headers
- Select the first detection of a section in the notes and remove duplicates. 
- Assign all the terms that follow each section heading into that section (add new column for dataframe)
- Combine specific section labels into categories

In [14]:
# Processes the dataframe to place names entities into sections
def process_section_headers(df):
    section_specific_list = ['ALLERGY','AP','ASSESSMENT','ATTESTATION','CALL','CHIEF_COMPLAINT','CONSENT','CURRENT_MEDICATIONS',
                             'ENCOUNTER_DIAGNOSES','EXAM','EYE_EXAM','EYE_MEDICATIONS','FAMILY_HX','HPI','INSTRUCTION','INTERPRETATION',
                             'MEDICATION','OCULAR_HISTORY','PAST_SURGICAL_HISTORY','PLAN','PMH','PROBLEM_LIST','RISK','ROS','SLE',
                             'SOCIAL_HX','VISUAL_ACUITY']
    section_category_list = ["AP","INSTRUCTION","CC/HPI","HISTORY","ALLERGY","EXAM","ROS","MEDICATION",
                         "FAM_SOC_HX","INSTRUCTION","ATTESTATION","CONSENT","INTERPRETATION"]
    specific_to_category_dict = {'ALLERGY':'ALLERGY','AP':'AP','ASSESSMENT':'AP','ATTESTATION':'ATTESTATION','CALL':'INSTRUCTION',
                                 'CHIEF_COMPLAINT':'CC/HPI','CONSENT':'CONSENT','CURRENT_MEDICATIONS':'MEDICATION',
                                 'ENCOUNTER_DIAGNOSES':'AP','EXAM':'EXAM','EYE_EXAM':'EXAM','EYE_MEDICATIONS':'MEDICATION',
                                 'FAMILY_HX':'FAM_SOC_HX','HPI':'CC/HPI','INSTRUCTION':'INSTRUCTION','INTERPRETATION':'INTERPRETATION',
                                 'MEDICATION':'MEDICATION','OCULAR_HISTORY':'HISTORY','PAST_SURGICAL_HISTORY':'HISTORY','PLAN':'AP',
                                 'PMH':'HISTORY','PROBLEM_LIST':'HISTORY','RISK':'CONSENT','ROS':'ROS','SLE':'EXAM',
                                 'SOCIAL_HX':'FAM_SOC_HX','VISUAL_ACUITY':'EXAM'}
    
    # Create new columns in the dataframe
    df['Section_Header'] = ''
    df['Section_Category'] = ''

    #Isolate section header entities
    df_headers = df.loc[df['Semantic'].isin(section_specific_list)]

    # Remove duplicated section headers
    df_headers_nodup = df_headers.drop_duplicates(subset='Semantic', keep='first')
    df_headers_nodup = df_headers_nodup.reset_index(drop=True)
#     print("---------------- Section headers: --------------------")
#     display(df_headers_nodup)

    #if no section headers exist
    if (df_headers_nodup.shape[0] <= 0):
        return df
    
    #Assign entities that follow each section heading into that section
    header_index = 0
    for index, row in df.iterrows():
        curr_header = df_headers_nodup.iloc[header_index]
       
        # 0 to n-1 sections
        if (header_index < df_headers_nodup.shape[0]-1):
            next_header = df_headers_nodup.iloc[header_index+1]
            
            # Captures edge case where the first entity is not a section header
            if (row['Start'] < curr_header['Start']):
                continue
            
            # Adds the section header to the dataframe
            df.at[index,'Section_Header']= curr_header['Semantic']
            
            # Iterates
            if (row['Start'] >= next_header['Start']):
                header_index += 1
        
        # Set all remaining entities to final section
        else:
            df.at[index,'Section_Header']= curr_header['Semantic']
    
    #Combine specific headings into broad categories
    df['Section_Category'] = df['Section_Header'].map(specific_to_category_dict)

    return df

## Remove unnecessary terms

- Remove the entities that are under the categories: Allergy, Attestation, Instruction, Consent, Fam_Soc_Hx
- Remove rows that do not have CUI's

In [15]:
def remove_unnecessary_terms(df):
    # Remove entities that are under specific categories
    unnecessary_categories = ['ALLERGY','ATTESTATION','INSTRUCTION','CONSENT','FAM_SOC_HX']
    index_to_remove = df[(df['Section_Category'].isin(unnecessary_categories))].index
#     print(' ------------------- Entities that are removed ---------------------------')
#     display(df.iloc[index_to_remove])
    df = df.drop(index_to_remove)
    
    # Remove rows that have null CUI's
    df = df.dropna(subset=['CUI'])
    df = df.reset_index(drop=True)
    return df

## Clean Columns 
- Remove all the unnecessary information (RxNorm, Generic) from the CUI column
- Fix blank assertion column- if they are blank, set as present

In [16]:
def clean_columns(df):
    #Create new column for RxNorm and Generic codes
    df = pd.concat([df, df['CUI'].str.split(',', expand=True)], axis=1)
    df.rename({0: "CUI_clean", 
               1: "RxNorm", 
               2: "Generic"}, 
              axis = "columns", inplace = True)
    
    for index, row in df.iterrows():
        # Some have RxNorm/Generic codes but not CUI codes
        if not (row['CUI_clean'] == 'null' or row['CUI_clean'].startswith('C')):
            value = row['CUI']
            df.at[index,'CUI_clean'] = "null"
            split = value.split(',')
            df.at[index,'RxNorm'] = split[0]
            df.at[index,'Generic'] = split[1]
    
        # Clean assertion column by setting all blank assertions to present (drugs are default present)
        if pd.isnull(row['Assertion']):
            df.at[index,'Assertion'] = "present"
    
    return df

# Main: Run the postprocessing pipeline

In [20]:
client = storage.Client()

blobs = client.list_blobs('stanfordoptimagroup', prefix="STRIDE/lowva/clampnoteoutput/")

print("Blobs:")

count = 0
num_rows = 0
for blob in blobs:
    if(count%100 == 0):
        print("Loading File Number: ", count)
        print("Loading File: ", blob.name)
    
    if(blob.name).endswith(".txt"):
        count += 1
        df = pd.read_csv('gs://stanfordoptimagroup/' + blob.name, delimiter = "\t", engine = "python", error_bad_lines = False, warn_bad_lines = True)
        df = extract_ids(blob.name, df)
        df = process_section_headers(df)
        df = remove_unnecessary_terms(df)
        df_processed = clean_columns(df)
        
        num_rows += df_processed.shape[0]
        
        #for first iteration, df_final does not exist
        if(count == 1):
            df_final = df_processed
            continue
        else:
            df_final = df_final.append(df_processed, ignore_index=True)
    else:
        print("............... File is not a text output and is skipped")

df_final.to_csv("clamp_output_postprocessing.csv", index=False)

Blobs:
Loading File Number:  0
Loading File:  STRIDE/lowva/clampnoteoutput/00003-pid1790-nid174426007.txt
Loading File Number:  100
Loading File:  STRIDE/lowva/clampnoteoutput/00486-pid56175-nid679279630.txt
Loading File Number:  200
Loading File:  STRIDE/lowva/clampnoteoutput/00917-pid81747-nid386388251.txt
Loading File Number:  300
Loading File:  STRIDE/lowva/clampnoteoutput/01196-pid93394-nid575209722.txt
Loading File Number:  400
Loading File:  STRIDE/lowva/clampnoteoutput/01538-pid110543-nid299035220.txt


Skipping line 6: '	' expected after '"'


Loading File Number:  500
Loading File:  STRIDE/lowva/clampnoteoutput/02064-pid145293-nid82661986.txt
Loading File Number:  600
Loading File:  STRIDE/lowva/clampnoteoutput/02470-pid176150-nid734602488.txt
Loading File Number:  700
Loading File:  STRIDE/lowva/clampnoteoutput/02820-pid211787-nid115155499.txt
Loading File Number:  800
Loading File:  STRIDE/lowva/clampnoteoutput/03221-pid224319-nid376453471.txt
Loading File Number:  900
Loading File:  STRIDE/lowva/clampnoteoutput/03549-pid246050-nid516467866.txt
Loading File Number:  1000
Loading File:  STRIDE/lowva/clampnoteoutput/03902-pid259695-nid57417100.txt
Loading File Number:  1100
Loading File:  STRIDE/lowva/clampnoteoutput/04226-pid278693-nid346238080.txt
Loading File Number:  1200
Loading File:  STRIDE/lowva/clampnoteoutput/04562-pid296165-nid170205445.txt
Loading File Number:  1300
Loading File:  STRIDE/lowva/clampnoteoutput/05009-pid333803-nid208981295.txt
Loading File Number:  1400
Loading File:  STRIDE/lowva/clampnoteoutput/

Skipping line 31: '	' expected after '"'


Loading File Number:  1700
Loading File:  STRIDE/lowva/clampnoteoutput/06587-pid413286-nid320109054.txt
Loading File Number:  1800
Loading File:  STRIDE/lowva/clampnoteoutput/06954-pid427022-nid63976067.txt
Loading File Number:  1900
Loading File:  STRIDE/lowva/clampnoteoutput/07512-pid444502-nid495896482.txt
Loading File Number:  2000
Loading File:  STRIDE/lowva/clampnoteoutput/07892-pid471731-nid73635060.txt
Loading File Number:  2100
Loading File:  STRIDE/lowva/clampnoteoutput/08286-pid492700-nid96878766.txt
Loading File Number:  2200
Loading File:  STRIDE/lowva/clampnoteoutput/08590-pid516531-nid101145861.txt
Loading File Number:  2300
Loading File:  STRIDE/lowva/clampnoteoutput/08926-pid520646-nid140615092.txt
Loading File Number:  2400
Loading File:  STRIDE/lowva/clampnoteoutput/09215-pid541535-nid267150618.txt
Loading File Number:  2500
Loading File:  STRIDE/lowva/clampnoteoutput/09619-pid573138-nid83386228.txt
Loading File Number:  2600
Loading File:  STRIDE/lowva/clampnoteoutp

Skipping line 21: '	' expected after '"'


Loading File Number:  5900
Loading File:  STRIDE/lowva/clampnoteoutput/22519-pid1401541-nid118096328.txt
Loading File Number:  6000
Loading File:  STRIDE/lowva/clampnoteoutput/22987-pid1426251-nid159178460.txt
Loading File Number:  6100
Loading File:  STRIDE/lowva/clampnoteoutput/23357-pid1439363-nid264828810.txt


Skipping line 2: unexpected end of data


Loading File Number:  6200
Loading File:  STRIDE/lowva/clampnoteoutput/23726-pid1459995-nid90331397.txt
Loading File Number:  6300
Loading File:  STRIDE/lowva/clampnoteoutput/24094-pid1472402-nid56513229.txt
Loading File Number:  6400
Loading File:  STRIDE/lowva/clampnoteoutput/24558-pid1515514-nid91221716.txt
Loading File Number:  6500
Loading File:  STRIDE/lowva/clampnoteoutput/24953-pid1532293-nid547718221.txt
Loading File Number:  6600
Loading File:  STRIDE/lowva/clampnoteoutput/25399-pid1554539-nid82024571.txt
Loading File Number:  6700
Loading File:  STRIDE/lowva/clampnoteoutput/25776-pid1583271-nid490472634.txt
Loading File Number:  6800
Loading File:  STRIDE/lowva/clampnoteoutput/26298-pid1607461-nid415490110.txt
Loading File Number:  6900
Loading File:  STRIDE/lowva/clampnoteoutput/26652-pid1624964-nid87406815.txt
Loading File Number:  7000
Loading File:  STRIDE/lowva/clampnoteoutput/27040-pid1642817-nid506994828.txt
Loading File Number:  7100
Loading File:  STRIDE/lowva/clamp

Skipping line 58: '	' expected after '"'


Loading File Number:  9700
Loading File:  STRIDE/lowva/clampnoteoutput/37220-pid2164115-nid360294578.txt
Loading File Number:  9800
Loading File:  STRIDE/lowva/clampnoteoutput/37711-pid2191575-nid66044167.txt
Loading File Number:  9900
Loading File:  STRIDE/lowva/clampnoteoutput/38039-pid2211936-nid581180657.txt
Loading File Number:  10000
Loading File:  STRIDE/lowva/clampnoteoutput/38332-pid2220192-nid608134431.txt


Skipping line 28: unexpected end of data


Loading File Number:  10100
Loading File:  STRIDE/lowva/clampnoteoutput/38701-pid2244827-nid207698844.txt
Loading File Number:  10200
Loading File:  STRIDE/lowva/clampnoteoutput/38981-pid2255694-nid59229747.txt
Loading File Number:  10300
Loading File:  STRIDE/lowva/clampnoteoutput/39341-pid2263008-nid169428966.txt
Loading File Number:  10400
Loading File:  STRIDE/lowva/clampnoteoutput/39627-pid2276066-nid329839948.txt
Loading File Number:  10500
Loading File:  STRIDE/lowva/clampnoteoutput/39998-pid2301889-nid312183764.txt
Loading File Number:  10600
Loading File:  STRIDE/lowva/clampnoteoutput/40418-pid2309512-nid502713524.txt
Loading File Number:  10700
Loading File:  STRIDE/lowva/clampnoteoutput/40873-pid2339972-nid79700391.txt
Loading File Number:  10800
Loading File:  STRIDE/lowva/clampnoteoutput/41296-pid2350164-nid175836179.txt
Loading File Number:  10900
Loading File:  STRIDE/lowva/clampnoteoutput/41707-pid2357187-nid310699532.txt
Loading File Number:  11000
Loading File:  STRID

Skipping line 48: unexpected end of data


Loading File Number:  14500
Loading File:  STRIDE/lowva/clampnoteoutput/55127-pid3164800-nid62072539.txt
Loading File Number:  14600
Loading File:  STRIDE/lowva/clampnoteoutput/55476-pid3169722-nid419274090.txt
Loading File Number:  14700
Loading File:  STRIDE/lowva/clampnoteoutput/55860-pid3194675-nid593802012.txt
Loading File Number:  14800
Loading File:  STRIDE/lowva/clampnoteoutput/56263-pid3220298-nid694381056.txt
Loading File Number:  14900
Loading File:  STRIDE/lowva/clampnoteoutput/56750-pid3260791-nid392416602.txt
Loading File Number:  15000
Loading File:  STRIDE/lowva/clampnoteoutput/57034-pid3272371-nid96186883.txt
Loading File Number:  15100
Loading File:  STRIDE/lowva/clampnoteoutput/57399-pid3285956-nid242085143.txt
Loading File Number:  15200
Loading File:  STRIDE/lowva/clampnoteoutput/57701-pid3295247-nid69720719.txt
Loading File Number:  15300
Loading File:  STRIDE/lowva/clampnoteoutput/58121-pid3326703-nid228644003.txt
Loading File Number:  15400
Loading File:  STRIDE

In [21]:
pd.set_option("max_rows", 20)
display(df_final)
print(num_rows)

Unnamed: 0,PID,NID,Start,End,Semantic,CUI,Assertion,Entity,Section_Header,Section_Category,CUI_clean,RxNorm,Generic
0,1790,174426007,279,303,test,C0154863,present,vitreoretinal evaluation,HPI,CC/HPI,C0154863,,
1,1790,174426007,329,361,problem,C0242383,present,age-related macular degeneration,HPI,CC/HPI,C0242383,,
2,1790,174426007,366,375,problem,C0086543,present,cataracts,HPI,CC/HPI,C0086543,,
3,1790,174426007,442,471,problem,C0586742,present,increasing difficulty reading,HPI,CC/HPI,C0586742,,
4,1790,174426007,508,521,problem,C0743410,absent,acute changes,HPI,CC/HPI,C0743410,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
882913,3843115,228200434,8612,8627,treatment,C0022209,present,HANDIHALER INH),CURRENT_MEDICATIONS,MEDICATION,C0022209,,
882914,3843115,228200434,8673,8691,drug,"C0947644, RxNorm=[287524], Generic=[4603]",present,furosemide (LASIX),CURRENT_MEDICATIONS,MEDICATION,C0947644,RxNorm=[287524],Generic=[4603]
882915,3843115,228200434,8749,8756,drug,"C0004057, RxNorm=[1191], Generic=[1191]",present,aspirin,CURRENT_MEDICATIONS,MEDICATION,C0004057,RxNorm=[1191],Generic=[1191]
882916,3843115,228200434,8992,9001,drug,"C0001927, RxNorm=[435], Generic=[435]",present,albuterol,CURRENT_MEDICATIONS,MEDICATION,C0001927,RxNorm=[435],Generic=[435]


882918
