In [94]:
!pip install docx2txt
!pip install bloatectomy
!pip install python-docx



In [95]:
import nltk as nltk
import pandas as pd
import numpy as np
import pylab as pl
import docx2txt
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.collocations import *
from operator import itemgetter
from bloatectomy import bloatectomy
import sys
import os
import re
from os import listdir
from os.path import isfile, join
nltk.download('stopwords')
nltk.download('punkt')

REMOTE_DESKTOP = False
# REMOTE_DESKTOP = True

if REMOTE_DESKTOP:
    base_path = "N:\\DNA\\DSA\\NLP\\N-Grams\\"
    sys.path.append(f"{base_path}Code_Library\\modules\\python_fnts")
    database = 'cfqdw'
    server = 'CFQSQLPRD01DAR'
else:
    base_path = "C:/Users/hocke/OneDrive/Desktop/EricZacharia/03-Education/02-GraduateSchool/01-UChicago/01-Quarters/05-Summer2021/UChicagoMed/NLP/N-Grams/"

while True:
    INPT = input(f"What data? (c) cardiology?, (o) original?, (d) development?")
    path = f"{base_path}data"

    if INPT == "d":
        data_file = "data_development.csv"
        break
    elif INPT == "c":
        data_file = "cardiology_data.csv"
        meds_file = "meds.csv"
        labs_file = "labs.csv"
        icds_file = "icds.csv"
        break
    elif INPT == "o":
        data_file = "data.csv"
        break
    else:
        print("Invalid response.")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hocke\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hocke\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [96]:
def load_note_text(df):
    '''
    Returns a list of strings (individual patient notes)
    '''
    if INPT == "o" or INPT == "d":
        correct_diagnosis = ' '.join(df.loc[df["outcome"] == 1]['note_text'])  
        misdiagnosis = ' '.join(df.loc[df["outcome"] == 0]['note_text'])
        return correct_diagnosis, misdiagnosis
    elif INPT == "c":
        # Change the cardiology column names for notes and encounter id to match columns for different data sets
        df.rename(columns={"merged_notes": "note_text"}, inplace=True)
        had_Cardiac_Arrhythmia = ' '.join(df.loc[df["had_Cardiac_Arrhythmia"] == 1]['note_text'])
        not_Cardiac_Arrhythmia = ' '.join(df.loc[df["had_Cardiac_Arrhythmia"] == 0]['note_text'])
        had_CHF = ' '.join(df.loc[df["had_CHF"] == 1]['note_text'])
        not_CHF = ' '.join(df.loc[df["had_CHF"] == 0]['note_text'])
        had_Cardiomyopathy = ' '.join(df.loc[df["had_Cardiomyopathy"] == 1]['note_text'])
        not_Cardiomyopathy = ' '.join(df.loc[df["had_Cardiomyopathy"] == 0]['note_text'])
        return had_Cardiac_Arrhythmia, had_CHF, had_Cardiomyopathy, not_Cardiac_Arrhythmia, not_CHF, not_Cardiomyopathy
    elif INPT == "m":
        # Load note files from folder (exclude last readme file)
        files = [f for f in listdir(path) if isfile(join(path, f))][:-1]
        notes = []
        for file in files:
            with open(f"{path}/{file}") as f:
                # Ignore the first five lines (Dates and "History of Present Illness") (some files include these lines up to line 7...)
                for _ in range(5):
                    f.readline()
                lst = f.read().splitlines()  # remove line endings
                note = ""
                for line in lst:
                    note += line + " "
                notes.append(note)
        correct_diagnosis = ' '.join(notes[:len(notes)//2])  
        misdiagnosis = ' '.join(notes[len(notes)//2:]) 
        return correct_diagnosis, misdiagnosis
    else:
        print("Error. Undefined data source.")       


def remove_bloat(text, style="remov", output="docx", filename="bloatectomized_file", path=" "):
    '''
    input_text: file, str, list
    An input document (.txt, .rtf, .docx), a string of text, or list of hadm_ids for postgres mimiciii database or the raw text.

    style: str, optional, default=highlight
    How to denote a duplicate. The following are allowed: highlight, bold, remov.

    output: str, optional, default=html
    Type of marked output file as an html or a word document (docx). The following are allowed: html, docx.

    filename: str, optional, default=bloatectomized_file A string to name output file of the marked document.

    path: str, optional, default=' '
    The directory for output files.
    '''
    text = re.sub(r"[\x00-\x08\x0b\x0e-\x1f\x7f]", "", text) # Remove control characters to be XML compatible
    bloatectomy(text, style=style, output=output, filename=filename, path=path)
    return docx2txt.process(f"{path}{filename}.docx")


def contains_sublist(lst, sublst):
    n = len(sublst)
    return any((sublst == lst[i:i+n]) for i in xrange(len(lst)-n+1))


def text2tokens(text):
    """
        Opens a file and converts it to tokens

        Parameter
        ---------    
        filename: str
                The file to open

        Return
        ------
        list
               A list of strings
    """

    tokens = nltk.word_tokenize(text)
    return nltk.Text(tokens)


def get_bigrams(tokens, bigram_measures, filter_value=1):
    finder_bigrams = BigramCollocationFinder.from_words(tokens)
    finder_bigrams.apply_freq_filter(filter_value)
    finder_bigrams_scored = finder_bigrams.score_ngrams(
        bigram_measures.mi_like)

    bg_scored = pd.DataFrame(
        [x for x in finder_bigrams_scored], columns=['bigram', 'score'])
    bg_count = pd.DataFrame(
        finder_bigrams.ngram_fd.items(), columns=['bigram', 'count'])
    return pd.merge(bg_scored, bg_count, on='bigram', how='left')


def get_trigrams(tokens, trigram_measures, filter_value=0):
    finder_trigrams = TrigramCollocationFinder.from_words(tokens)
    finder_trigrams.apply_freq_filter(filter_value)
    finder_trigrams_scored = finder_trigrams.score_ngrams(
        trigram_measures.mi_like)

    tg_scored = pd.DataFrame(
        [x for x in finder_trigrams_scored], columns=['trigram', 'score'])
    tg_count = pd.DataFrame(
        finder_trigrams.ngram_fd.items(), columns=['trigram', 'count'])

    return pd.merge(tg_scored, tg_count, on='trigram', how='left')


def get_ngrams(text_pos, text_neg, num_ngrams=50):
    '''
    text_pos = list of list of strings
    text_neg = list of list of strings
    num_ngrams = the top number of ngrams to be returned in each of the four ngram types

    return four lists of lists of strings, where each string is a ngram
    '''
    # create new FamishedNotes folder if it doesn't exist
    # Remove Bloat (Doesn't remove all bloat as seen in with style="highlight")
    if not os.path.exists(f'./FamishedNotes'):
        os.makedirs(f'./FamishedNotes')
    text_pos_famished = remove_bloat(text_pos, style="remov", output="docx", filename=f"text_pos_famished", path=path[:-(len(path) - len(base_path))]+"FamishedNotes/")
    text_neg_famished = remove_bloat(text_neg, style="remov", output="docx", filename=f"text_neg_famished", path=path[:-(len(path) - len(base_path))]+"FamishedNotes/")

    # define stop words
    stopwords = nltk.corpus.stopwords.words('english')+['.', ',', ':', '/', '\\', '>', '<', '?', '!', '[', ']', '*', '-', '(', ')']
    # define ngram engines
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    trigram_measures = nltk.collocations.TrigramAssocMeasures()

    # remove stop words from text
    tokens_pos = text2tokens(text_pos_famished)
    content_pos = [w for w in tokens_pos if w.lower() not in stopwords]
    tokens_neg = text2tokens(text_neg_famished)
    content_neg = [w for w in tokens_neg if w.lower() not in stopwords]

    # Find bi/trigrams
    bi_pos = get_bigrams(content_pos, bigram_measures, 3)
    bi_neg = get_bigrams(content_neg, bigram_measures, 25)
    tri_pos = get_trigrams(content_pos, trigram_measures, 3)
    tri_neg = get_trigrams(content_neg, trigram_measures, 25)
    bi_pos.plot(x='count', y='score', kind='scatter')
    bi_neg.plot(x='count', y='score', kind='scatter')
    tri_pos.plot(x='count', y='score', kind='scatter')
    tri_neg.plot(x='count', y='score', kind='scatter')

    top_n = 3
    print(f"Top {top_n} bigrams for correctly diagnosed notes: \n", bi_pos[:top_n])
    print()
    print(f"Top {top_n} bigrams for misdiagnosed notes: \n", bi_neg[:top_n])
    print()
    print(f"Top {top_n} trigrams for correctly diagnosed notes: \n", tri_pos[:top_n])
    print()
    print(f"Top {top_n} trigrams for misdiagnosed notes: \n", tri_neg[:top_n])

    # Create a unique set of ngrams by polarity with scores of at least 10 and 0.2
    set_bi_pos = set(bi_pos[bi_pos.score >= 10]['bigram'])
    set_bi_neg = set(bi_neg[bi_neg.score >= 10]['bigram'])
    set_tri_pos = set(tri_pos[tri_pos.score >= 0.2]['trigram'])
    set_tri_neg = set(tri_neg[tri_neg.score >= 0.2]['trigram'])

    # Select ngrams by polarity with scores of at least 10 and 0.2 that don't exist in the opposite polarity set
    select_bi_pos = [x for x in bi_pos[bi_pos.score >= 10]['bigram'] if x not in set_bi_neg]
    select_bi_neg = [x for x in bi_neg[bi_neg.score >= 10]['bigram'] if x not in set_bi_pos]
    select_tri_pos = [x for x in tri_pos[tri_pos.score >= 0.2]['trigram'] if x not in set_tri_neg]
    select_tri_neg = [x for x in tri_neg[tri_neg.score >= 0.2]['trigram'] if x not in set_tri_pos]

    # Limit to top {num_ngrams} or less
    select_bi_neg = select_bi_neg[:num_ngrams] if len(select_bi_neg) > num_ngrams else select_bi_neg
    select_bi_pos = select_bi_pos[:num_ngrams] if len(select_bi_pos) > num_ngrams else select_bi_pos
    select_tri_neg = select_tri_neg[:num_ngrams] if len(select_tri_neg) > num_ngrams else select_tri_neg
    select_tri_pos = select_tri_pos[:num_ngrams] if len(select_tri_pos) > num_ngrams else select_tri_pos
    
    return select_tri_pos, select_tri_neg, select_bi_pos, select_bi_neg


In [97]:
num_ngrams = 50

# Load data to be split for training and testing
df_notes = pd.read_csv(f"{path}/{data_file}", encoding="ISO-8859-1")
# remove all special characters and digits
df_notes.replace([r'[^\x00-\x7F]', r'[\d+]'], ['', ''], regex=True, inplace=True)
# remove commonly found words that may be causing adverse effects on prediction accuracy
df_notes.replace(['Value', 'Date/Time', 'Neg', 'Hx', 'Outcome', 'Maintaining'], ['', '', '', '', '', ''], regex=True, inplace=True)
print('notes ', df_notes.shape)

if INPT == "c":
    df_notes.rename(columns={"merged_notes": "note_text"}, inplace=True)
    df_meds = pd.read_csv(f"{path}/{meds_file}")
    df_labs = pd.read_csv(f"{path}/{labs_file}")
    df_icds = pd.read_csv(f"{path}/{icds_file}")
    df_icds.rename(columns={"Encounter_Key": "encounter_key"}, inplace=True)

    print('meds ', df_meds.shape, '\n', df_meds.columns)
    print('labs ', df_labs.shape, '\n', df_labs.columns)
    print('icds ', df_icds.shape, '\n', df_icds.columns)


notes  (73032, 9)


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


meds  (834669, 11) 
 Index(['Unnamed: 0', 'pat_enc_csn_id', 'pat_id', 'medication_id',
       'description', 'refills', 'quantity', 'DISCON_TIME', 'ORDER_START_TIME',
       'ORDER_end_TIME', 'order_status'],
      dtype='object')
labs  (5746394, 5) 
 Index(['Unnamed: 0', 'COMPONENT_ID', 'pat_enc_csn_id', 'RESULT_TIME',
       'ORD_VALUE'],
      dtype='object')
icds  (838502, 6) 
 Index(['Unnamed: 0', 'Encounter_Diagnosis_Key', 'encounter_key',
       'Diagnosis_Key', 'ICDDiagnosis_Code_NoDecimal', 'ICDDiagnosis_Name'],
      dtype='object')


In [98]:
from collections import defaultdict
# Manipulate the dataset to merge notes with the same encounter ID.
unique_encounters = defaultdict()

for i in range(len(df_notes["note_text"])):
    # concatenate note string into existing note string of identical encounter id.
    if INPT == "c":
        if df_notes["encounter_key"][i] in unique_encounters:
            unique_encounters[df_notes["encounter_key"]
                            [i]][1] += df_notes["note_text"][i]
        else:
            unique_encounters[df_notes["encounter_key"][i]] = [
                df_notes["pat_enc_csn_id"][i], df_notes["note_text"][i], df_notes["had_Cardiac_Arrhythmia"][i], df_notes["had_CHF"][i], df_notes["had_Cardiomyopathy"][i]]
    else:
        # create a new row of data for the unique encounter id.
        if df_notes["pat_enc_csn_id"][i] in unique_encounters:
            unique_encounters[df_notes["pat_enc_csn_id"]
                              [i]][0] += df_notes["note_text"][i]
        else:
            unique_encounters[df_notes["pat_enc_csn_id"][i]] = [
                df_notes["note_text"][i], df_notes["outcome"][i]]

if INPT == "c":
    df_notes = pd.DataFrame.from_dict(unique_encounters, orient='index', columns=[
        'pat_enc_csn_id', 'note_text', "had_Cardiac_Arrhythmia", "had_CHF", "had_Cardiomyopathy"]).reset_index()
    df_notes.rename(columns={'index': 'encounter_key'}, inplace=True)

else:
    df_notes = pd.DataFrame.from_dict(unique_encounters, orient='index', columns=[
                                'note_text', 'outcome']).reset_index()

df_notes

Unnamed: 0,encounter_key,pat_enc_csn_id,note_text,had_Cardiac_Arrhythmia,had_CHF,had_Cardiomyopathy
0,17859200,76091112,Infection Control is granting visitation to th...,1,0,0
1,18334506,76493784,Jamie St Clair came to neurovascular clinic on...,0,0,0
2,18824835,76946216,"February , Cristina Tumacder, MD Eagle...",0,0,0
3,18835001,76968972,CARDIOLOGY CLINIC NOTE Type of note: Return...,0,0,0
4,18976224,77087831,CARDIOLOGY CLINIC NOTE Type of note: Return...,0,0,0
...,...,...,...,...,...,...
42921,77942437,92081248,Patient Ambulatory to Triage c/o Suicidal idea...,0,0,0
42922,77942438,92080854,Patient c/o Right Arm and Right Shoulder pain ...,0,0,0
42923,78105470,92081100,Patient c/o Right Inguinal Pain started yester...,0,0,0
42924,78141109,92096705,Attending Attestation Statement I personall...,0,0,0


In [99]:
def order_preserved_unique_list(seq, idfun=None):
   if idfun is None:
       def idfun(x): return x
   seen = {}
   result = []
   for item in seq:
       marker = idfun(item)
       if marker in seen:
          continue
       seen[marker] = 1
       result.append(item)
   return result

if INPT == "c":
    df_meds['prescribed_meds'] = df_meds['ORDER_end_TIME'] > df_meds['ORDER_START_TIME']
    df_meds.replace({'prescribed_meds': {True: 1, False: 0}}, inplace=True)
    # trouble figuring out how to remove rows with older start time, but different prescribed meds value
    df_meds_reduced = df_meds[['pat_enc_csn_id', 'medication_id', 'ORDER_START_TIME', 'prescribed_meds']].groupby(
        ['pat_enc_csn_id', 'prescribed_meds']).agg({'medication_id': lambda x: order_preserved_unique_list(x), 'ORDER_START_TIME': 'max'}).reset_index()
    df_meds_reduced = df_meds_reduced.sort_values(by=['ORDER_START_TIME'], ascending=False).drop_duplicates('pat_enc_csn_id', keep='first')
    print('meds reduced by pat_enc_csn_id',
          df_meds_reduced.shape, '\n', df_meds_reduced.columns)
    print()
    df_labs_reduced = df_labs[['pat_enc_csn_id', 'COMPONENT_ID', 'RESULT_TIME', 'ORD_VALUE']].groupby(
        ['pat_enc_csn_id']).agg({'COMPONENT_ID': lambda x: order_preserved_unique_list(x), 'ORD_VALUE': lambda x: order_preserved_unique_list(x), 'RESULT_TIME': 'max'}).reset_index()
    print('labs reduced by pat_enc_csn_id',
          df_labs_reduced.shape, '\n', df_labs_reduced.columns)
    print()
    df_icds_reduced = df_icds[['Encounter_Diagnosis_Key', 'encounter_key', 'Diagnosis_Key', 'ICDDiagnosis_Code_NoDecimal', 'ICDDiagnosis_Name']].groupby(['encounter_key']).agg(
        {'Encounter_Diagnosis_Key': lambda x: order_preserved_unique_list(x), 'Diagnosis_Key': lambda x: order_preserved_unique_list(x), 'ICDDiagnosis_Code_NoDecimal': lambda x: order_preserved_unique_list(x)}).reset_index()
    print('icds reduced by encounter_key',
          df_icds_reduced.shape, '\n', df_icds_reduced.columns)
    print()
    df_meds_labs_merged = df_meds_reduced.merge(df_labs_reduced, how='left', on='pat_enc_csn_id')
 
    print('reduced meds+labs merged by pat_enc_csn_id',
          df_meds_labs_merged.shape, '\n', df_meds_labs_merged.columns)
    print()
    df_notes_icds_merged = df_notes.merge(df_icds_reduced, how='left', on='encounter_key')
    print('reduced notes+icds merged by encounter_key',
      df_notes_icds_merged.shape, '\n', df_notes_icds_merged.columns)
    print()


meds reduced by pat_enc_csn_id (21260, 4) 
 Index(['pat_enc_csn_id', 'prescribed_meds', 'medication_id',
       'ORDER_START_TIME'],
      dtype='object')

labs reduced by pat_enc_csn_id (15858, 4) 
 Index(['pat_enc_csn_id', 'COMPONENT_ID', 'ORD_VALUE', 'RESULT_TIME'], dtype='object')

icds reduced by encounter_key (30021, 4) 
 Index(['encounter_key', 'Encounter_Diagnosis_Key', 'Diagnosis_Key',
       'ICDDiagnosis_Code_NoDecimal'],
      dtype='object')

reduced meds+labs merged by pat_enc_csn_id (21260, 7) 
 Index(['pat_enc_csn_id', 'prescribed_meds', 'medication_id',
       'ORDER_START_TIME', 'COMPONENT_ID', 'ORD_VALUE', 'RESULT_TIME'],
      dtype='object')

reduced notes+icds merged by encounter_key (42926, 9) 
 Index(['encounter_key', 'pat_enc_csn_id', 'note_text',
       'had_Cardiac_Arrhythmia', 'had_CHF', 'had_Cardiomyopathy',
       'Encounter_Diagnosis_Key', 'Diagnosis_Key',
       'ICDDiagnosis_Code_NoDecimal'],
      dtype='object')



In [100]:
df_meds_labs_merged

Unnamed: 0,pat_enc_csn_id,prescribed_meds,medication_id,ORDER_START_TIME,COMPONENT_ID,ORD_VALUE,RESULT_TIME
0,91798662,0,"[130623, 147268, 15769, 132397, 3627, 110517, ...",9/9/2021 9:00,"[2586.0, 1941.0, 1200.0, 1156.0, 1537.0, 1532....","[9.4, 5.07, 15.5, 46.7, 92.1, 30.6, 33.2, 13.2...",2021-09-06 20:26:00
1,91159504,0,"[10859, 140408, 61281, 19066, 16625, 121147, 3...",9/9/2021 9:00,"[1440.0, 2408.0, 190.0, 373.0, 333.0, 408.0, 2...","[16.0, 7.5, 3.4, 0.6, 0.2, 0.4, 89.0, 25.0, 15...",2021-09-06 20:56:00
2,91640780,0,"[25576, 12165, 41438, 40481, 39924, 8221, 1416...",9/9/2021 9:00,"[1620.0, 408.0, 333.0, 1076.0, 1611.0, 1415.0,...","[2.1, 1.7, 0.4, 129.0, 133.0, 4.4, 100.0, 22.0...",2021-09-13 06:29:00
3,92004632,0,"[119413, 10350, 3894, 13252, 40534, 15360, 701...",9/9/2021 9:00,"[304.0, 587.0, 3685.0, 3686.0, 3687.0, 3768.0,...","[4.64, 150.0, 82.0, 274.0, 432.0, -28.0, 153.0...",2021-09-06 18:30:00
4,91599714,0,"[45147, 685, 12697, 29646, 63344, 134, 141675,...",9/9/2021 9:00,"[4128.0, 4129.0, 4130.0, 4131.0, 1892.0, 1347....","[100.0, 7.491, 38.4, 39.4, 19.4, 1.7, 144.0, 3...",2021-09-06 14:25:00
...,...,...,...,...,...,...,...
21255,86191978,1,"[38420, 9604, 90505]",1/1/2021 2:03,"[2586.0, 1941.0, 1200.0, 1156.0, 1537.0, 1532....","[7.7, 4.46, 13.1, 39.0, 87.4, 29.4, 33.6, 12.4...",2021-01-01 02:33:00
21256,86193374,1,[142429],1/1/2021 19:30,"[2586.0, 1941.0, 1200.0, 1156.0, 1537.0, 1532....","[6.0, 4.65, 13.9, 41.1, 88.4, 29.9, 33.8, 12.7...",2021-01-01 17:17:00
21257,86193422,0,"[133779, 18631]",1/1/2021 19:30,"[1076.0, 1611.0, 1415.0, 543.0, 591.0, 1312.0,...","[89.0, 138.0, 4.1, 103.0, 26.0, 9.0, 0.7, 125....",2021-01-01 18:40:00
21258,85979680,0,[10249],1/1/2021 16:00,"[1076.0, 1611.0, 1415.0, 543.0, 591.0, 1312.0,...","[175.0, 139.0, 4.2, 104.0, 25.0, 10.0, 7.0, 0....",2021-01-01 16:27:00


In [101]:
df_notes_icds_merged

Unnamed: 0,encounter_key,pat_enc_csn_id,note_text,had_Cardiac_Arrhythmia,had_CHF,had_Cardiomyopathy,Encounter_Diagnosis_Key,Diagnosis_Key,ICDDiagnosis_Code_NoDecimal
0,17859200,76091112,Infection Control is granting visitation to th...,1,0,0,"[32949078, 36805138, 17033447, 17193988, 40925...","[871242, 808139, 855918, 825956, 836445, 83471...","[J690, B373, S0990XA, Z978, Z950, J90, S27321A..."
1,18334506,76493784,Jamie St Clair came to neurovascular clinic on...,0,0,0,,,
2,18824835,76946216,"February , Cristina Tumacder, MD Eagle...",0,0,0,[36683069],[825404],[H900]
3,18835001,76968972,CARDIOLOGY CLINIC NOTE Type of note: Return...,0,0,0,,,
4,18976224,77087831,CARDIOLOGY CLINIC NOTE Type of note: Return...,0,0,0,,,
...,...,...,...,...,...,...,...,...,...
42921,77942437,92081248,Patient Ambulatory to Triage c/o Suicidal idea...,0,0,0,"[72511001, 72510999, 72504975, 72501649, 72501...","[983702, 830850, 807590, 826567, 936494, 98147...","[F17210, F209, Z7289, Z590, F1490, F1290, F602..."
42922,77942438,92080854,Patient c/o Right Arm and Right Shoulder pain ...,0,0,0,"[72531163, 72503908, 72504592, 72504591, 72503...","[935868, 867787, 834333, 822360, 1492632]","[M79601, M25511, Z8249, J45990, D75A]"
42923,78105470,92081100,Patient c/o Right Inguinal Pain started yester...,0,0,0,"[72681539, 72696052, 72716413]","[823535, 873398, 867227]","[R1031, Z3202, Q8901]"
42924,78141109,92096705,Attending Attestation Statement I personall...,0,0,0,"[72763426, 72754212, 72751575, 73135326, 73100...","[801285, 844563, 374026, 832306, 789107, 85327...","[V284XXA, S032XXA, S02621A, V892XXA, S02609A, ..."


In [102]:
medication = pd.DataFrame(df_meds_labs_merged.medication_id.to_list())
for i in range(len(medication.columns)):
    medication.rename(columns={i: f"med_id #{i + 1}"}, inplace=True)
labs = df_meds_labs_merged.ORD_VALUE.to_list()
for i in range(len(labs)):
    if not isinstance(labs[i], list):
        labs[i] = []
labs = pd.DataFrame(labs)
for i in range(len(labs.columns)):
    labs.rename(columns={i: f"lab #{i + 1}"}, inplace=True)

df_meds_labs_merged = pd.concat([df_meds_labs_merged, medication, labs], axis=1).drop(
    columns=['medication_id', 'RESULT_TIME', 'ORDER_START_TIME', 'COMPONENT_ID', 'ORD_VALUE'])
df_meds_labs_merged

## Reduce this down to the top 100 med, lab, and icds

Unnamed: 0,pat_enc_csn_id,prescribed_meds,med_id #1,med_id #2,med_id #3,med_id #4,med_id #5,med_id #6,med_id #7,med_id #8,...,lab #2458,lab #2459,lab #2460,lab #2461,lab #2462,lab #2463,lab #2464,lab #2465,lab #2466,lab #2467
0,91798662,0,130623,147268.0,15769.0,132397.0,3627.0,110517.0,685.0,14401.0,...,,,,,,,,,,
1,91159504,0,10859,140408.0,61281.0,19066.0,16625.0,121147.0,36959.0,141667.0,...,,,,,,,,,,
2,91640780,0,25576,12165.0,41438.0,40481.0,39924.0,8221.0,141646.0,130622.0,...,,,,,,,,,,
3,92004632,0,119413,10350.0,3894.0,13252.0,40534.0,15360.0,7019.0,52167.0,...,,,,,,,,,,
4,91599714,0,45147,685.0,12697.0,29646.0,63344.0,134.0,141675.0,142156.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21255,86191978,1,38420,9604.0,90505.0,,,,,,...,,,,,,,,,,
21256,86193374,1,142429,,,,,,,,...,,,,,,,,,,
21257,86193422,0,133779,18631.0,,,,,,,...,,,,,,,,,,
21258,85979680,0,10249,,,,,,,,...,,,,,,,,,,


In [103]:
icds = df_notes_icds_merged.ICDDiagnosis_Code_NoDecimal.to_list()
for i in range(len(icds)):
    if not isinstance(icds[i], list):
        icds[i] = []
icds = pd.DataFrame(icds)
for i in range(len(icds.columns)):
    icds.rename(columns={i: f"icd #{i + 1}"}, inplace=True)
df_notes_icds_merged = pd.concat([df_notes_icds_merged, icds], axis=1).drop(
    columns=['encounter_key', 'Encounter_Diagnosis_Key', 'Diagnosis_Key', 'ICDDiagnosis_Code_NoDecimal'])
df_notes_icds_merged


Unnamed: 0,pat_enc_csn_id,note_text,had_Cardiac_Arrhythmia,had_CHF,had_Cardiomyopathy,icd #1,icd #2,icd #3,icd #4,icd #5,...,icd #194,icd #195,icd #196,icd #197,icd #198,icd #199,icd #200,icd #201,icd #202,icd #203
0,76091112,Infection Control is granting visitation to th...,1,0,0,J690,B373,S0990XA,Z978,Z950,...,,,,,,,,,,
1,76493784,Jamie St Clair came to neurovascular clinic on...,0,0,0,,,,,,...,,,,,,,,,,
2,76946216,"February , Cristina Tumacder, MD Eagle...",0,0,0,H900,,,,,...,,,,,,,,,,
3,76968972,CARDIOLOGY CLINIC NOTE Type of note: Return...,0,0,0,,,,,,...,,,,,,,,,,
4,77087831,CARDIOLOGY CLINIC NOTE Type of note: Return...,0,0,0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42921,92081248,Patient Ambulatory to Triage c/o Suicidal idea...,0,0,0,F17210,F209,Z7289,Z590,F1490,...,,,,,,,,,,
42922,92080854,Patient c/o Right Arm and Right Shoulder pain ...,0,0,0,M79601,M25511,Z8249,J45990,D75A,...,,,,,,,,,,
42923,92081100,Patient c/o Right Inguinal Pain started yester...,0,0,0,R1031,Z3202,Q8901,,,...,,,,,,,,,,
42924,92096705,Attending Attestation Statement I personall...,0,0,0,V284XXA,S032XXA,S02621A,V892XXA,S02609A,...,,,,,,,,,,


In [104]:
df = df_notes_icds_merged.merge(df_meds_labs_merged, how='left', on='pat_enc_csn_id')
# Reorder and drop some columns
outcomes = df[['prescribed_meds', 'had_Cardiac_Arrhythmia', 'had_CHF', 'had_Cardiomyopathy']]
df

Unnamed: 0,pat_enc_csn_id,note_text,had_Cardiac_Arrhythmia,had_CHF,had_Cardiomyopathy,icd #1,icd #2,icd #3,icd #4,icd #5,...,lab #2458,lab #2459,lab #2460,lab #2461,lab #2462,lab #2463,lab #2464,lab #2465,lab #2466,lab #2467
0,76091112,Infection Control is granting visitation to th...,1,0,0,J690,B373,S0990XA,Z978,Z950,...,,,,,,,,,,
1,76493784,Jamie St Clair came to neurovascular clinic on...,0,0,0,,,,,,...,,,,,,,,,,
2,76946216,"February , Cristina Tumacder, MD Eagle...",0,0,0,H900,,,,,...,,,,,,,,,,
3,76968972,CARDIOLOGY CLINIC NOTE Type of note: Return...,0,0,0,,,,,,...,,,,,,,,,,
4,77087831,CARDIOLOGY CLINIC NOTE Type of note: Return...,0,0,0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42921,92081248,Patient Ambulatory to Triage c/o Suicidal idea...,0,0,0,F17210,F209,Z7289,Z590,F1490,...,,,,,,,,,,
42922,92080854,Patient c/o Right Arm and Right Shoulder pain ...,0,0,0,M79601,M25511,Z8249,J45990,D75A,...,,,,,,,,,,
42923,92081100,Patient c/o Right Inguinal Pain started yester...,0,0,0,R1031,Z3202,Q8901,,,...,,,,,,,,,,
42924,92096705,Attending Attestation Statement I personall...,0,0,0,V284XXA,S032XXA,S02621A,V892XXA,S02609A,...,,,,,,,,,,


In [105]:
if INPT == "c":
    had_CA, had_CHF, had_CM, not_CA, not_CHF, not_CM = load_note_text(df_notes)
    bi_pos_CA, bi_neg_CA, tri_pos_CA, tri_neg_CA = get_ngrams(
        had_CA, not_CA, num_ngrams=num_ngrams)
    bi_pos_CHF, bi_neg_CHF, tri_pos_CHF, tri_neg_CHF = get_ngrams(
        had_CHF, not_CHF, num_ngrams=num_ngrams)
    bi_pos_CM, bi_neg_CM, tri_pos_CM, tri_neg_CM = get_ngrams(
        had_CM, not_CM, num_ngrams=num_ngrams)
    combined_pos_CA = bi_pos_CA + tri_pos_CA
    combined_neg_CA = bi_neg_CA + tri_neg_CA
    combined_pos_CHF = bi_pos_CHF + tri_pos_CHF
    combined_neg_CHF = bi_neg_CHF + tri_neg_CHF
    combined_pos_CM = bi_pos_CM + tri_pos_CM
    combined_neg_CM = bi_neg_CM + tri_neg_CM
else:
    correct_diagnosis, misdiagnosis = load_note_text(df_notes)
    bi_pos, bi_neg, tri_pos, tri_neg = get_ngrams(
        correct_diagnosis, misdiagnosis, num_ngrams=num_ngrams)
    combined_pos = bi_pos + tri_pos
    combined_neg = bi_neg + tri_neg


removing duplications. Output file = C:/Users/hocke/OneDrive/Desktop/EricZacharia/03-Education/02-GraduateSchool/01-UChicago/01-Quarters/05-Summer2021/UChicagoMed/NLP/N-Grams/FamishedNotes/text_pos_famished.docx
removing duplications. Output file = C:/Users/hocke/OneDrive/Desktop/EricZacharia/03-Education/02-GraduateSchool/01-UChicago/01-Quarters/05-Summer2021/UChicagoMed/NLP/N-Grams/FamishedNotes/text_neg_famished.docx
Top 3 bigrams for correctly diagnosed notes: 
                bigram        score  count
0       (NG/OG, Tube)  4891.726935   6397
1      (Recent, Labs)  4482.408491   5656
2  (sodium, chloride)  3645.800716   5791

Top 3 bigrams for misdiagnosed notes: 
                  bigram        score  count
0  (Smokeless, tobacco)  8103.454801   9430
1      (Lab, Component)  8017.949613  19485
2          (Ref, Range)  7737.252470   8547

Top 3 trigrams for correctly diagnosed notes: 
                                            trigram  score  count
0                          (-f

In [None]:
from sklearn.model_selection import train_test_split

test_size = 0.3
seed = 42
# X = df[(['note_text'])]
X = df.drop(columns=list(outcomes.columns.values))
if INPT == "c":
    y_CA = df[(['had_Cardiac_Arrhythmia'])]
    y_CHF = df[(['had_CHF'])]
    y_CM = df[(['had_Cardiomyopathy'])]
    X_train_CA, X_test_CA, y_train_CA, y_test_CA = train_test_split(X, y_CA, test_size=test_size, random_state=seed)
    X_train_CHF, X_test_CHF, y_train_CHF, y_test_CHF = train_test_split(X, y_CHF, test_size=test_size, random_state=seed)
    X_train_CM, X_test_CM, y_train_CM, y_test_CM = train_test_split(X, y_CM, test_size=test_size, random_state=seed)
else:
    y = df[(['outcome'])]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)


KeyError: "None of [Index(['outcome'], dtype='object')] are in the [columns]"

In [None]:
from sklearn.metrics import plot_confusion_matrix
import xgboost as xgb

def ngram_xgb(pos, neg, X_train, X_test, y_train, y_test, incorrect_label, corret_label):
    global seed
    columns = [ngram for ngram in pos + neg]
    # create dataframe of zeros with note text as rows and ngrams as columns
    df_train = pd.DataFrame(np.zeros((len(X_train), len(columns))),
                            index=X_train["note_text"], columns=columns)
    df_test = pd.DataFrame(np.zeros((len(X_test), len(columns))),
                        index=X_test["note_text"], columns=columns)

    # if the n-gram occurs in the note, then identify it with a "1"
    for ngram in pos + neg:
        for encounter_note in X_train['note_text']:
            if " ".join(ngram) in encounter_note:
                df_train.loc[encounter_note][ngram] = 1
        for encounter_note in X_test['note_text']:
            if " ".join(ngram) in encounter_note:
                df_test.loc[encounter_note][ngram] = 1

    print(df_train)
    X_train = df_train.to_numpy()
    X_test = df_test.to_numpy()

    # create a bunch of decisiton trees and apply early stopping
    model = xgb.XGBClassifier(objective='binary:logistic',
                            seed=seed, use_label_encoder=False)
    model.fit(X_train, y_train, verbose=True, early_stopping_rounds=10,
            eval_metric='aucpr', eval_set=[(X_test, y_test)])
    plot_confusion_matrix(model, X_test, y_test, display_labels=[incorrect_label, corret_label])

In [None]:
# Using bigrams only
if INPT == "c":
    ngram_xgb(bi_pos_CA, bi_neg_CA, X_train_CA, X_test_CA, y_train_CA, y_test_CA, "Not CA", "Had CA")
    ngram_xgb(bi_pos_CHF, bi_neg_CHF, X_train_CHF, X_test_CHF, y_train_CHF, y_test_CHF, "Not CHF", "Had CHF")
    ngram_xgb(bi_pos_CM, bi_neg_CM, X_train_CM, X_test_CM, y_train_CM, y_test_CM, "Not CM", "Had CM")
else:
    ngram_xgb(bi_pos, bi_neg, X_train, X_test, y_train, y_test, "misdiagnosed", "correct")


In [None]:
# Using trigrams only
if INPT == "c":
    ngram_xgb(tri_pos_CA, tri_neg_CA, X_train_CA, X_test_CA, y_train_CA, y_test_CA, "Not CA", "Had CA")
    ngram_xgb(tri_pos_CHF, tri_neg_CHF, X_train_CHF, X_test_CHF, y_train_CHF, y_test_CHF, "Not CHF", "Had CHF")
    ngram_xgb(tri_pos_CM, tri_neg_CM, X_train_CM, X_test_CM, y_train_CM, y_test_CM, "Not CM", "Had CM")
else:
    ngram_xgb(tri_pos, tri_neg, X_train, X_test, y_train, y_test, "misdiagnosed", "correct")


In [None]:
# Using a combination of bigrams and trigrams
if INPT == "c":
    ngram_xgb(combined_pos_CA, combined_neg_CA, X_train_CA, X_test_CA, y_train_CA, y_test_CA, "Not CA", "Had CA")
    ngram_xgb(combined_pos_CHF, combined_neg_CHF, X_train_CHF, X_test_CHF, y_train_CHF, y_test_CHF, "Not CHF", "Had CHF")
    ngram_xgb(combined_pos_CM, combined_neg_CM, X_train_CM, X_test_CM, y_train_CM, y_test_CM, "Not CM", "Had CM")
else:
    ngram_xgb(combined_pos, combined_neg, X_train, X_test, y_train, y_test, "misdiagnosed", "correct")
