In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import string
import itertools
from collections import Counter
from gensim.models import Word2Vec, KeyedVectors
from Levenshtein import distance as lev
from ast import literal_eval
import pickle
from sklearn.model_selection import KFold
import fasttext


In [2]:
noteEvents=pd.read_csv("mimic-iii-clinical-database-1.4/NOTEEVENTS.csv")
diagnosis = pd.read_csv("mimic-iii-clinical-database-1.4/DIAGNOSES_ICD.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
categories=set(noteEvents["CATEGORY"].tolist())
categories

{'Case Management ',
 'Consult',
 'Discharge summary',
 'ECG',
 'Echo',
 'General',
 'Nursing',
 'Nursing/other',
 'Nutrition',
 'Pharmacy',
 'Physician ',
 'Radiology',
 'Rehab Services',
 'Respiratory ',
 'Social Work'}

# Preprocessing

In [4]:
noteEvents=noteEvents[["HADM_ID","TEXT","DESCRIPTION","CATEGORY"]]
diagnosis = diagnosis[["HADM_ID","SUBJECT_ID","ICD9_CODE"]]

def exclude_procedure_code(x):
    if str(x).startswith("0"):
        return False
    return True

#Group Diagnosis into 1 list per admission
diagnosis=diagnosis[diagnosis["ICD9_CODE"].apply(lambda x: exclude_procedure_code(x))]

In [5]:
def convert_icd9(x):
    converted_code = []
    for code in x:
        icd9_str = str(code)
        if icd9_str.startswith("E"):
            converted = icd9_str[:4]
        else:
            converted = icd9_str[:3]
        converted_code.append(converted)
    return converted_code

def checkdiagnosis(x):
    for code in x:
        if str(code)[0:3]=="250":
            return True
    return False

def exclude_procedure_code(x):
    included_code = []
    for code in x:
        icd9_str = str(code)
        if icd9_str.startswith("0"):
            pass
        else:
            included_code.append(code)
    return included_code

In [6]:
diagnosisGrouped = diagnosis.groupby(["SUBJECT_ID","HADM_ID"])["ICD9_CODE"].apply(list).reset_index()
diagnosisGrouped.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,2,163353,"[V3001, V053, V290]"
1,3,145834,"[78559, 5849, 4275, 41071, 4280, 6826, 4254, 2..."
2,4,185777,"[1363, 7994, 2763, 7907, 5715, V090, E9317]"
3,5,178980,"[V3000, V053, V290]"
4,6,107064,"[40391, 4440, 9972, 2766, 2767, 2859, 2753, V1..."


In [7]:
diagnosisGrouped["ICD9_CODE"] = diagnosisGrouped["ICD9_CODE"].apply(lambda x: exclude_procedure_code(x))
admissionswithDiabetes = diagnosisGrouped[diagnosisGrouped["ICD9_CODE"].apply(lambda x: checkdiagnosis(x))]
admissionswithDiabetes["ICD9_CODE_primary"] = admissionswithDiabetes["ICD9_CODE"].apply(lambda x: convert_icd9(x))
admissionswithDiabetes.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  admissionswithDiabetes["ICD9_CODE_primary"] = admissionswithDiabetes["ICD9_CODE"].apply(lambda x: convert_icd9(x))


Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE,ICD9_CODE_primary
11,13,143045,"[41401, 4111, 25000, 4019, 2720]","[414, 411, 250, 401, 272]"
15,18,188822,"[25080, 78039, 29633, V5867, E9323, V5869, 478...","[250, 780, 296, V58, E932, V58, 478, 780, 783,..."
17,20,157681,"[41401, 4111, 25000, 2724, 4019]","[414, 411, 250, 272, 401]"
18,21,109451,"[41071, 78551, 5781, 5849, 40391, 4280, 4592, ...","[410, 785, 578, 584, 403, 428, 459, 507, 427, ..."
19,21,111970,"[78552, 40391, 42731, 70709, 5119, 6823, 99859...","[785, 403, 427, 707, 511, 682, 998, 572, 995, ..."


In [8]:
combineddf=noteEvents.merge(admissionswithDiabetes, on="HADM_ID")
print(combineddf.shape)
combineddf.head()

(406203, 7)


Unnamed: 0,HADM_ID,TEXT,DESCRIPTION,CATEGORY,SUBJECT_ID,ICD9_CODE,ICD9_CODE_primary
0,121936.0,Admission Date: [**2125-2-9**] D...,Report,Discharge summary,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."
1,121936.0,PATIENT/TEST INFORMATION:\nIndication: Aortic ...,Report,Echo,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."
2,121936.0,PATIENT/TEST INFORMATION:\nIndication: Aortic...,Report,Echo,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."
3,121936.0,Sinus rhythm. Frequent atrial premature beats...,Report,ECG,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."
4,121936.0,Rhythm is most likely sinus rhythm with freque...,Report,ECG,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."


In [9]:
processeddf=combineddf.copy()
spacestring=" "*len(string.punctuation.replace("'",""))
dstring="d"*len(string.digits)
processeddf["TEXT"]=processeddf["TEXT"].apply(lambda x: x.translate(str.maketrans(string.punctuation.replace("'",""),spacestring)))
processeddf["TEXT"] = processeddf["TEXT"].apply(lambda x: x.translate(str.maketrans(string.digits,dstring)))
processeddf["TEXT"] = processeddf["TEXT"].apply(lambda x: x.lower())
processeddf["TEXT"] = processeddf["TEXT"].apply(lambda x: x.split())

In [10]:
wordslist=processeddf["TEXT"].tolist()
flatlist=list(itertools.chain(*wordslist))
wordset=set(flatlist)
wordDict=Counter(flatlist)
wordDict5orMore= dict(filter(lambda x: x[1] >=5, wordDict.items())) 

wordDictLessThan5 = dict(filter(lambda x: x[1] <5, wordDict.items()))

In [11]:
print(len(wordDict5orMore)) #53229
print(len(wordDictLessThan5)) #109486

53229
109486


In [12]:
###GENERATE Dictionary for mapping mispelled words
# mapped_dict = dict()

# for w in range(len(wordListLessThan5)):
#     lev_dist = []
#     for i in range(len(wordList5orMore)):
#         lev_dist.append(lev(wordListLessThan5[w], wordList5orMore[i]))
#     print(np.argmin(lev_dist))
#     mapped_dict[wordListLessThan5[w]] = wordList5orMore[np.argmin(lev_dist)]

# print(len(mapped_dict))

# with open('mapDictionary.pkl', 'wb') as f:
#     pickle.dump(mapped_dict, f)

with open('mapDictionary.pkl', 'rb') as f:
    mapped_dict = pickle.load(f)

In [13]:
limited_processeddf = processeddf[processeddf['TEXT'].apply(lambda x: len(x)) > 9]
limited_processeddf = limited_processeddf[limited_processeddf['TEXT'].apply(lambda x: len(x)) < 2200]
print(limited_processeddf.shape)
limited_processeddf.head()


(399623, 7)


Unnamed: 0,HADM_ID,TEXT,DESCRIPTION,CATEGORY,SUBJECT_ID,ICD9_CODE,ICD9_CODE_primary
1,121936.0,"[patient, test, information, indication, aorti...",Report,Echo,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."
2,121936.0,"[patient, test, information, indication, aorti...",Report,Echo,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."
3,121936.0,"[sinus, rhythm, frequent, atrial, premature, b...",Report,ECG,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."
4,121936.0,"[rhythm, is, most, likely, sinus, rhythm, with...",Report,ECG,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."
5,121936.0,"[atrial, fibrillation, intraventricular, condu...",Report,ECG,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ..."


In [14]:
def map_misspelled(x):
    new_l = [mapped_dict.get(item, item) for item in x]
    return new_l

limited_processeddf["TEXT"] = limited_processeddf["TEXT"].apply(lambda x: map_misspelled(x))

In [15]:
limited_processeddf["ICD9_SET"]= limited_processeddf["ICD9_CODE"].apply(lambda x: set(x))
limited_processeddf["ICD9_primary_SET"]= limited_processeddf["ICD9_CODE_primary"].apply(lambda x: set(x))

In [16]:
#Binary Relevance
top_10codes={"427","518","428","584","401","276","414","285","272","585"}
def TopCodes(x):
    toplist=[]
    for code in x:
        if str(code).startswith(tuple(top_10codes)):
            toplist.append(code)
    return toplist

limited_processeddf["Top_ICD9_Codes"]=limited_processeddf["ICD9_primary_SET"].apply(lambda x: TopCodes(x))

In [17]:
#Save to CSV
limited_processeddf.to_csv("Tokenized.csv")

## Preprocessed Data Statistics

In [18]:
icd9_list=limited_processeddf["ICD9_CODE_primary"].tolist()
icd9_flatlist=list(itertools.chain(*icd9_list))
print("Num. icd9 primary code",len(icd9_flatlist))
icd9_wordset=set(icd9_flatlist)
icd9_wordDict=Counter(icd9_flatlist)
print("Num. unique icd9 primary code",len(icd9_wordDict))
icd9_re_list=limited_processeddf["ICD9_CODE"].tolist()
icd9_re_flatlist=list(itertools.chain(*icd9_re_list))
print("Num. icd9 regular code",len(icd9_re_flatlist))
icd9_re_wordset=set(icd9_re_flatlist)
icd9_re_wordDict=Counter(icd9_re_flatlist)
print("Num. unique icd9 regular code",len(icd9_re_wordDict))

Num. icd9 primary code 6703628
Num. unique icd9 primary code 808
Num. icd9 regular code 6703628
Num. unique icd9 regular code 3965


In [19]:
limited_processeddf["ICD9_SET"]= limited_processeddf["ICD9_CODE"].apply(lambda x: set(x))
limited_processeddf["ICD9_primary_SET"]= limited_processeddf["ICD9_CODE_primary"].apply(lambda x: set(x))
limited_processeddf["NumberofICD9Codes"] = limited_processeddf["ICD9_SET"].apply(lambda x: len(x))
limited_processeddf["NumberofICD9CodesPri"] = limited_processeddf["ICD9_CODE_primary"].apply(lambda x: len(set(x)))
CodesperReport = limited_processeddf["NumberofICD9Codes"].tolist()
avgcodesperReport = sum(CodesperReport) / len(CodesperReport)
PriCodesperReportlist = limited_processeddf["NumberofICD9CodesPri"].tolist()
avgPriCodesperReport = sum(PriCodesperReportlist) / len(PriCodesperReportlist)
print("avgCodesperReport",avgcodesperReport)
print("avgPriCodesperReport",avgPriCodesperReport)

avgCodesperReport 16.773669183205172
avgPriCodesperReport 15.200246232073729


In [20]:
limited_processeddf["NumberofTokens"] = limited_processeddf["TEXT"].apply(lambda x: len(x))
NumberofTokensList=limited_processeddf["NumberofTokens"].tolist()
avgTokensperReport = sum(NumberofTokensList) / len(NumberofTokensList)
print("avgTokensperReport",avgTokensperReport)

avgTokensperReport 309.05638814582744


In [21]:
print("avgDensityperReport",avgcodesperReport/len(icd9_wordDict))
print("avgPriDensityperReport",avgPriCodesperReport/len(icd9_re_wordDict))

avgDensityperReport 0.02075949156337274
avgPriDensityperReport 0.0038336056070803854


In [22]:
limited_processeddf.head(5)

Unnamed: 0,HADM_ID,TEXT,DESCRIPTION,CATEGORY,SUBJECT_ID,ICD9_CODE,ICD9_CODE_primary,ICD9_SET,ICD9_primary_SET,Top_ICD9_Codes,NumberofICD9Codes,NumberofICD9CodesPri,NumberofTokens
1,121936.0,"[patient, test, information, indication, aorti...",Report,Echo,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ...","{42731, 44020, 5997, 5849, 41071, E8796, 2724,...","{272, 599, 427, 410, 414, 584, V45, E879, 440,...","[272, 427, 414, 584, 401, 428]",16,13,428
2,121936.0,"[patient, test, information, indication, aorti...",Report,Echo,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ...","{42731, 44020, 5997, 5849, 41071, E8796, 2724,...","{272, 599, 427, 410, 414, 584, V45, E879, 440,...","[272, 427, 414, 584, 401, 428]",16,13,306
3,121936.0,"[sinus, rhythm, frequent, atrial, premature, b...",Report,ECG,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ...","{42731, 44020, 5997, 5849, 41071, E8796, 2724,...","{272, 599, 427, 410, 414, 584, V45, E879, 440,...","[272, 427, 414, 584, 401, 428]",16,13,18
4,121936.0,"[rhythm, is, most, likely, sinus, rhythm, with...",Report,ECG,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ...","{42731, 44020, 5997, 5849, 41071, E8796, 2724,...","{272, 599, 427, 410, 414, 584, V45, E879, 440,...","[272, 427, 414, 584, 401, 428]",16,13,66
5,121936.0,"[atrial, fibrillation, intraventricular, condu...",Report,ECG,28063,"[42843, 41071, 5990, 4275, 5849, 5070, 4280, 2...","[428, 410, 599, 427, 584, 507, 428, 272, 401, ...","{42731, 44020, 5997, 5849, 41071, E8796, 2724,...","{272, 599, 427, 410, 414, 584, V45, E879, 440,...","[272, 427, 414, 584, 401, 428]",16,13,53
