In [61]:
import pandas as pd
import numpy as np

In [62]:
# Load the raw clinical notes table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (the recorded
# output of this cell looks like the tail of such a dtype warning). This trades
# a higher peak memory during parsing for consistent dtypes.
notes = pd.read_csv('./data/NOTEEVENTS.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [63]:
# (rows, columns) of the raw notes table.
notes.shape

(2083180, 11)

In [64]:
# Load the per-admission diagnosis codes.
# ICD-9 codes are identifiers, not numbers: force them to strings so codes with
# leading zeros (e.g. '00845') can never be parsed as integers and truncated.
icd = pd.read_csv('./data/DIAGNOSES_ICD.csv', dtype={'ICD9_CODE': str})

In [65]:
# (rows, columns) of the diagnosis table.
icd.shape

(651047, 5)

In [66]:
# Preview the first rows of the notes table to see which columns are available.
notes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [67]:
# Preview the diagnosis table; SUBJECT_ID and HADM_ID are the keys shared with notes.
icd.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254


In [68]:
# Attach every diagnosis row to the notes of the same patient admission.
# Both frames use identical key names, so one `on=` list describes the join;
# the default join type is kept.
notes = pd.merge(notes, icd, on=['SUBJECT_ID', 'HADM_ID'])

In [69]:
# Size after the merge: a note appears once per diagnosis row that matched it.
notes.shape

(25736048, 14)

In [70]:
# From here on only the note text and its diagnosis code are used.
notes = notes.loc[:, ['TEXT', 'ICD9_CODE']]

In [71]:
# Display the trimmed (TEXT, ICD9_CODE) frame; pandas abbreviates long frames.
notes

Unnamed: 0,TEXT,ICD9_CODE
0,Admission Date: [**2151-7-16**] Dischar...,01193
1,Admission Date: [**2151-7-16**] Dischar...,4254
2,Admission Date: [**2151-7-16**] Dischar...,42731
3,Admission Date: [**2151-7-16**] Dischar...,2639
4,Admission Date: [**2151-7-16**] Dischar...,2762
...,...,...
25736043,Neonatology\nBaby Girl [**Known lastname 1672*...,V053
25736044,Neonatology\nBaby Girl [**Known lastname 1672*...,V290
25736045,NPN NICU\nTerm female admitted to NICU for sep...,V3001
25736046,NPN NICU\nTerm female admitted to NICU for sep...,V053


# Create File for Labels

In [72]:
# Distinct diagnosis codes present in the merged data, in order of first appearance.
labels = pd.unique(notes['ICD9_CODE'])

In [73]:
# Number of distinct codes collected above.
labels.shape

(6968,)

In [74]:
# Write one code per line, with neither a header row nor an index column.
# NOTE(review): this file lists every distinct code, while later cells reduce
# the label space to the 500 most frequent codes plus "OTHER" — confirm which
# label list the downstream consumer expects.
label_frame = pd.DataFrame(labels)
label_frame.to_csv('./labels.txt', header=False, index=False)

# Create Multi-Label Targets

In [75]:
# Quick look at the (TEXT, ICD9_CODE) pairs before the codes are encoded.
notes.head()

Unnamed: 0,TEXT,ICD9_CODE
0,Admission Date: [**2151-7-16**] Dischar...,1193
1,Admission Date: [**2151-7-16**] Dischar...,4254
2,Admission Date: [**2151-7-16**] Dischar...,42731
3,Admission Date: [**2151-7-16**] Dischar...,2639
4,Admission Date: [**2151-7-16**] Dischar...,2762


In [76]:
# Cast both columns to plain strings for the label encoder.
# Rows without a diagnosis code are dropped first: astype(str) would otherwise
# turn a missing value into the literal string 'nan', which the following cells
# would count and encode as if it were a real diagnosis code.
notes = notes.dropna(subset=['ICD9_CODE']).astype(str)

In [77]:
# Count how often each code occurs, then keep the names of the 500 most
# frequent ones as a set for fast membership tests.
code_counts = notes["ICD9_CODE"].value_counts()
top_codes = set(code_counts.nlargest(n=500).index)

In [78]:
# Fraction of all (note, code) rows covered by the 500 most frequent codes.
notes["ICD9_CODE"].value_counts().nlargest(n=500).sum() / len(notes)

0.8025725239555039

In [79]:
# Every code outside the top-500 set is folded into a single "OTHER" bucket.
is_rare_code = ~notes["ICD9_CODE"].isin(top_codes)
notes.loc[is_rare_code, 'ICD9_CODE'] = "OTHER"

In [81]:
from sklearn.preprocessing import MultiLabelBinarizer

In [82]:
# Encoder that maps a list of codes to a row of 0/1 indicators;
# sparse_output=True makes transform() return a SciPy sparse matrix.
mlb = MultiLabelBinarizer(sparse_output=True)

In [83]:
# Fit on one "sample" that contains every remaining code (including "OTHER"),
# which is enough for the encoder to learn the full class list.
mlb.fit([notes['ICD9_CODE'].unique().tolist()])

MultiLabelBinarizer(classes=None, sparse_output=True)

In [84]:
# Classes learned by fit(); the indicator columns follow this order.
mlb.classes_

array(['00845', '0380', '03811', '03819', '03842', '03843', '03849',
       '0388', '0389', '04104', '04111', '04112', '04119', '0413', '0414',
       '0417', '04185', '042', '07044', '07054', '07070', '1120', '1122',
       '1123', '1125', '1173', '1179', '1623', '1628', '1970', '1977',
       '1983', '1985', '19889', '20280', '20500', '2449', '25000',
       '25001', '25002', '25040', '25050', '25060', '25061', '25080',
       '2536', '2554', '25541', '261', '2639', '2720', '2724', '2749',
       '2753', '27541', '27542', '2760', '2761', '2762', '2763', '2764',
       '2765', '27650', '27651', '27652', '2766', '2767', '2768', '27800',
       '27801', '2800', '2809', '2841', '2848', '2851', '28521', '28522',
       '28529', '2859', '2866', '2867', '2869', '2874', '2875', '28800',
       '28860', '2910', '29181', '29281', '2930', '29410', '2948',
       '29680', '30000', '3004', '30301', '30390', '30391', '30393',
       '30401', '30500', '30501', '3051', '30560', '311', '3229', '32723

# Group by text to create lists of labels

In [85]:
# Collapse the one-row-per-code layout into one row per distinct note text,
# gathering all of that text's codes (repeats included) into a Python list.
codes_by_text = notes.groupby('TEXT')['ICD9_CODE'].apply(list)
notes = codes_by_text.reset_index()

In [86]:
# Encode each note's list of codes as one row of 0/1 indicators.
# NOTE(review): the name `test` is reused further down for the test split.
test = mlb.transform(notes['ICD9_CODE'])

In [87]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
test

<1801952x501 sparse matrix of type '<class 'numpy.int32'>'
	with 21546805 stored elements in Compressed Sparse Row format>

In [88]:
# Materialise the indicator matrix as a DataFrame (one column per class).
# - toarray() returns a regular ndarray; todense() returns the legacy np.matrix.
# - The indicators are only 0/1, so int8 holds them losslessly and needs a
#   quarter of the memory of the int32 matrix produced by the encoder.
temp = pd.DataFrame(test.astype(np.int8).toarray())

In [89]:
# Place the indicator columns next to each note; rows are matched on the
# index, which both frames share.
notes = pd.concat([notes, temp], axis='columns')

In [90]:
# Check the text, its code list and the indicator columns side by side.
notes.head()

Unnamed: 0,TEXT,ICD9_CODE,0,1,2,3,4,5,6,7,...,491,492,493,494,495,496,497,498,499,500
0,\n\n\n,"[78550, 70703, 41011, OTHER, OTHER, OTHER, 428...",1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,\n\n\n\n,"[49322, 51881, 8670, OTHER, 4589, OTHER, 7850,...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,\n\n\n\n\n,"[2762, 78552, 4280, 29181, 99592, 0389, 486, 5...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,\n\n\n\n\n\n\n,"[51881, 4821, 49322, 2767, 5849, 42731, OTHER,...",0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,\n\n\n\n\n\n\n\n,"[OTHER, 51881, 43411, OTHER, 2848, 5849, 78559...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
# Drop the reference to the dense indicator frame; its columns were already
# concatenated onto `notes`.
del temp

In [92]:
# Drop the reference to the sparse indicator matrix, which is no longer needed.
del test

In [93]:
# Seed the 'labels' column with a one-element list per row, taken from the
# column at position 2 (the first indicator column after TEXT and ICD9_CODE).
# A later cell overwrites 'labels' with the full indicator vector.
notes['labels'] = notes.iloc[:, [2]].values.tolist()

In [95]:
# Pack all indicator columns into one list-valued 'labels' column.
# The indicator columns are selected by name (everything that is not TEXT,
# ICD9_CODE or labels) rather than by the positional slice [2:-1]: that slice
# is only correct when a 'labels' column already sits at the end of the frame,
# and would otherwise silently drop the last indicator column.
indicator_cols = [
    col for col in notes.columns
    if col not in ('TEXT', 'ICD9_CODE', 'labels')
]
notes['labels'] = notes[indicator_cols].values.tolist()

In [None]:
# notes['labels'] = notes.apply(lambda x: x['labels'] + [x[col]], 1)

In [None]:
# for col in notes.columns[3:-1]:
#     notes['labels'] = notes.apply(lambda x: x['labels'] + [x[col]], 1)
#     notes.drop(col, axis=1, inplace=True)

# Drop All Columns That Are Unused

In [96]:
# Keep just the note text and its packed label vector.
notes = notes.loc[:, ['TEXT', 'labels']]

In [97]:
# Lower-case the text column name for the exported files.
notes = notes.rename(columns={'TEXT': 'text'})

# Create Train, Dev, & Test Files

In [99]:
# Shuffle once with a fixed seed so the 80/10/10 split is reproducible across
# kernel restarts, then cut the shuffled frame by position. Plain iloc slices
# give the same boundaries as the previous np.split call without relying on
# NumPy splitting a DataFrame.
SPLIT_SEED = 42
shuffled_notes = notes.sample(frac=1, random_state=SPLIT_SEED)

train_end = int(.8 * len(notes))      # first 80% -> train
validate_end = int(0.9 * len(notes))  # next 10% -> validate, remainder -> test

train = shuffled_notes.iloc[:train_end]
validate = shuffled_notes.iloc[train_end:validate_end]
test = shuffled_notes.iloc[validate_end:]

In [100]:
# Size of the training split.
train.shape

(1441561, 2)

In [101]:
# Size of the validation split.
validate.shape

(180195, 2)

In [102]:
# Size of the test split.
test.shape

(180196, 2)

In [103]:
# Export the training split as a tab-separated file without the index column.
train.to_csv('./train.tsv', index=False, sep='\t')

In [104]:
# Export the validation split as a tab-separated file without the index column.
validate.to_csv('./validate.tsv', index=False, sep='\t')

In [105]:
# Export the test split as a tab-separated file without the index column.
test.to_csv('./test.tsv', index=False, sep='\t')