In [1]:
# import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [2]:
cohort = pd.read_csv('cohort.csv', index_col=0)
ndc = pd.read_csv('ndcxls/product.csv')

In [3]:
cohort.head()
# cohort.shape

Unnamed: 0,subject_id,target
0,109,1.0
1,188,1.0
2,222,1.0
3,236,1.0
4,305,1.0


In [4]:
all_notes = pd.read_csv('s3://athena-output-mimic/noteevents_all/2022/04/10/8a123711-2ce8-4927-93fd-5b1c01e79841.csv')
all_scripts = pd.read_csv('s3://athena-output-mimic/prescriptions_all/2022/04/10/76afdfc4-62a9-4ecc-936e-21e55b3119ca.csv')

desc = pd.read_csv('s3://athena-output-mimic/diagnosis_descriptions/2022/04/10/539668ed-8120-4085-b072-7ea0168a4ae6.csv')
diag = pd.read_csv('s3://athena-output-mimic/diagnoses/2022/04/10/7e4c60d3-184c-4ab4-a21a-3bfb78922ae5.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
all_notes.head()

Unnamed: 0,subject_id,_col1
0,24291,Nursing NICU note Addendum.\nMother in this ev...
1,24291,NPN 1900-0700\n\n\n1. FEN: TF=130cc/k/day B...
2,24416,Admission Note\nOb- [**Month (only) 910**]\nPe...
3,24416,Nursing Triage Note\nInfant is a full term mal...
4,24131,[**Location (un) 369**]/NEON DOL 5 CGA 34 [**5...


In [6]:
all_notes['_col1'] = all_notes['_col1'].astype(str)

In [7]:
notes_grpd = all_notes.groupby('subject_id')['_col1'].apply(lambda x: ' '.join(x))

In [8]:
# notes_grpd

In [9]:
all_scripts.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,startdate,enddate,drug_type,drug,drug_name_poe,drug_name_generic,formulary_drug_cd,gsn,ndc,prod_strength,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,route
0,2214776,6,107064,,2175-06-11 00:00:00.000,2175-06-12 00:00:00.000,MAIN,Tacrolimus,Tacrolimus,Tacrolimus,TACR1,21796.0,469061711.0,1mg Capsule,2,mg,2,CAP,PO
1,2214775,6,107064,,2175-06-11 00:00:00.000,2175-06-12 00:00:00.000,MAIN,Warfarin,Warfarin,Warfarin,WARF5,6562.0,56017275.0,5mg Tablet,5,mg,1,TAB,PO
2,2215524,6,107064,,2175-06-11 00:00:00.000,2175-06-12 00:00:00.000,MAIN,Heparin Sodium,,,HEPAPREMIX,6522.0,338055002.0,"25,000 unit Premix Bag",25000,UNIT,1,BAG,IV
3,2216265,6,107064,,2175-06-11 00:00:00.000,2175-06-12 00:00:00.000,BASE,D5W,,,HEPBASE,,0.0,HEPARIN BASE,250,ml,250,ml,IV
4,2214773,6,107064,,2175-06-11 00:00:00.000,2175-06-12 00:00:00.000,MAIN,Furosemide,Furosemide,Furosemide,FURO20,8208.0,54829725.0,20mg Tablet,20,mg,1,TAB,PO


In [10]:
all_scripts['drug_name_generic'] = all_scripts['drug_name_generic'].astype(str)

In [11]:
scripts_grpd = all_scripts.groupby('subject_id')['drug_name_generic'].apply(lambda x: ' '.join(x))

In [12]:
# diag.head()

In [13]:
# desc.head()

In [14]:
icd_map = pd.Series(desc['short_title'].values, index=desc['icd9_code']).to_dict()
diag['icd9_code'] = diag['icd9_code'].map(icd_map)
diag.head()

Unnamed: 0,row_id,subject_id,hadm_id,seq_num,icd9_code
0,1297,109,172335,1.0,Mal hyp kid w cr kid V
1,1298,109,172335,2.0,"Pneumonia, organism NOS"
2,1299,109,172335,3.0,Chr nephritis in oth dis
3,1300,109,172335,4.0,Chron kidney dis stage V
4,1301,109,172335,5.0,Prim cardiomyopathy NEC


In [15]:
diag['icd9_code'] = diag['icd9_code'].astype(str)
diag_grpd = diag.groupby('subject_id')['icd9_code'].apply(lambda x: ' '.join(x))

In [16]:
diag_grpd

subject_id
2        Single lb in-hosp w cs Need prphyl vc vrl hepa...
3        Septicemia NOS Shock w/o trauma NEC Acute kidn...
4        Human immuno virus dis Pneumocystosis Cachexia...
5        Single lb in-hosp w/o cs Need prphyl vc vrl he...
6        Hyp kid NOS w cr kid V nan Surg comp-peri vasc...
                               ...                        
99985    Septicemia NOS Acute respiratry failure Meth s...
99991    Dvrtcli colon w/o hmrhg Septicemia NOS Ac vasc...
99992    Complic med care NEC/NOS Hemoperitoneum Pancre...
99995    Abdom aortic aneurysm Ac on chr diast hrt fail...
99999    Spondylolisthesis Stridor Hypertension NOS DMI...
Name: icd9_code, Length: 46520, dtype: object

In [17]:
cohort.head()

Unnamed: 0,subject_id,target
0,109,1.0
1,188,1.0
2,222,1.0
3,236,1.0
4,305,1.0


In [18]:
cohort = cohort.merge(notes_grpd, on='subject_id', how='left')
cohort = cohort.merge(scripts_grpd, on='subject_id', how='left')
cohort = cohort.merge(diag_grpd, on='subject_id', how='left')

In [19]:
cohort['features'] = cohort['_col1'] + cohort['drug_name_generic'] + cohort['icd9_code']

In [20]:
cohort.head()

Unnamed: 0,subject_id,target,_col1,drug_name_generic,icd9_code,features
0,109,1.0,Chief Complaint: Hypertensive urgency\n I sa...,HydrALAzine Labetalol NiCARdipine Valsartan Pr...,"Mal hyp kid w cr kid V Pneumonia, organism NOS...",Chief Complaint: Hypertensive urgency\n I sa...
1,188,1.0,[**2158-8-11**] 10:43 AM\n CT ABD W&W/O C; CT ...,nan nan nan Fluconazole nan nan nan Fentanyl C...,Compl liver transplant Acute respiratry failur...,[**2158-8-11**] 10:43 AM\n CT ABD W&W/O C; CT ...
2,222,1.0,Admission Date: [**2137-7-15**] Discharge...,Heparin Flush (10 Units/mL) Metoprolol Tartrat...,Mal neo lower lobe lung Acute & chronc resp fa...,Admission Date: [**2137-7-15**] Discharge...
3,236,1.0,[**2135-6-22**] 9:00 AM\n LIVER OR GALLBLADDER...,nan nan Heparin Sodium Metoprolol nan nan Insu...,Acute necrosis of liver Malignant neo liver NO...,[**2135-6-22**] 9:00 AM\n LIVER OR GALLBLADDER...
4,305,1.0,[**2127-6-19**] 3:44 PM\n CT ABDOMEN W/CONTRAS...,Insulin Human 70/30 Insulin - Sliding Scale In...,"Subendo infarct, initial Mitral valve disorder...",[**2127-6-19**] 3:44 PM\n CT ABDOMEN W/CONTRAS...


In [21]:
cohort_feat = cohort[['subject_id', 'features', 'target']]

In [22]:
cohort_feat

Unnamed: 0,subject_id,features,target
0,109,Chief Complaint: Hypertensive urgency\n I sa...,1.0
1,188,[**2158-8-11**] 10:43 AM\n CT ABD W&W/O C; CT ...,1.0
2,222,Admission Date: [**2137-7-15**] Discharge...,1.0
3,236,[**2135-6-22**] 9:00 AM\n LIVER OR GALLBLADDER...,1.0
4,305,[**2127-6-19**] 3:44 PM\n CT ABDOMEN W/CONTRAS...,1.0
...,...,...,...
1555,5076,"S/P CABG X 1 LIMA TO LAD\nS: ""HOW AM I DOING?""...",0.0
1556,15969,[**2149-5-17**] 5:58 PM\n CHEST (PA & LAT) ...,0.0
1557,24040,NPN 7P-7A\nPLEASE SEE CAREVIEW FOR OBJECTIVE D...,0.0
1558,24244,Nursing Note 7p-7a\nS: Intubated/[** **].\nO: ...,0.0


In [23]:
X_trainRaw, X_testRaw, y_trainRaw, y_testRaw = train_test_split(cohort_feat[['subject_id', 'features']], cohort_feat['target'], test_size=0.30, random_state=42)

In [24]:
X_trainRaw.head()
# y_trainRaw.head()

Unnamed: 0,subject_id,features
599,32658,[**2143-4-6**] 1:52 PM\n LIVER OR GALLBLADDER ...
613,42327,PATIENT/TEST INFORMATION:\nIndication: Chemoth...
312,17735,[**2170-3-9**] 10:54 AM\n CT ABDOMEN W/CONTRAS...
964,27390,PATIENT/TEST INFORMATION:\nIndication: Left ve...
997,43737,Chief Complaint: Left arm weakness\n I saw a...


In [25]:
raw_train = pd.concat([X_trainRaw, y_trainRaw], axis=1)
raw_test = pd.concat([X_testRaw, y_testRaw], axis=1)

raw_train.to_csv('raw_train.csv')
raw_test.to_csv('raw_test.csv')

In [27]:
tfIdfVectorizer = TfidfVectorizer(use_idf=True)
tf_idf = tfIdfVectorizer.fit_transform(cohort_feat['features'].values.astype('U'))


df = pd.DataFrame(tf_idf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)



In [28]:
df.head(10)

Unnamed: 0,TF-IDF
and,0.240064
with,0.23982
sle,0.223106
to,0.198815
of,0.188987
hd,0.183545
urgency,0.175154
hypertensive,0.17479
2142,0.174463
esrd,0.171771


In [29]:
# tf_idf.toarray()
tf_idf = pd.DataFrame(tf_idf.toarray())

In [30]:
tf_idf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,107809,107810,107811,107812,107813,107814,107815,107816,107817,107818
0,0.012637,0.006237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.005310,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.008912,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.008730,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.009701,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1556,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1557,0.010230,0.044808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1558,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.024799,0.041026,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
cohort_feat.head()

Unnamed: 0,subject_id,features,target
0,109,Chief Complaint: Hypertensive urgency\n I sa...,1.0
1,188,[**2158-8-11**] 10:43 AM\n CT ABD W&W/O C; CT ...,1.0
2,222,Admission Date: [**2137-7-15**] Discharge...,1.0
3,236,[**2135-6-22**] 9:00 AM\n LIVER OR GALLBLADDER...,1.0
4,305,[**2127-6-19**] 3:44 PM\n CT ABDOMEN W/CONTRAS...,1.0


In [32]:
fin_frame = pd.concat([cohort_feat['subject_id'], tf_idf, cohort_feat['target']], axis=1)

In [33]:
fin_frame.head()

Unnamed: 0,subject_id,0,1,2,3,4,5,6,7,8,...,107810,107811,107812,107813,107814,107815,107816,107817,107818,target
0,109,0.012637,0.006237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,188,0.00531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,222,0.008912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,236,0.00873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,305,0.009701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [34]:
X_train, X_test, y_train, y_test = train_test_split(fin_frame.drop('target', axis=1), fin_frame['target'], test_size=0.30, random_state=42)

In [35]:
X_train
# X_trainRaw

Unnamed: 0,subject_id,0,1,2,3,4,5,6,7,8,...,107809,107810,107811,107812,107813,107814,107815,107816,107817,107818
599,32658,0.005328,0.0,0.004796,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
613,42327,0.001727,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
312,17735,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
964,27390,0.003818,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,43737,0.002810,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,50807,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1294,73417,0.013846,0.0,0.011216,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,81723,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1459,5201,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [37]:
train.to_csv('train.csv')
test.to_csv('test.csv')