In [12]:
import numpy as np
import pandas as pd

In [13]:
# Load the annotation table; later cells read its "subject.id" and
# "Hospital.Admission.ID" columns to look up the matching notes.
df = pd.read_csv('annotations.csv')

The provided dataset `annotations.csv` does not contain the crucial discharge summaries due to HIPAA requirements. Thus, you need to run the following commands inside the `\data` directory in order to download the MIMIC-III dataset from PhysioNet, which contains the discharge summaries.

_You can replace the `physionetUsername` with your Physionet username. If you do not have one, you can register for free [here](https://physionet.org/about/database/)._


```python
physionetUsername = input('Physionet Username: ')
physionetDataset = 'https://physionet.org/files/mimiciii/1.4/NOTEEVENTS.csv.gz'
assert physionetUsername is not None
!wget -O "data/mimic3/NOTEEVENTS.csv.gz" -r -N -c -np --user {physionetUsername} --ask-password "{physionetDataset}"
!find . -name '*.csv.gz' -print0 | xargs -0 -n1 gzip -d
```

In [14]:
# Load the decompressed MIMIC-III note events table into `notes`.
# NOTE(review): the download command in the markdown above writes to
# data/mimic3/, while this reads from the working directory — confirm the path.
notes = pd.read_csv('NOTEEVENTS.csv')

In [15]:
# Peek at the admission ID stored in the row labelled 0.
df.loc[0, "Hospital.Admission.ID"]

118003

In [16]:
# Build `summary`: one string per row of `df`, holding the concatenated
# discharge-summary text for that row's (subject, admission) pair.
#
# `notes` must provide the CATEGORY, SUBJECT_ID, HADM_ID and TEXT columns.
# The notes table is filtered and grouped once up front instead of being
# re-scanned with three boolean masks for every annotated admission.
discharge_notes = notes.loc[
    notes['CATEGORY'] == 'Discharge summary',
    ['SUBJECT_ID', 'HADM_ID', 'TEXT'],
]

# sort=False only affects the order of the groups; rows inside a group keep
# their original order, so multiple summaries for one admission are joined in
# the same order as they appear in `notes`.
text_by_admission = (
    discharge_notes
    .groupby(['SUBJECT_ID', 'HADM_ID'], sort=False)['TEXT']
    .agg(''.join)
    .to_dict()
)

# Iterating the columns directly (rather than df.loc[i, ...] over
# range(len(df))) does not depend on `df` having a default integer index.
# Admissions without any discharge summary get an empty string.
# No scratch variable named `words` is used here, so the token list of the
# same name consumed by the Word2Vec cells is not clobbered.
summary = [
    text_by_admission.get((sub_id, hadm_id), "")
    for sub_id, hadm_id in zip(df["subject.id"], df["Hospital.Admission.ID"])
]

KeyError: 'CATEGORY'

In [92]:
# Attach the collected discharge-summary text as a new `text` column
# (`summary` holds one string per row of `df`).
df['text'] = summary

In [17]:
# Display the annotation frame, now including the `text` column.
df

Unnamed: 0.1,Unnamed: 0,Hospital.Admission.ID,subject.id,chart.time,cohort,Obesity,Non.Adherence,Developmental.Delay.Retardation,Advanced.Heart.Disease,Advanced.Lung.Disease,Schizophrenia.and.other.Psychiatric.Disorders,Alcohol.Abuse,Other.Substance.Abuse,Chronic.Pain.Fibromyalgia,Chronic.Neurological.Dystrophies,Advanced.Cancer,Depression,Dementia,Unsure,text
0,0,118003,3644,118003,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,Admission Date: [**2200-4-7**] Discharge ...
1,1,177830,97736,999999,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,Admission Date: [**2107-2-13**] ...
2,2,185673,27694,999999,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,Admission Date: [**2167-5-19**] ...
3,3,131938,16275,131938,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Admission Date: [**2105-4-30**] Dischar...
4,4,198999,4059,198999,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,Admission Date: [**2108-3-23**] Dischar...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1605,1605,154684,8743,999999,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Admission Date: [**2133-1-7**] D...
1606,1606,110974,17580,999999,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,Admission Date: [**2130-1-3**] D...
1607,1607,160802,7188,160802,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Admission Date: [**2177-5-14**] ...
1608,1608,106955,518,106955,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,Admission Date: [**2109-4-12**] ...


In [94]:
# Persist the annotations together with the new `text` column.
# index=False keeps the row index out of the file; without it every
# write/read round trip adds another "Unnamed: 0"-style column to the data.
df.to_csv('w_text.csv', index=False)

You can replace the original `annotations.csv` with the new dataset `w_text.csv` that contains the discharge summary.

In [3]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

In [4]:
# Show gensim's bundled sample corpus: a list of token lists.
print(common_texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [9]:
# Train Word2Vec on `words` and write the vectors to test_w2v.txt in the
# plain-text (binary=False) word2vec format.
# NOTE(review): the list-of-token-lists `words` is built in a later cell of
# this notebook, so this cell does not work when run top to bottom on a fresh
# kernel — confirm the intended cell order.
model = Word2Vec(sentences=words, vector_size=100, window=5, min_count=1, workers=4)
model.wv.save_word2vec_format('test_w2v.txt', binary=False)

In [11]:
# Retrain on `words` passing only min_count=1; this rebinds `model` and
# overwrites the test_w2v.txt written by the previous cell.
model = Word2Vec(sentences=words, min_count=1)
model.wv.save_word2vec_format('test_w2v.txt', binary=False)

In [44]:
# Raw discharge-summary strings as a NumPy array, one entry per row of `df`.
# (The original line was missing its operand and did not parse.)
text = df["text"].values


In [45]:
# Report the container type, the shape and the contents of `text`,
# one item per line.
print(type(text), text.shape, text, sep="\n")

<class 'numpy.ndarray'>
(1610,)
["Admission Date:  [**2200-4-7**]     Discharge Date:  [**2200-4-10**]\n\nDate of Birth:   [**2146-9-21**]     Sex:  F\n\nService:  CARDIAC INTENSIVE CARE MEDICINE\n\nCHIEF COMPLAINT:  The patient was admitted to the Cardiac\nIntensive Care Unit Medicine Service on [**2200-4-7**], with the\nchief complaint of acute myocardial infarction and fever.\n\nHISTORY OF PRESENT ILLNESS:  The patient is a 53 year old\nwhite female with a history of coronary artery disease,\nhypertension, hypercholesterolemia and two pack per day\ntobacco use with previous coronary artery bypass graft\nsurgery presenting to an outside hospital on [**2200-4-6**], with a\ntwo day history of fevers and confusion.  The patient had a\nCT scan of the chest at that time which revealed pneumonia by\nreport in the left lower lobe.\n\nWhile in the outside hospital Emergency Department, the\npatient complained of chest pain.  The patient states that\nshe has had this pain for approximately tw

In [60]:
# Whitespace-tokenise every entry of the `text` column; each cell is coerced
# with str() before splitting. The result is one token list per row of `df`.
words = [str(note).split() for note in df["text"].values]
print(len(words))


1610


In [61]:
# Show the token list produced for the first note.
print(words[0])

['Admission', 'Date:', '[**2200-4-7**]', 'Discharge', 'Date:', '[**2200-4-10**]', 'Date', 'of', 'Birth:', '[**2146-9-21**]', 'Sex:', 'F', 'Service:', 'CARDIAC', 'INTENSIVE', 'CARE', 'MEDICINE', 'CHIEF', 'COMPLAINT:', 'The', 'patient', 'was', 'admitted', 'to', 'the', 'Cardiac', 'Intensive', 'Care', 'Unit', 'Medicine', 'Service', 'on', '[**2200-4-7**],', 'with', 'the', 'chief', 'complaint', 'of', 'acute', 'myocardial', 'infarction', 'and', 'fever.', 'HISTORY', 'OF', 'PRESENT', 'ILLNESS:', 'The', 'patient', 'is', 'a', '53', 'year', 'old', 'white', 'female', 'with', 'a', 'history', 'of', 'coronary', 'artery', 'disease,', 'hypertension,', 'hypercholesterolemia', 'and', 'two', 'pack', 'per', 'day', 'tobacco', 'use', 'with', 'previous', 'coronary', 'artery', 'bypass', 'graft', 'surgery', 'presenting', 'to', 'an', 'outside', 'hospital', 'on', '[**2200-4-6**],', 'with', 'a', 'two', 'day', 'history', 'of', 'fevers', 'and', 'confusion.', 'The', 'patient', 'had', 'a', 'CT', 'scan', 'of', 'the', 'c

In [47]:
import numpy as np

# Stand-in for the real `text` array (1610 rows): two short sample strings.
# Swap this line for your own array.
test = np.array(["This is a sample string", "Another sample string"])

# Split every row on whitespace, giving one inner token list per row.
nested_list = list(map(str.split, test))

print(nested_list)


[['This', 'is', 'a', 'sample', 'string'], ['Another', 'sample', 'string']]


In [59]:
# NOTE(review): looks like a leftover scratch cell — confirm it can be removed.
print("hello")

hello
