### Environment Setup

```
pip install torch torchvision
pip install -U spacy
pip install nltk
pip install pandas
pip install numpy
```

In [6]:
!pip install numpy

[31msagemaker 1.18.5 has requirement requests<2.21,>=2.20.0, but you'll have requests 2.21.0 which is incompatible.[0m
[31mdocker-compose 1.23.2 has requirement requests!=2.11.0,!=2.12.2,!=2.18.0,<2.21,>=2.6.1, but you'll have requests 2.21.0 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
import torch
import json
from utils import jsonl
import pandas as pd
import numpy as np
import spacy
import nltk
#nltk.data.path.append("/home/ubuntu/nltk_data") # if nltk_data is downloaded to a customized place

In [2]:
# Download the nltk 'punkt' package; the recorded output below shows it was already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Checking the Sample Data Provided

In [2]:
# Read both sample files into memory, one raw jsonl line per list item.
with open('./dataset/stanford-sample/train.jsonl', 'r') as fh:
    stanford_file = list(fh)

with open('./dataset/iu-chest/test.jsonl', 'r') as fh:
    iu_file = list(fh)

In [3]:
## jsonl.load returns a list in which each item is one parsed report dictionary
## with 'part', 'background', 'findings' and 'impression' key-value pairs;
## in the output below the text fields hold lists of tokens.
## The 'part' key is optional.

jsonl.load(stanford_file)

[{'part': 'ankle',
  'background': ['RADIOGRAPHIC',
   'EXAMINATION',
   'OF',
   'THE',
   'FOOT',
   ':',
   '<DATE>',
   '<TIME>',
   'AM',
   'CLINICAL',
   'HISTORY',
   ':',
   '<AGE>',
   'years',
   'of',
   'age',
   ',',
   'Female',
   ',',
   'RT',
   'ANKLE-DXAN2',
   'DXFT3-WB',
   '.',
   'COMPARISON',
   ':',
   'None',
   '.',
   'PROCEDURE',
   'COMMENTS',
   ':',
   'Three',
   'views',
   'of',
   'the',
   'right',
   'and',
   'left',
   'foot',
   '.',
   '3',
   'views',
   'of',
   'the',
   'right',
   'and',
   'left',
   'ankle',
   '.'],
  'findings': ['FINDINGS',
   ':',
   '3',
   'views',
   'of',
   'the',
   'left',
   'foot',
   'demonstrate',
   'mild',
   'midfoot',
   'degenerative',
   'changes',
   'and',
   'mild',
   'hammertoe',
   'deformity',
   'of',
   'the',
   'third',
   'ray',
   '.',
   'No',
   'acute',
   'fracture',
   'or',
   'dislocation',
   '.',
   '3',
   'views',
   'of',
   'the',
   'right',
   'foot',
   'demonstrate',
  

In [4]:
# Parse the IU chest sample the same way; the records shown below carry no 'part' key.
jsonl.load(iu_file)

[{'background': ['INDICATION',
   ':',
   'XXXX',
   'year',
   'old',
   'male',
   'status',
   'post',
   'XXXX',
   'COMPARISON',
   ':',
   'None'],
  'findings': ['FINDINGS',
   ':',
   'Lungs',
   'are',
   'clear',
   'bilaterally',
   'with',
   'no',
   'focal',
   'infiltrate',
   ',',
   'pleural',
   'effusion',
   ',',
   'or',
   'pneumothoraces',
   '.',
   'Cardiomediastinal',
   'silhouette',
   'is',
   'within',
   'normal',
   'limits',
   '.',
   'XXXX',
   'and',
   'soft',
   'tissues',
   'are',
   'unremarkable',
   '.'],
  'impression': ['IMPRESSION',
   ':',
   'No',
   'acute',
   'cardiopulmonary',
   'abnormality',
   '.']},
 {'background': ['INDICATION',
   ':',
   'XXXX',
   'year',
   'old',
   'female',
   ',',
   'post',
   'XXXX',
   'closure',
   'COMPARISON',
   ':',
   'None'],
  'findings': ['FINDINGS',
   ':',
   'Atrial',
   'septal',
   'occluder',
   'artifact',
   '.',
   'Rotated',
   'frontal',
   'position',
   ',',
   'overall',
   'hea

## IA to jsonl Conversion
- Dingchao has an `all_data_cleaned.csv` file that contains the cleaned-up raw data; ask him where it is stored.

In [3]:
## IA parsing function

keywords = {'CLINICAL INFORMATION:',
             'COMPARISON:',
             'FINDINGS:',
             'IMPRESSION:',
             'TECHNIQUE:'}

def parse_text(txt, keywords):
    '''
    Separate bulk report text into subparts based on the section keywords.

    txt: the raw report text.
    keywords: iterable of section headers (e.g. 'FINDINGS:') to look for.

    Returns a one-row DataFrame with one column per keyword, named after the
    keyword without its colons and ordered like the iteration order of
    `keywords`. Each cell holds the text from the keyword (inclusive) up to
    the next keyword found in the text; a keyword that does not occur in the
    text gets an empty string. Text before the first keyword is dropped.
    '''
    # Imported locally so the function works even when the notebook's import
    # cell does not import `re`.
    import re

    # Normalize whitespace so keyword positions are stable.
    txt = txt.replace("\t", "")
    txt = re.sub(' +', ' ', txt).strip()
    txt = re.sub('\n+' , '\n', txt)
    txt = re.sub('FINDINGS:( |\n)*' , 'FINDINGS:', txt)
    txt = re.sub('IMPRESSION:( |\n)*' , 'IMPRESSION:', txt)

    # Freeze the iteration order once; a set cannot be used as DataFrame columns.
    keywords = list(keywords)

    # Only keywords that actually occur take part in the slicing: str.find
    # returns -1 for a missing keyword, which must not be used as a boundary.
    positions = [(txt.find(keyword), keyword) for keyword in keywords]
    found = sorted(pos for pos in positions if pos[0] >= 0)

    sections = {keyword: "" for keyword in keywords}
    for i, (start, keyword) in enumerate(found):
        end = found[i + 1][0] if i + 1 < len(found) else len(txt)
        sections[keyword] = txt[start:end]

    result = pd.DataFrame(data=[[sections[keyword] for keyword in keywords]],
                          columns=[keyword.strip(':') for keyword in keywords])

    return result

In [4]:
## IA data to jsonl format

# Maps each column of the cleaned IA table to the key used in the jsonl records.
column2jsonKey_mapping = {'TECHNIQUE': 'technique',
                            'CLINICAL INFORMATION': 'background',
                            'COMPARISON': 'comparison',
                            'FINDINGS': 'findings', 
                            'IMPRESSION': 'impression',
                           }

def tokenize(txt):
    '''
    Lower-case txt and split it into a list of tokens with nltk.
    This is the one place to swap in a different tokenizer
    (punctuation removal, stemming, normalizing, etc.).
    '''
    # spacy was too slow here because its pipeline does much more than tokenizing:
    # nlp = spacy.load('en_core_web_sm')
    # [token.text for token in nlp(row[input_key])]
    return nltk.word_tokenize(txt.lower())


def row2json_dict(row, column2jsonKey_mapping, is_tokenized=False):
    '''
    Build one jsonl record from a table row.

    row: mapping-like object indexed by the input column names.
    column2jsonKey_mapping: dict of input column name -> output key.
    is_tokenized: when True the row values are copied as they are,
        otherwise each value is passed through tokenize().

    Returns a dict with one entry per output key.
    '''
    return {
        output_key: (row[input_key] if is_tokenized else tokenize(row[input_key]))
        for input_key, output_key in column2jsonKey_mapping.items()
    }


def convert_data_to_jsonl(df):
    '''
    Turn every row of df into a jsonl-style dictionary.

    Missing values are replaced by empty strings, then each row is passed to
    row2json_dict together with the module-level column2jsonKey_mapping.
    Returns the result of the row-wise DataFrame.apply.
    '''
    def to_record(row):
        return row2json_dict(row, column2jsonKey_mapping)

    return df.fillna('').apply(to_record, axis=1)


def dump(data, output_file):
    '''
    Write data to output_file in jsonl format (one JSON document per line).

    data: an iterable of dictionaries; each item is one document.
    output_file: path of the file to create or overwrite.

    Returns None.
    '''
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file is
    # opened as UTF-8 explicitly instead of relying on the platform default
    # encoding, which may be unable to encode them.
    with open(output_file, 'w', encoding='utf-8') as fh:
        for d in data:
            fh.write(json.dumps(d, ensure_ascii=False)+'\n')


def train_test_split(data, train_ratio, dev_ratio, test_ratio, seed=None):
    '''
    Shuffle the observations and split them into train, dev and test lists.

    data: a sequence of observations (e.g. a list of dicts or a pandas Series);
        it is copied into a list and not modified.
    train_ratio, dev_ratio, test_ratio: fraction of the data for each split;
        they must sum to 1 (checked with a floating-point tolerance).
    seed: optional integer; when given, the shuffle is reproducible. When None
        (the default) numpy's global random state is used.

    Returns (train, dev, test) as three lists. The train and dev sizes are
    rounded down and the test split takes every remaining observation, so each
    observation ends up in exactly one split.

    Raises AssertionError if the ratios do not sum to 1.
    '''
    # Exact float equality rejects valid inputs such as 0.7 + 0.2 + 0.1.
    assert np.isclose(train_ratio + dev_ratio + test_ratio, 1.0), \
        'train_ratio + dev_ratio + test_ratio must sum to 1'

    # Copy into a plain list so indexing is positional for any input type.
    items = list(data)
    total_size = len(items)

    rng = np.random if seed is None else np.random.RandomState(seed)
    perm_idx = rng.permutation(total_size)
    shuffled = [items[i] for i in perm_idx]

    train_end = int(total_size * train_ratio)
    dev_end = train_end + int(total_size * dev_ratio)

    train = shuffled[:train_end]
    dev   = shuffled[train_end:dev_end]
    # Slice to the end: a negative-index slice such as [-test_size:-1] loses
    # the last observation and misbehaves when test_size is 0.
    test  = shuffled[dev_end:]

    return train, dev, test
    

In [5]:
# Load the cleaned IA report table; row 0 of the CSV is used as the header.
# NOTE(review): the markdown note in this notebook mentions `all_data_cleaned.csv`,
# while this cell reads `ull_data_cleaned.csv` — confirm which file name is intended.
data_df = pd.read_csv('./dataset/ia-patients/ull_data_cleaned.csv', header = 0)  # add nrows=5000 for a quick partial load

In [6]:
# Preview the first 30 rows of the loaded table.
data_df.head(30)

Unnamed: 0,IMPRESSION,TECHNIQUE,COMPARISON,FINDINGS,CLINICAL INFORMATION
0,"Inflammatory changes involving the maxillary, ...",Contrast enhanced CT scan of the orbits,None.,There is mucosal thickening involving the sphe...,History of right periorbital cellulitis diagno...
1,No significant change from previous with no ev...,Routine non-contrast examination of the brain.,12/30/2007,There is evidence of periventricular hypodensi...,Patient with left-sided weakness and the plaqu...
2,Normal brain CT.,Routine non-contrast examination of the brain.,,There is no evidence of intracranial hemorrhag...,One month old with history of fall. No loss of...
3,Normal noncontrast CT scan brain.,Routine noncontrast CT scan brain,9/26/2005,Noncontrast CT scan brain is normal. There is ...,Headache
4,Small vessel ischemic disease with no evidence...,Routine non-contrast examination of the brain.,,The examination demonstrates mild periventricu...,82-year-old with fever and mental status chang...
5,Status post shunt placement with decreased siz...,Routine noncontrast CT scan brain,12/31/2007 at 5:18,Since the previous examination a right frontal...,Status post surgery
6,Hemorrhage and increased edema within the left...,Routine non-contrast examination of the brain.,12/29/2007,Right frontal shunt catheter is again noted wi...,Check shunt placement
7,Normal brain CT. if clinical suspicion of isch...,Routine non-contrast examination of the brain.,,There is no evidence of intracranial hemorrhag...,rule out CVA in 63 are old woman with altered ...
8,Normal brain CT. if clinical suspicion for acu...,Routine non-contrast examination of the brain.,12/14/2007,There is no evidence of intracranial hemorrhag...,Rule out stroke or bleed in patient with slurr...
9,No definite evidence of acute cortical ischemi...,Routine non-contrast examination of the brain.,,There is no evidence of intracranial hemorrhag...,51 year old with aortic aneurysm repair now wi...


In [4]:
# Keep only the rows whose IMPRESSION is at most MAX_IMPRESSION_LEN characters long.
MAX_IMPRESSION_LEN = 120
MIN_LEN = MAX_IMPRESSION_LEN  # legacy name kept for compatibility; the value is an upper bound
# .str.len() yields NaN for a missing impression instead of raising like map(len);
# NaN > MAX_IMPRESSION_LEN is False, so rows without an impression are kept.
impression_too_long = data_df['IMPRESSION'].str.len() > MAX_IMPRESSION_LEN
data_df = data_df[~impression_too_long]

Data Conversion

In [7]:
%%time
# Tokenize every report row; the recorded run below took about 8 minutes of wall time.
res = convert_data_to_jsonl(data_df)

CPU times: user 8min 2s, sys: 2.24 s, total: 8min 4s
Wall time: 8min 4s


In [8]:
# Write all converted records to a single jsonl file in the working directory.
dump(res, 'ia_data_all_u.jsonl')

Train Test Split

In [9]:
# Shuffle the converted records and split them 80/10/10 into train/dev/test.
train, dev, test = train_test_split(res, train_ratio=0.80, dev_ratio=0.10, test_ratio=0.10)

In [11]:
# Write each split to its own jsonl file under the ia-patients dataset folder.
dump(train, './dataset/ia-patients/trainu.jsonl')
dump(dev, './dataset/ia-patients/devu.jsonl')
dump(test, './dataset/ia-patients/testu.jsonl')

### The following are Utku's optional personal experiments

In [7]:
import json
import uuid
from IPython.display import display_javascript, display_html, display
#import plotly.plotly as py
#import base64
#import plotly.graph_objs as go

class RenderJSON(object):
    '''Display JSON data in a notebook using the renderjson.js script loaded at display time.'''

    def __init__(self, json_data):
        '''
        json_data: a dict or list (serialized with json.dumps), or a string
        that already holds JSON text (used as it is).
        '''
        if isinstance(json_data, (dict, list)):
            self.json_str = json.dumps(json_data)
        else:
            self.json_str = json_data
        # The id must be set for every input type: _ipython_display_ uses it
        # to address the <div> it creates.
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        '''Emit an empty <div> and the JavaScript that renders the JSON into it.'''
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid), raw=True)
        # NOTE(review): the script is fetched from rawgit.com on every display —
        # confirm that this host is still being served.
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
        document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)

In [20]:
# Load every record of the test split back from disk, one JSON document per line.
# (The records are collected in list_of_json_files itself; the file is closed
# as soon as it has been read.)
with open('./dataset/ia-patients/testu.jsonl', 'r', encoding='utf-8') as fh:
    list_of_json_files = [json.loads(line) for line in fh]

len(list_of_json_files)

18660

# Test ground

In [None]:
# Scratch check of how nltk tokenizes a sentence containing a comma and a contraction.
nltk.word_tokenize("Hey, I feel didn't bad utku")