In [1]:
##############
# DATA LOADING
##############

In [3]:
##################
# All imports 
##################

#sys libs
import os
import sys
import random
import warnings
warnings.filterwarnings("ignore")

#data manipulation libs
import pandas as pd
import numpy as np
#from sklearn.model_selection import train_test_split

#from pandarallel import pandarallel

# Initialization
#pandarallel.initialize()

#string manipulation libs
import re
import string
from string import digits
import spacy

#torch libs
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

import transformers

from transformers import BertTokenizer

# data manipulations
from pathlib import Path
import uuid
import pydicom

from PIL import Image

import cv2
import matplotlib.pyplot as plt

In [6]:
#############################
# Prepare data
#############################

# Index tables used to make the csv with raw reports; not really needed anymore
# once the reports are available in a csv.
cxr_root_path = "/media/Data/ComputerVision/mimic_cxr_jpg_small_chest_radiograph/data/physionet.org/files/mimic-cxr-jpg/2.0.0/"

# One row per image record, and the split assignment of each image.
df_record = pd.read_csv(f'{cxr_root_path}/cxr-record-list.csv.gz', sep=',')
df_split = pd.read_csv(f'{cxr_root_path}/mimic-cxr-2.0.0-split.csv.gz')

# Left join: keep every row of the split table and attach the matching record columns.
df_temp = pd.merge(df_split, df_record, how='left', on=['subject_id', 'study_id', 'dicom_id'])

# Reports already divided into sections.
df_sections = pd.read_csv(f'{cxr_root_path}/mimic-cxr-sections/mimic_cxr_sectioned.csv')

# If you already have df_raw_reports.csv, just uncomment the following lines and skip to the Vocabulary class.

# extract the csv into a dataframe
# df_raw_reports = pd.read_csv('df_raw_reports.csv')
# # discard duplicates of the reports as there were originally more rows than reports (as there is sometimes more images per study)
# df_raw_reports= df_raw_reports.drop_duplicates(subset = ["raw_report"])
# #restart the index column as it was filtered from larger data and the indices were messed up
# df_raw_reports.reset_index(inplace=True)
# #discard some unimportant columns
# df_raw_reports = df_raw_reports.drop('index', 1)
# df_raw_reports = df_raw_reports.drop('Unnamed: 0', 1)
# # print the head to check if looks like intended
# df_raw_reports.head()


Unnamed: 0,dicom_id,study_id,subject_id,split,path,raw_report
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...,FINAL REPORT\...
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,FINAL REPORT\...
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train,files/p10/p10000032/s53911762/68b5c4b1-227d048...,FINAL REPORT\...
3,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,56699142,10000032,train,files/p10/p10000032/s56699142/ea030e7a-2e3b134...,FINAL REPORT\...
4,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,57375967,10000764,train,files/p10/p10000764/s57375967/096052b7-d256dc4...,FINAL REPORT\...


In [None]:
# Rename the 'study' column to 'study_id', the column name used by the other tables.
df_new_sections = df_sections.rename({'study': 'study_id'}, axis='columns')

In [None]:
# Adding the raw report to the sections df

# Columns of the record list, pulled out as plain Python lists.
paths = df_record["path"].tolist()
study_ids = df_record["study_id"].tolist()

# The same ids as strings with an 's' prefix (e.g. 50414267 -> 's50414267').
study_ids_new = ['s' + str(study) for study in study_ids]


In [None]:
# Creates a dataframe with the study id and raw report of every study; prints the name of each main directory
# as it is entered so you can follow the progress.
# There are ten main directories (p10-p19); as far as I remember, the full run takes around 25-30 min.

import ntpath

def path_leaf(path):
    """Return the last component of `path`.

    The path is split with `ntpath`, so both '/' and '\\' count as separators.
    When the path ends with a separator, the last component before that
    separator is returned instead of an empty string.
    """
    parent, leaf = ntpath.split(path)
    if leaf:
        return leaf
    # Trailing separator: the leaf is the last component of the head.
    return ntpath.basename(parent)

def searchDirectory(mainDirectory, df_raw_reports, files_root=None):
    """Collect every report text file below one main directory.

    Walks each entry of `<files_root>/<mainDirectory>` recursively, reads every
    file whose name ends in '.txt' and adds one row per file to the frame.

    Parameters
    ----------
    mainDirectory : str
        Name of the main directory to scan (e.g. 'p10'). It is printed on entry
        so the progress of a long run is visible.
    df_raw_reports : pandas.DataFrame
        Frame the new rows are added to. It is not modified in place.
    files_root : str, optional
        Directory that contains the main directories. Defaults to the `files/`
        folder of the local MIMIC-CXR-JPG download.

    Returns
    -------
    pandas.DataFrame
        `df_raw_reports` followed by one row per report with the columns
        'study_id' and 'raw_report', re-indexed from 0. When no report is found
        the input frame is returned unchanged.
    """
    if files_root is None:
        files_root = '/media/Data/ComputerVision/mimic_cxr_jpg_small_chest_radiograph/data/physionet.org/files/mimic-cxr-jpg/2.0.0/files/'
    directory = os.path.join(files_root, mainDirectory)
    report_ext = '.txt'

    print(mainDirectory)

    # Rows are gathered in a plain list and turned into a frame once at the end:
    # growing a DataFrame row by row copies the whole frame for every report,
    # and DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    # Sorted traversal makes the row order reproducible between runs.
    for subDirectory in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, subDirectory)

        for root, dirs, files in os.walk(full_path):
            dirs.sort()  # in-place sort steers the order in which os.walk descends

            for filename in sorted(files):
                if not filename.endswith(report_ext):
                    continue

                with open(os.path.join(root, filename), 'r') as report:
                    contents = report.read()

                # Characters 1-8 of the file name are used as the study id
                # (e.g. 's50414267.txt' -> '50414267').
                records.append({"study_id": filename[1:9], "raw_report": contents})

    if not records:
        return df_raw_reports

    new_rows = pd.DataFrame(records, columns=["study_id", "raw_report"])
    return pd.concat([df_raw_reports, new_rows], ignore_index=True)



    
# Main directories of the dataset: 'p10' through 'p19'.
directories = ['p{}'.format(number) for number in range(10, 20)]

# Start from an empty frame and let every main directory add its reports.
df_raw_reports = pd.DataFrame([], columns=['study_id', 'raw_report'])
for directory in directories:
    df_raw_reports = searchDirectory(directory, df_raw_reports)


In [2]:
# Persist the reports collected above.
df_raw_reports.to_csv('raw_reports.csv', index=False)

# NOTE(review): this reads 'df_raw_reports.csv', not the 'raw_reports.csv' written
# just above - presumably a separately prepared file; confirm this is intended.
df_raw_reports = pd.read_csv('df_raw_reports.csv')

# Keep one row per distinct report, renumber the rows from 0 and discard the
# index column that an earlier to_csv call left in the file (if there is one).
# `columns=` is used because pandas >= 2.0 no longer accepts the axis as a
# positional argument to drop().
df_raw_reports = (
    df_raw_reports
    .drop_duplicates(subset=["raw_report"])
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
df_raw_reports.head()

In [7]:
##############################
# Vocabulary class
##############################

class Vocabulary:
    '''
    Token vocabulary: maps tokens to integer indexes (stoi) and back (itos).

    Indexes 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and <UNK>;
    the tokens found by build_vocabulary are numbered from 4 upwards.
    '''

    # Cleaning patterns, compiled once for all calls.
    _PUNCTUATION = re.compile(r'[^\w\s]')  # everything that is neither a word character nor whitespace
    _UNDERSCORES = re.compile('_')         # the __ runs in the reports
    _DIGITS = re.compile(r'[\d-]')         # numbers in the reports
    _NEWLINES = re.compile('\n')

    # BERT tokenizer shared by all instances. It is loaded on first use and then
    # reused; loading it again for every report is needlessly expensive.
    _tokenizer = None

    def __init__(self, freq_threshold, max_size):
        '''
        freq_threshold : a token is kept only if it occurs MORE than this many times in the corpus
        max_size : max vocab size, special tokens included. Eg. if set to 10,000, we keep the
                   10,000 - 4 most frequent tokens and discard the others
        '''
        #initiate the index to token dict
        ## <PAD> -> padding, used for padding the shorter sentences in a batch to match the length of longest sentence in the batch
        ## <SOS> -> start token, added in front of each sentence to signify the start of sentence
        ## <EOS> -> End of sentence token, added to the end of each sentence to signify the end of sentence
        ## <UNK> -> words which are not found in the vocab are replace by this token
        self.itos = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}

        #initiate the token to index dict
        self.stoi = {token: index for index, token in self.itos.items()}

        self.freq_threshold = freq_threshold
        self.max_size = max_size

    def __len__(self):
        '''
        number of entries in the vocab, special tokens included
        '''
        return len(self.itos)

    @classmethod
    def _get_tokenizer(cls):
        '''
        return the shared 'bert-base-uncased' tokenizer, loading it on the first call only
        '''
        if cls._tokenizer is None:
            cls._tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        return cls._tokenizer

    @staticmethod
    def tokenizer_function(text):
        '''
        clean a report and split it into tokens with the BERT tokenizer

        punctuation, underscores and digits are removed; newlines are replaced by a
        space so that the words on either side of a line break are not glued together
        '''
        cleanedReport = Vocabulary._PUNCTUATION.sub('', text)
        cleanedReport = Vocabulary._UNDERSCORES.sub('', cleanedReport)
        cleanedReport = Vocabulary._DIGITS.sub('', cleanedReport)
        cleanedReport = Vocabulary._NEWLINES.sub(' ', cleanedReport)
        return Vocabulary._get_tokenizer().tokenize(cleanedReport)

    def build_vocabulary(self, sentence_list):
        '''
        build the vocab: create a dictionary mapping of index to string (itos) and string to index (stoi)
        output ex. for stoi -> {'the':5, 'a':6, 'an':7}

        sentence_list : iterable of raw texts
        returns the {token: frequency} dict of the tokens that were added to the vocab,
        most frequent first
        '''
        frequencies = {}  #init the freq dict
        idx = 4  #index from which we want our dict to start. We already used 4 indexes for pad, start, end, unk

        for count, sentence in enumerate(sentence_list, start=1):
            if count % 10000 == 0:
                print(count)  # progress marker every 10,000 texts
            for word in self.tokenizer_function(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1

        #limit vocab by removing low freq words
        frequencies = {k: v for k, v in frequencies.items() if v > self.freq_threshold}

        #limit vocab to the max_size specified; the 4 special tokens count towards max_size.
        #The number of free slots is clamped at 0: a negative slice bound would keep
        #an arbitrary part of the list instead of nothing.
        free_slots = max(self.max_size - idx, 0)
        ranked = sorted(frequencies.items(), key=lambda item: -item[1])
        frequencies = dict(ranked[:free_slots])

        #create vocab
        for word in frequencies:
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1
        return frequencies

    def numericalize(self, text):
        '''
        convert a text to the list of indexes of its tokens
        out-of-vocab tokens are represented by the <UNK> index
        '''
        unk_index = self.stoi['<UNK>']
        return [self.stoi.get(token, unk_index) for token in self.tokenizer_function(text)]

In [8]:
###############
# Train dataset
###############

class TrainDataset(Dataset):
    '''
    Dataset of numericalized report texts that also owns the vocabulary.

    df_train : dataframe holding the texts
    target_column : name of the column with the text of each row
    transform : stored on the instance; not applied anywhere in this class
    freq_threshold, vocab_max_size : passed on to Vocabulary
    '''

    def __init__(self, df_train, target_column, transform=None, freq_threshold = 0,
                vocab_max_size = 5000):

        self.df_train = df_train
        self.transform = transform
        self.target_text = df_train[target_column]

        # the vocab is built from all texts of this dataframe
        self.report_vocab = Vocabulary(freq_threshold, vocab_max_size)
        self.report_vocab.build_vocabulary(self.target_text.tolist())

    def __len__(self):
        # number of rows; the dataframe is stored as df_train (there is no self.df)
        return len(self.df_train)

    def __getitem__(self, index):
        '''
        return the text at position `index` as a 1-D tensor of vocab indexes,
        wrapped in <SOS> ... <EOS>
        '''
        # positional lookup: works even when the dataframe index is not 0..n-1
        # (e.g. after filtering rows)
        target_text = self.target_text.iloc[index]

        #numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,7,9,24,2]
        numerialized_target_text = [self.report_vocab.stoi["<SOS>"]]
        numerialized_target_text += self.report_vocab.numericalize(target_text)
        numerialized_target_text.append(self.report_vocab.stoi["<EOS>"])

        #convert the list to tensor and return
        return torch.tensor(numerialized_target_text)


In [10]:
####################
# Validation dataset
####################

class Validation_Dataset:
    '''
    Numericalized report texts that reuse the vocabulary of a train dataset.

    train_dataset : dataset whose vocab is used as lookup; the vocab is read from its
                    `report_vocab` attribute (as set by TrainDataset), or from
                    `target_vocab` when `report_vocab` is missing
    df : dataframe holding the texts
    target_column : name of the column with the text of each row
    transform : stored on the instance; not applied anywhere in this class
    '''

    def __init__(self, train_dataset, df, target_column, transform = None):
        self.df = df
        self.transform = transform

        #train dataset will be used as lookup for vocab
        self.train_dataset = train_dataset

        #get target texts
        self.target_texts = self.df[target_column]

    def _vocab(self):
        # TrainDataset stores its vocab as `report_vocab`; `target_vocab` is kept
        # as a fallback for train datasets that use that attribute name.
        vocab = getattr(self.train_dataset, "report_vocab", None)
        if vocab is None:
            vocab = self.train_dataset.target_vocab
        return vocab

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        '''
        return the text at position `index` as a 1-D tensor of vocab indexes,
        wrapped in <SOS> ... <EOS>
        '''
        # positional lookup: works even when the dataframe index is not 0..n-1
        target_text = self.target_texts.iloc[index]
        vocab = self._vocab()

        #numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,7,9,24,2]
        numerialized_target = [vocab.stoi["<SOS>"]]
        numerialized_target += vocab.numericalize(target_text)
        numerialized_target.append(vocab.stoi["<EOS>"])
        return torch.tensor(numerialized_target)

In [12]:
# A small example: build the dataset (and its vocabulary) from the first 10 reports only.
df_train_test = df_raw_reports.iloc[:10]

train_dataset = TrainDataset(df_train=df_train_test, target_column="raw_report")

In [14]:
# Indexing the dataset returns one report as a 1-D tensor of vocabulary indexes,
# starting with the <SOS> index and ending with the <EOS> index, e.g.
train_dataset[0]

tensor([  1,  18,  19,  44,   6,  15,   7,  73,  74,  35,  62,  12,  52, 193,
         45, 194,  46, 195,  27,  63, 196,  53,   6,  15,   7,  36,  20, 123,
         21,  11,   5,   9,  22,  47,  28,  29,  30,  31,  32,  33,  37,  23,
         38,  39,  40,  41,  34,  99,  64, 124,  65,  24, 125, 126, 127, 197,
        198, 199, 200,   4,  25,  54,  55,  56,  57,  27,  58,   5,  16,  75,
        128,  59,   4,  13,  14, 201, 129,   4,  76,   4, 130,  77,  66, 131,
          5, 202, 203, 204, 205, 206, 207, 208, 209,  10,   4, 210,  13, 132,
          7, 211, 212,   8, 133,  26,   9,  48,  25,  78,  79,  80,  81,  82,
         60,   2])