# Data Loading

In [1]:
##################
# All imports 
##################

#sys libs
import os
import sys
import random
import warnings
warnings.filterwarnings("ignore")

#data manupulation libs
import pandas as pd
import numpy as np
#from sklearn.model_selection import train_test_split

#from pandarallel import pandarallel

# Initialization
#pandarallel.initialize()

#string manupulation libs
import re
import string
from string import digits
import spacy

#torch libs
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

import transformers

from transformers import BertTokenizer, AutoTokenizer

# data manipulations
from pathlib import Path
import uuid
import pydicom

from PIL import Image

import cv2
import matplotlib.pyplot as plt

### Data Preparation

In [2]:
# Used to make the csv with raw reports, not really important anymore once we have the reports in a csv
cxr_root_path = "/media/Data/ComputerVision/mimic_cxr_jpg_small_chest_radiograph/data/physionet.org/files/mimic-cxr-jpg/2.0.0/"
df_record = pd.read_csv('{}/cxr-record-list.csv.gz'.format(cxr_root_path), sep=',')
df_split = pd.read_csv('{}/mimic-cxr-2.0.0-split.csv.gz'.format(cxr_root_path))

df_temp = df_split.merge(df_record, on=['subject_id', 'study_id', 'dicom_id'], how='left')

df_sections = pd.read_csv('{}/mimic-cxr-sections/mimic_cxr_sectioned.csv'.format(cxr_root_path))

# if you already have the df_raw_reports.csv just uncomment the following line and skipp to the vocabulary class

# extract the csv into a dataframe
df_raw_reports = pd.read_csv('df_raw_reports.csv')
# discard duplicates of the reports as there were originally more rows than reports (as there is sometimes more images per study)
df_raw_reports= df_raw_reports.drop_duplicates(subset = ["raw_report"])
#restart the index column as it was filtered from larger data and the indices were messed up
df_raw_reports.reset_index(inplace=True)
#discard some unimportant columns
df_raw_reports = df_raw_reports.drop('index', 1)
df_raw_reports = df_raw_reports.drop('Unnamed: 0', 1)
# print the head to check if looks like intended
df_raw_reports.head()


Unnamed: 0,dicom_id,study_id,subject_id,split,path,raw_report
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...,FINAL REPORT\...
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,FINAL REPORT\...
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train,files/p10/p10000032/s53911762/68b5c4b1-227d048...,FINAL REPORT\...
3,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,56699142,10000032,train,files/p10/p10000032/s56699142/ea030e7a-2e3b134...,FINAL REPORT\...
4,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,57375967,10000764,train,files/p10/p10000764/s57375967/096052b7-d256dc4...,FINAL REPORT\...


In [3]:
# Checking test dataset number
df_split.head()
print(df_split.shape)

split = df_split["split"].tolist()
idx = 0
for elem in split:
    if elem == "test":
        idx+=1

print(idx)

(377110, 4)
5159


### Creating raw reports dataframe (Skip if done)

In [11]:
df_new_sections = df_sections.rename(columns={'study': 'study_id'})

In [12]:
# Adding the raw report to the sections df


paths = df_record["path"].tolist()
study_ids = df_record["study_id"].tolist()

study_ids_new=[]

for study in study_ids:
    
    new_id = 's' + str(study)
    study_ids_new.append(new_id)


In [9]:
# creates dataframe with study id and raw reports, prints the name of the main directory on enter so you can see the progress
# overall there is p10-p19 directories, should take around 25-30 min to run as far as I remember

import ntpath

def path_leaf(path):
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)

def searchDirectory(mainDirectory,df_raw_reports):


    directory = '/media/Data/ComputerVision/mimic_cxr_jpg_small_chest_radiograph/data/physionet.org/files/mimic-cxr-jpg/2.0.0/files/' + mainDirectory
    report_ext = '.txt'
    image_ext = '.jpg'
    
    print(mainDirectory)
    for subDirectory in os.listdir(directory):
        full_path = directory + '/' + subDirectory
       
        for root, dirs, files in os.walk(full_path):
            
            for filename in files:

                if filename.endswith(report_ext):
                    with open(os.path.join(root, filename), 'r') as report:
                        contents = report.read()
                    
                    df_temp = {"study_id": path_leaf(filename)[1:9], "raw_report": contents}
                    df_raw_reports = df_raw_reports.append(df_temp, ignore_index = True)
        
    return df_raw_reports



    
directories = ['p10','p11','p12','p13','p14','p15','p16','p17', 'p18', 'p19' ]


df_raw_reports = pd.DataFrame([], columns = ['study_id', 'raw_report'])

for directory in directories:
    df_raw_reports = searchDirectory(directory, df_raw_reports)


p10
p11


KeyboardInterrupt: 

In [13]:
df_raw_reports.head()
df_raw_reports.to_csv('raw_reports.csv', index=False)


df_raw_reports = pd.read_csv('df_raw_reports.csv')

df_raw_reports= df_raw_reports.drop_duplicates(subset = ["raw_report"])
df_raw_reports.reset_index(inplace=True)
df_raw_reports = df_raw_reports.drop('index', 1)
df_raw_reports = df_raw_reports.drop('Unnamed: 0', 1)
df_raw_reports.head()

Unnamed: 0,dicom_id,study_id,subject_id,split,path,raw_report
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...,FINAL REPORT\...
1,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,FINAL REPORT\...
2,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train,files/p10/p10000032/s53911762/68b5c4b1-227d048...,FINAL REPORT\...
3,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,56699142,10000032,train,files/p10/p10000032/s56699142/ea030e7a-2e3b134...,FINAL REPORT\...
4,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,57375967,10000764,train,files/p10/p10000764/s57375967/096052b7-d256dc4...,FINAL REPORT\...


### Classes for the Data Loader
- Include: loading data, text preprocessing, words frequency check, tokenization, tokens-IDs-conversion 

In [84]:
##############################
# Vocabulary class
##############################

class Vocabulary:
  
    '''
    __init__ method is called by default as soon as an object of this class is initiated
    we use this method to initiate our vocab dictionaries
    '''
    def __init__(self, freq_threshold, max_size):
        '''
        freq_threshold : the minimum times a word must occur in corpus to be treated in vocab
        max_size : max source vocab size. Eg. if set to 10,000, we pick the top 10,000 most frequent words and discard others
        '''
        
        self.freq_threshold = freq_threshold
        self.max_size = max_size
    
    
    '''
    a simple tokenizer to split on space and converts the sentence to list of words
    '''
    @staticmethod
    def preprocessing(text):
        cleanedReport = re.sub(r'[^\w\s]','',text)            # remove punctuation (not word characters and whitespace)
        cleanedReport = re.sub('_', '', cleanedReport)        # remove __ in the report
        cleanedReport = re.sub(r'[\d-]', '', cleanedReport)   # remove numbers in the report 
        cleanedReport = re.sub('\n', '', cleanedReport)
        # need further Lemmatization?
        return cleanedReport                                  # should be just a string
    
    '''
    vocabulary frequency check:
    '''
    def build_vocabulary(self, sentence_list):
        # calculate the frequencies of each word first to remove the words with freq < freq_threshold
        frequencies = {}  #init the freq dict
        #idx = 4 #index from which we want our dict to start. We already used 4 indexes for pad, start, end, unk
        idx = 4
        
        i=0
        for sentence in sentence_list:
            i+=1
            if (i % 10000 == 0):
                print(i) 
                
            preprocessed = self.preprocessing(sentence)
            preprocessedList = list(preprocessed.split(" "))
            for word in preprocessedList:
                if word not in frequencies.keys():
                    frequencies[word]=1
                else:
                    frequencies[word]+=1
              
                    
        #limit vocab by removing low freq words
        frequencies = {k:v for k,v in frequencies.items() if v > self.freq_threshold} 
        
        #limit vocab to the max_size specified
        frequencies = dict(sorted(frequencies.items(), key = lambda x: -x[1])[:self.max_size-idx]) # idx =4 for pad, start, end , unk
            
        return frequencies

            
    '''
    tokenization and converting tokens to IDs
    '''    
    def tokenization(self, text):
        
        # change here to use different tokenizer models:
        #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        tokenized_text = tokenizer.tokenize(text)
        
        tokenized_report = ["[CLS]"]
        tokenized_report += tokenized_text
        tokenized_report.append("[SEP]")
        numericalized_report = tokenizer.convert_tokens_to_ids(tokenized_report)
        
#         # Another encoding method which works on bert-base-uncased tokenizer but not working well with not the others...
#         text_str = ' '.join([str(elem) for elem in text])
#         encoded = tokenizer.encode_plus(text=text_str,  # the sentence to be encoded
#                                         add_special_tokens=True,  # Add [CLS] and [SEP]
#                                         return_attention_mask = True,  # Generate the attention mask
#                                         #return_tensors = 'pt',  # ask the function to return PyTorch tensors
#                                         )
        
        return numericalized_report

In [85]:
###############
# Train dataset
###############

class TrainDataset(Dataset):
    
    def __init__(self, df_train, target_column, transform=None, freq_threshold = 0,
                vocab_max_size = 5000):
       
        self.df_train = df_train
        
        self.transform = transform
        
        self.target_text = df_train[target_column]

        self.report_vocab = Vocabulary(freq_threshold, vocab_max_size)
        self.frequencyDict = self.report_vocab.build_vocabulary(self.target_text.tolist())  # build vocab for whole thing (list of whole thing)
        
    def __len__(self):
        return len(self.df_train)
    

    def __getitem__(self, index):
  
        target_text = self.target_text[index]                             # string
        target_text = self.report_vocab.preprocessing(target_text)
        
        for word in target_text.split():
            if word not in self.frequencyDict.keys():
                target_text = target_text.replace(word, "")
            
        numericalized_report = self.report_vocab.tokenization(target_text)
        
        # uncomment the following to convert the list to tensor and return (apparently BERT works without tensor)
        # return torch.tensor(numericalized_report)
        return numericalized_report

#### Example:

In [86]:
# Here's a little example on just 10 reports
#df_train_test = df_raw_reports[0:10]
df_train_test = df_raw_reports[0:10]

train_dataset = TrainDataset(df_train_test, "raw_report")

In [88]:
# we can see the results, e.g.
example = train_dataset[0]
print(example)
print("\n")
print("Number of tokens: ", len(example))

[101, 1509, 2592, 8179, 2229, 185, 1161, 1105, 2495, 1204, 12754, 175, 1114, 1207, 15415, 1112, 14375, 1116, 174, 7501, 1111, 8974, 5531, 2229, 185, 1161, 1105, 11937, 7577, 3839, 9505, 1175, 1110, 1185, 17811, 20994, 185, 1513, 12602, 174, 3101, 17268, 1137, 185, 1673, 1818, 12858, 25632, 20557, 6873, 5552, 11769, 7409, 4233, 1115, 1211, 2620, 4248, 15070, 6719, 1103, 3621, 2660, 16418, 2050, 14196, 27316, 1110, 2999, 16973, 1933, 1166, 1103, 1286, 13093, 9046, 1439, 1103, 7209, 1103, 3077, 1181, 3105, 14701, 1110, 8362, 16996, 23822, 1895, 13306, 19353, 24211, 1785, 1104, 1103, 16530, 1286, 3971, 1105, 5001, 10346, 1132, 2382, 8351, 1185, 12104, 3621, 2660, 16091, 13505, 7637, 1616, 1965, 102]


Number of tokens:  114


In [89]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Conversion back to tokens:
print("Tokens conversion:")
print(tokenizer.convert_ids_to_tokens(example))
print("\n")

# Decode it back into sentences (tokens of subwords will be merged together)
print("Decoding:")
print(tokenizer.decode(example))

Tokens conversion:
['[CLS]', 'final', 'report', 'examination', 'chest', 'p', '##a', 'and', 'la', '##t', 'indication', 'f', 'with', 'new', 'onset', 'as', '##cite', '##s', 'e', '##val', 'for', 'infection', 'technique', 'chest', 'p', '##a', 'and', 'lateral', 'comparison', 'none', 'findings', 'there', 'is', 'no', 'focal', 'consolidation', 'p', '##le', '##ural', 'e', '##ff', '##usion', 'or', 'p', '##ne', '##um', '##oth', '##orax', 'bilateral', 'nod', '##ular', 'op', '##ac', '##ities', 'that', 'most', 'likely', 'represent', 'nipple', 'shadows', 'the', 'card', '##io', '##media', '##st', '##inal', 'silhouette', 'is', 'normal', 'clips', 'project', 'over', 'the', 'left', 'lung', 'potentially', 'within', 'the', 'breast', 'the', 'image', '##d', 'upper', 'abdomen', 'is', 'un', '##rem', '##ark', '##able', 'chronic', 'def', '##orm', '##ity', 'of', 'the', 'posterior', 'left', 'sixth', 'and', 'seventh', 'ribs', 'are', 'noted', 'impression', 'no', 'acute', 'card', '##io', '##pu', '##lm', '##ona', '##ry'

### Chunking to unify token lengths of each sample

In [95]:
def chunking(dataset, df, chunk_size = 128):
    
    concat = []
    for i in range(len(dataset)):     # Add all tokens together in one list
        concat.extend(dataset[df.index[i]])
        #print(i)      # print to check the progress
        
    total_length = len(concat)
    print("Total concatenated length: ", total_length)
    
    chunks = lambda concat, chunk_size: [concat[i:i+chunk_size] for i in range(0, total_length, chunk_size)]  
    chunked_text = chunks(concat, chunk_size)
    
    if len(chunked_text[-1]) < chunk_size:
        print("Removed last chunk: ", len(chunked_text[-1]), " tokens")
        chunked_text.pop()     # remove last element 
    
    return chunked_text

In [96]:
# EXAMPLE:
example_df = df_raw_reports.sample(n=5)
example_dataset = TrainDataset(example_df, "raw_report")
chunked_text = chunking(example_dataset, example_df)

with open("chunked_example.txt", "w") as output:
    output.write(str(chunked_text))

0
1
2
3
4
Total concatenated length:  563
Removed last chunk:  51  tokens


In [97]:
for chunk in chunked_text:
    print(tokenizer.decode(chunk))
    print("\n")

[CLS] final report examination chest pa and lat indication history m with left arm and right leg numbness ro chf pneumonia technique chest pa and lateral comparison findings the lungs are clearthe cardiac hilar and mediastinal contours are normalno pleural abnormality is seen intact median sternal wires considerable calcification of the descending thoracic aorta impression considerable calcification of the descending thoracic aorta but no evidence of edema or pneumonia [SEP] [CLS] final report portable ap chest from at clinical indication yearold with hcv and hcc who was noted to


be hypoxic pneumonia versus edema comparison is made to the patients prior study of at portable upright chest film at is submitted impression persistent low lung volumes with interval appearance of patchy bibasilar opacities which could reflect patchy atelectasis although pneumonia or aspiration should also be considered no evidence of pulmonary edema although there is crowding of the pulmonary vasculature m

In [98]:
print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))

<torch.cuda.device object at 0x7f51f5d08cc0>
GeForce RTX 3090


In [99]:
# # For all training reports:
# trainDf = df_raw_reports[0:len(df_raw_reports.index)]
# train_dataset = TrainDataset(trainDf, "raw_report")
# chunked_train_dataset = chunking(train_dataset, trainDf)    # chunk size = 128 

# # Save as .txt file:
# with open("chunked_trainDataset.txt", "w") as output:
#     output.write(str(chunked_train_dataset))

In [101]:
df_raw_reports_sampled = df_raw_reports.sample(n=20000)
print(df_raw_reports_sampled.shape)
df_raw_reports_sampled.head()

(20000, 6)


Unnamed: 0,dicom_id,study_id,subject_id,split,path,raw_report
74238,35b729df-ba6798a6-84f4eed8-be928459-f6e1323b,58155159,13332955,train,files/p13/p13332955/s58155159/35b729df-ba6798a...,FINAL REPORT\...
69839,0f89f810-7ce5a44d-5b99a560-06f1c85a-a5aabc52,50936682,13145776,train,files/p13/p13145776/s50936682/0f89f810-7ce5a44...,FINAL REPORT\...
180909,61081955-7266d045-757eae92-978b5127-81ac4054,59716009,18128235,train,files/p18/p18128235/s59716009/61081955-7266d04...,FINAL REPORT\...
161458,292e12fa-9b7ceda8-60de0a25-2b73d563-82a30b05,57050873,17260918,train,files/p17/p17260918/s57050873/292e12fa-9b7ceda...,FINAL REPORT\...
13402,05b21ec9-575c84e1-731c557e-d7b188e9-33900cbd,54367541,10610208,train,files/p10/p10610208/s54367541/05b21ec9-575c84e...,FINAL REPORT\...


In [None]:
# 20000 samples first:
train_dataset = TrainDataset(df_raw_reports_sampled, "raw_report")
chunked_train_dataset20000 = chunking(train_dataset, df_raw_reports_sampled)

# Save as .txt file:
with open("chunked_trainDataset20000.txt", "w") as output:
    output.write(str(chunked_train_dataset20000))

Result:
- Total concatenated length:  2350254
- Removed last chunk:  46  tokens

In [104]:
print("Number of samples: ", len(chunked_train_dataset20000))

Number of samples:  18361
