### MIMIC CXR

* [Dataset](https://physionet.org/content/mimic-cxr/2.0.0/)

* [Original Paper](https://www.nature.com/articles/s41597-019-0322-0)

In [1]:
# Move up one directory so the relative `src.*` imports and `datasets/...` paths
# used below resolve (presumably this lands in the project root — confirm).
%cd ..

/home/datascience/Data Fusion


### Setup Environment:

In [2]:
import os
import pandas as pd

from src.classifiers import process_labels, split_data
from src.classifiers_base import preprocess_df

from transformers import BertTokenizer

from src.multimodal_data_loader import VQADataset
from torch.utils.data import DataLoader

from src.classifiers_base_cpu_metrics import calculate_memory

In [3]:
# Dataset directory, given relative to the working directory set by `%cd ..` above.
PATH = 'datasets/mimic/'

In [4]:
# CSV read below into `df`; it carries the label columns, report text and image ids.
text_path = os.path.join(PATH, 'labels.csv')
# Image directory handed to `preprocess_df` together with the `image_id` column.
images_path = os.path.join(PATH, 'images')

## Get data

In [5]:
# Load the labels/metadata table from `labels.csv`.
df = pd.read_csv(text_path)
# Bare expression: renders the (truncated) frame as the cell output.
df

Unnamed: 0,path,race_label,sex_label,disease_label,subject_id,study_id,split,file_path,text,image_id
0,p19/p19702416/s51321189/d85c9f15-f0f84927-761f...,0,0,3,19702416,51321189,train,datasets/mimic/files/p19/p19702416/s51321189.txt,FINAL REPORT\...,s51321189_d85c9f15-f0f84927-761f30e0-51c2d319-...
1,p13/p13339704/s51292704/0024603b-12db30e2-ab32...,2,0,3,13339704,51292704,train,datasets/mimic/files/p13/p13339704/s51292704.txt,FINAL REPORT\...,s51292704_0024603b-12db30e2-ab32c9cb-dae5a3fc-...
2,p13/p13339704/s51292704/7953848d-2411f0df-859f...,2,0,3,13339704,51292704,train,datasets/mimic/files/p13/p13339704/s51292704.txt,FINAL REPORT\...,s51292704_7953848d-2411f0df-859f5cea-38c618e0-...
3,p12/p12668169/s54048859/8a4aaaee-55fcf98f-a036...,0,0,3,12668169,54048859,train,datasets/mimic/files/p12/p12668169/s54048859.txt,FINAL REPORT\...,s54048859_8a4aaaee-55fcf98f-a036a8e7-da71eed1-...
4,p10/p10309415/s58144222/9886b0fe-9121c65e-c8d7...,0,0,3,10309415,58144222,train,datasets/mimic/files/p10/p10309415/s58144222.txt,FINAL REPORT\...,s58144222_9886b0fe-9121c65e-c8d74649-4b88c530-...
...,...,...,...,...,...,...,...,...,...,...
153123,p14/p14476373/s53343726/f231fe18-30e5023f-617d...,0,0,0,14476373,53343726,val,datasets/mimic/files/p14/p14476373/s53343726.txt,FINAL REPORT\...,s53343726_f231fe18-30e5023f-617d5710-b7343694-...
153124,p12/p12491157/s54173393/6aa095e2-8ec1eeae-432f...,0,0,3,12491157,54173393,val,datasets/mimic/files/p12/p12491157/s54173393.txt,FINAL REPORT\...,s54173393_6aa095e2-8ec1eeae-432fbe0a-951014ba-...
153125,p14/p14036332/s52691805/f52e19e0-9569d75a-7c2e...,0,0,2,14036332,52691805,val,datasets/mimic/files/p14/p14036332/s52691805.txt,FINAL REPORT\...,s52691805_f52e19e0-9569d75a-7c2e1cca-588fe579-...
153126,p10/p10972527/s53691151/f4f75648-baff1e55-0086...,0,0,3,10972527,53691151,val,datasets/mimic/files/p10/p10972527/s53691151.txt,FINAL REPORT\...,s53691151_f4f75648-baff1e55-0086d06c-cf27d72e-...


## Data Preparation

In [6]:
# Column names for the text input, the image input and the prediction target.
text_columns = 'text'
image_columns = 'image_id'
label_columns = 'disease_label'

# Project helper; judging by the displayed `train_df`, it rewrites `image_id`
# into file paths under `images_path` — confirm in src.classifiers_base.
df = preprocess_df(df, image_columns, images_path)
# Cast the target to strings. The column is addressed through `label_columns`
# (not a hard-coded attribute) so the cast follows the configured target.
df[label_columns] = df[label_columns].astype(str)

# Split the data into train and test frames (the helper prints their shapes).
train_df, test_df = split_data(df)

# Process and one-hot encode labels for the training set. `mlb` and
# `train_columns` are reused for the test set and the datasets below
# (presumably a fitted binarizer and its output columns — verify in src.classifiers).
train_labels, mlb, train_columns = process_labels(train_df, col=label_columns)
# Encode the test labels against the training columns.
test_labels = process_labels(test_df, col=label_columns, train_columns=train_columns)

100%|██████████| 153128/153128 [00:01<00:00, 113065.62it/s]
100%|██████████| 153128/153128 [00:03<00:00, 46564.11it/s]


Train Shape: (107695, 10)
Test Shape: (30360, 10)


In [7]:
# Inspect the training split; `image_id` now shows paths under `datasets/mimic/images/`.
train_df

Unnamed: 0,path,race_label,sex_label,disease_label,subject_id,study_id,split,file_path,text,image_id
0,p19/p19702416/s51321189/d85c9f15-f0f84927-761f...,0,0,3,19702416,51321189,train,datasets/mimic/files/p19/p19702416/s51321189.txt,FINAL REPORT\...,datasets/mimic/images/s51321189_d85c9f15-f0f84...
1,p13/p13339704/s51292704/0024603b-12db30e2-ab32...,2,0,3,13339704,51292704,train,datasets/mimic/files/p13/p13339704/s51292704.txt,FINAL REPORT\...,datasets/mimic/images/s51292704_0024603b-12db3...
2,p13/p13339704/s51292704/7953848d-2411f0df-859f...,2,0,3,13339704,51292704,train,datasets/mimic/files/p13/p13339704/s51292704.txt,FINAL REPORT\...,datasets/mimic/images/s51292704_7953848d-2411f...
3,p12/p12668169/s54048859/8a4aaaee-55fcf98f-a036...,0,0,3,12668169,54048859,train,datasets/mimic/files/p12/p12668169/s54048859.txt,FINAL REPORT\...,datasets/mimic/images/s54048859_8a4aaaee-55fcf...
4,p10/p10309415/s58144222/9886b0fe-9121c65e-c8d7...,0,0,3,10309415,58144222,train,datasets/mimic/files/p10/p10309415/s58144222.txt,FINAL REPORT\...,datasets/mimic/images/s58144222_9886b0fe-9121c...
...,...,...,...,...,...,...,...,...,...,...
107690,p17/p17675016/s51152932/3c4ef22b-6452df42-fe88...,0,0,3,17675016,51152932,train,datasets/mimic/files/p17/p17675016/s51152932.txt,FINAL REPORT\...,datasets/mimic/images/s51152932_3c4ef22b-6452d...
107691,p11/p11932181/s53058995/a6bfecfe-281e20c1-3d9a...,2,1,0,11932181,53058995,train,datasets/mimic/files/p11/p11932181/s53058995.txt,FINAL REPORT\...,datasets/mimic/images/s53058995_a6bfecfe-281e2...
107692,p14/p14987339/s56635090/8ff6567d-509914b7-cc32...,2,0,0,14987339,56635090,train,datasets/mimic/files/p14/p14987339/s56635090.txt,FINAL REPORT\...,datasets/mimic/images/s56635090_8ff6567d-50991...
107693,p19/p19647041/s54660985/915784ab-a79bebe9-9f18...,1,0,3,19647041,54660985,train,datasets/mimic/files/p19/p19647041/s54660985.txt,FINAL REPORT\...,datasets/mimic/images/s54660985_915784ab-a79be...


In [8]:
# Instantiate the pretrained `bert-base-uncased` tokenizer; it is passed to
# both VQADataset instances below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
# Arguments shared by both datasets: column names, label encoder state and tokenizer.
dataset_args = (text_columns, image_columns, label_columns, mlb, train_columns, tokenizer)
train_dataset = VQADataset(train_df, *dataset_args)
test_dataset = VQADataset(test_df, *dataset_args)

# Loader settings common to both splits; only the training loader shuffles.
loader_kwargs = dict(batch_size=64, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

### Models

In [10]:
# Output size: 1 for the 'DR_2' target, otherwise the number of distinct
# label values found in the training split.
output_size = 1 if label_columns == 'DR_2' else len(pd.unique(train_df[label_columns]))
multilabel = False

In [11]:
# Print memory usage for the train/test loaders and for the model, for both
# the early- and late-fusion variants (see the output below).
calculate_memory(train_loader, test_loader, output_size)

Early fusion:
Average Memory per Batch in Train: 36.75 MB
Total Memory Usage per Epoch Train: 61842.13 MB (excluding model parameters)
Test:
Average Memory per Batch in Test: 10.36 MB
Total Memory Usage per Epoch Test: 4920.40 MB (excluding model parameters)
Model: 
Model Memory Usage: 747.94 MB

Late fusion:
Average Memory per Batch in Train: 36.75 MB
Total Memory Usage per Epoch Train: 61842.13 MB (excluding model parameters)
Test:
Average Memory per Batch in Test: 10.36 MB
Total Memory Usage per Epoch Test: 4920.40 MB (excluding model parameters)
Model: 
Model Memory Usage: 747.57 MB
