### MIMIC CXR

* [Dataset](https://physionet.org/content/mimic-cxr/2.0.0/)

* [Original Paper](https://www.nature.com/articles/s41597-019-0322-0)

In [1]:
# Step up one directory level; presumably so the relative `src` imports and data
# paths used in later cells resolve from the project root -- TODO confirm.
%cd ..

/home/datascience/Data Fusion


### Setup Environment:

In [2]:
import os
import pandas as pd

from src.classifiers import preprocess_data, process_labels,split_data

from src.classifiers import VQADataset
from torch.utils.data import DataLoader

from src.classifiers_cpu_metrics import calculate_memory

In [3]:
# Directory and file name of the embeddings CSV that the notebook reads.
PATH, FILE = 'Embeddings_vlm/mimic/', 'embeddings_clip.csv'

# Full path handed to pandas when the data is read.
FILE_PATH = os.path.join(PATH, FILE)

## Get data

In [4]:
# Read the embeddings table, store the disease label as strings, and discard
# the 'text' and 'image_id' columns -- all in one chain, so re-running the
# cell always rebuilds `df` from the file.
df = (
    pd.read_csv(FILE_PATH)
    .assign(disease_label=lambda frame: frame.disease_label.astype(str))
    .drop(columns=['text', 'image_id'])
)
df.head()

Unnamed: 0,path,race_label,sex_label,disease_label,subject_id,study_id,split,file_path,image_embedding_0,image_embedding_1,...,text_embedding_502,text_embedding_503,text_embedding_504,text_embedding_505,text_embedding_506,text_embedding_507,text_embedding_508,text_embedding_509,text_embedding_510,text_embedding_511
0,p19/p19702416/s51321189/d85c9f15-f0f84927-761f...,0,0,3,19702416,51321189,train,datasets/mimic/files/p19/p19702416/s51321189.txt,0.017584,-0.003019,...,-0.052364,-0.018894,0.101141,0.003454,-0.030474,0.01375,-0.007859,0.061135,0.050736,-0.005674
1,p13/p13339704/s51292704/0024603b-12db30e2-ab32...,2,0,3,13339704,51292704,train,datasets/mimic/files/p13/p13339704/s51292704.txt,0.025167,-0.008093,...,-0.045582,-0.014851,0.052466,-0.013415,-0.023199,0.001723,0.004433,0.036226,0.035885,-0.000904
2,p13/p13339704/s51292704/7953848d-2411f0df-859f...,2,0,3,13339704,51292704,train,datasets/mimic/files/p13/p13339704/s51292704.txt,0.027844,-0.001912,...,-0.045582,-0.014851,0.052466,-0.013415,-0.023199,0.001723,0.004433,0.036226,0.035885,-0.000904
3,p12/p12668169/s54048859/8a4aaaee-55fcf98f-a036...,0,0,3,12668169,54048859,train,datasets/mimic/files/p12/p12668169/s54048859.txt,0.041953,0.002593,...,-0.031608,-0.018585,0.02058,0.012898,0.000496,0.035545,0.023832,-0.059201,0.019583,-0.003538
4,p10/p10309415/s58144222/9886b0fe-9121c65e-c8d7...,0,0,3,10309415,58144222,train,datasets/mimic/files/p10/p10309415/s58144222.txt,0.012712,0.00645,...,-0.038537,-0.009317,0.057783,-0.01001,-0.025244,0.002658,0.010754,0.027053,0.024629,0.007983


## Data Preparation

In [5]:
# Partition the full frame into train and test frames.
train_df, test_df = split_data(df)

# Feature columns are chosen by substring match on the column name;
# the label is a single column.
text_columns = [name for name in df.columns if 'text' in name]
image_columns = [name for name in df.columns if 'image' in name]
label_columns = 'disease_label'

# Encode the labels. The training call also returns the encoder (`mlb`) and the
# training label columns, which are passed to the test call and to the datasets.
train_labels, mlb, train_columns = process_labels(train_df, col=label_columns)
test_labels = process_labels(test_df, col=label_columns, train_columns=train_columns)

# Build one dataset per split (train first, then test) with identical settings.
train_dataset, test_dataset = (
    VQADataset(frame, text_columns, image_columns, label_columns, mlb, train_columns)
    for frame in (train_df, test_df)
)

# Both loaders share batch size and worker count; only the training loader shuffles.
loader_options = {'batch_size': 64, 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

Train Shape: (107695, 1032)
Test Shape: (30360, 1032)


### Models

In [6]:
# Input widths equal the number of selected text / image feature columns.
text_input_size, image_input_size = len(text_columns), len(image_columns)

# A 'DR_2' label column gets a single output unit; any other label column gets
# one output per distinct value found in the training labels.
output_size = 1 if label_columns == 'DR_2' else len(pd.unique(train_df[label_columns]))
multilabel = False

In [7]:
# Report memory usage for the train/test loaders and the models sized by the
# given input/output dimensions (the printed output covers early and late fusion).
calculate_memory(train_loader, test_loader, text_input_size, image_input_size, output_size)

Early fusion:
Average Memory per Batch in Train: 0.25 MB
Total Memory Usage per Epoch Train: 422.33 MB (excluding model parameters)
Test:
Average Memory per Batch in Test: 0.07 MB
Total Memory Usage per Epoch Test: 33.60 MB (excluding model parameters)
Model: 
Model Memory Usage: 0.50 MB

Late fusion:
Average Memory per Batch in Train: 0.25 MB
Total Memory Usage per Epoch Train: 422.33 MB (excluding model parameters)
Test:
Average Memory per Batch in Test: 0.07 MB
Total Memory Usage per Epoch Test: 33.60 MB (excluding model parameters)
Model: 
Model Memory Usage: 0.25 MB
