### MIMIC-CXR

* [Dataset](https://physionet.org/content/mimic-cxr/2.0.0/)

* [Original Paper](https://www.nature.com/articles/s41597-019-0322-0)

In [2]:
# Move the kernel's working directory up one level; the output below shows it
# landing in the repository root. Presumably needed so the `src` package import
# and the relative `Embeddings/mimic/` path resolve — confirm against repo layout.
# NOTE(review): the move is relative, so re-running this cell without a kernel
# restart climbs one more directory each time.
%cd ..

/home/datascience/DF-DM


### Set up the environment

In [3]:
import os
import pandas as pd

from src.classifiers import preprocess_data, process_labels,split_data

from src.classifiers import VQADataset
from torch.utils.data import DataLoader

from src.classifiers import train_early_fusion, train_late_fusion

In [4]:
# Name of the embeddings column (the text table loaded below has a column with
# this name). Not referenced in this section — presumably used by later cells.
COLUMN = 'embeddings'
# Folder holding the embedding CSV files, relative to the current working
# directory.
PATH = 'Embeddings/mimic/'

In [5]:
# File names of the two embedding tables inside PATH. They are spelled out
# explicitly instead of being taken from os.listdir(PATH), whose ordering is
# arbitrary.
text_path = 'text_embeddings.csv'
images_path = 'Embeddings_dinov2_base.csv'

## Get data

### Text

In [6]:
# Read the text-embedding table (PATH joined with text_path) into a DataFrame.
# NOTE(review): the rendered output has an `Unnamed: 0` column, which suggests
# the CSV was saved together with its index — confirm whether downstream code
# needs that column.
text = pd.read_csv(os.path.join(PATH, text_path))
# Bare expression: renders the (truncated) frame as the cell output.
text

Unnamed: 0,path,race_label,sex_label,disease_label,subject_id,study_id,split,file_path,image_id,embeddings
0,p19/p19702416/s51321189/d85c9f15-f0f84927-761f...,0,0,3,19702416,51321189,train,datasets/mimic/files/p19/p19702416/s51321189.txt,s51321189_d85c9f15-f0f84927-761f30e0-51c2d319-...,"[0.019312300477283984, -0.0168451968233965, 0...."
1,p13/p13339704/s51292704/0024603b-12db30e2-ab32...,2,0,3,13339704,51292704,train,datasets/mimic/files/p13/p13339704/s51292704.txt,s51292704_0024603b-12db30e2-ab32c9cb-dae5a3fc-...,"[0.012197528503835167, -0.02721826380867521, -..."
2,p13/p13339704/s51292704/7953848d-2411f0df-859f...,2,0,3,13339704,51292704,train,datasets/mimic/files/p13/p13339704/s51292704.txt,s51292704_7953848d-2411f0df-859f5cea-38c618e0-...,"[0.012197528503835167, -0.02721826380867521, -..."
3,p12/p12668169/s54048859/8a4aaaee-55fcf98f-a036...,0,0,3,12668169,54048859,train,datasets/mimic/files/p12/p12668169/s54048859.txt,s54048859_8a4aaaee-55fcf98f-a036a8e7-da71eed1-...,"[0.023151464746335323, -0.003735494237665375, ..."
4,p10/p10309415/s58144222/9886b0fe-9121c65e-c8d7...,0,0,3,10309415,58144222,train,datasets/mimic/files/p10/p10309415/s58144222.txt,s58144222_9886b0fe-9121c65e-c8d74649-4b88c530-...,"[0.022131832248893236, -0.02182131683603928, 0..."
...,...,...,...,...,...,...,...,...,...,...
153123,p14/p14476373/s53343726/f231fe18-30e5023f-617d...,0,0,0,14476373,53343726,val,datasets/mimic/files/p14/p14476373/s53343726.txt,s53343726_f231fe18-30e5023f-617d5710-b7343694-...,"[0.036062664090444306, -0.024110380535552218, ..."
153124,p12/p12491157/s54173393/6aa095e2-8ec1eeae-432f...,0,0,3,12491157,54173393,val,datasets/mimic/files/p12/p12491157/s54173393.txt,s54173393_6aa095e2-8ec1eeae-432fbe0a-951014ba-...,"[0.030613531621846417, -0.013087464989618098, ..."
153125,p14/p14036332/s52691805/f52e19e0-9569d75a-7c2e...,0,0,2,14036332,52691805,val,datasets/mimic/files/p14/p14036332/s52691805.txt,s52691805_f52e19e0-9569d75a-7c2e1cca-588fe579-...,"[0.022604879986884274, -0.008428466567363305, ..."
153126,p10/p10972527/s53691151/f4f75648-baff1e55-0086...,0,0,3,10972527,53691151,val,datasets/mimic/files/p10/p10972527/s53691151.txt,s53691151_f4f75648-baff1e55-0086d06c-cf27d72e-...,"[0.024008833012363645, -0.02083004001988949, -..."


### Images

In [7]:
# Read the image-embedding table (PATH joined with images_path) into a DataFrame.
# Per the rendered output, it has an `ImageName` column followed by numeric
# feature columns labelled 0..767, plus an `Unnamed: 0` column.
images = pd.read_csv(os.path.join(PATH, images_path))
# Preview only the first rows as the cell output.
images.head()

Unnamed: 0,ImageName,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,s57038960_db65d2eb-898d82af-87f59617-9ef8f1b8-...,-0.436693,-1.049584,0.640005,0.415092,-1.940182,0.858519,1.77797,-1.704433,-1.516895,...,-1.786172,-0.32148,-2.379034,-0.091928,1.205125,-2.092803,-1.849968,-2.751856,1.281025,-2.946824
1,s54536642_fe1a39db-b20ab2f2-6259a4a2-8c5a2638-...,-0.160901,-0.43344,-0.650693,1.273812,-2.661967,0.785982,2.1423,-1.868585,0.007601,...,-2.072898,-0.098359,-2.855795,1.093558,0.400163,-1.812585,-2.146883,-3.32922,1.434818,-2.328356
2,s51004296_bc027209-0c889cc5-b7ff43bc-11b44b52-...,-0.641195,0.156166,0.047452,0.138009,-1.724101,0.28234,2.119441,-1.537461,0.017383,...,-1.727623,-0.923027,-3.364305,0.637478,1.279478,-2.412542,-2.31173,-2.821277,1.974853,-3.839996
3,s56486389_00bbcb53-1b59b3a1-64b2ad2c-f7e4bc9b-...,-1.125658,0.159419,-0.995158,0.303959,-2.738186,0.452527,2.212985,-3.260901,-0.094809,...,-1.979997,0.104396,-1.797872,0.355599,1.269172,-2.556092,-1.792497,-2.726921,1.786852,-2.403912
4,s56962831_eadb5bb3-0ecb52be-a031b685-c1558abc-...,-1.296178,-0.460473,-1.792543,0.311563,-2.836921,-0.110482,2.21661,-2.794921,-0.308835,...,-2.2377,-0.129795,-2.576111,0.35597,0.700511,-3.887913,-0.871856,-3.350262,1.592613,-2.709266


### Merge and preprocess the datasets

In [None]:
# Combine the text and image embedding tables with the project helper
# `preprocess_data` (imported from src.classifiers). It receives the column
# names "image_id" (present in the text table) and "ImageName" (present in the
# image table) — presumably the join keys; confirm against the helper.
df = preprocess_data(text, images, "image_id", "ImageName")

# Store the disease label as strings. Bracket indexing is used for the
# assignment because attribute-style assignment (`df.col = ...`) can silently
# set a plain attribute instead of a column.
# NOTE: the str dtype only lives in memory; numeric-looking strings written to
# CSV are inferred as integers again on read unless a dtype is passed.
df["disease_label"] = df["disease_label"].astype(str)

# Persist the merged table next to the input files. index=False keeps the row
# index out of the file, so no extra `Unnamed: 0` column appears on reload.
df.to_csv(os.path.join(PATH, 'embeddings.csv'), index=False)

# Preview the first rows as the cell output.
df.head()