Project DeepHealth, UC5 "Deep Image Annotation"

Franco Alberto Cardillo, ILC-CNR (UNITO) 
<francoalberto.cardillo@ilc.cnr.it>

<font color="yellow">SET UP experimentations with MIMIC-CXR</font>

In [16]:

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from numpy import count_nonzero as nnz
import os
from posixpath import join
from IPython.display import display
from IPython.display import Image
import pickle
from nltk import sent_tokenize
import yaml

from utils.vocabulary import Vocabulary
from utils.text_collation import collate_fn_one_s, collate_fn_n_sents
#> PATHS
# folders with images
# jpg_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0"  # originals
jpg_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized"  # resize to 300x300
dcm_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-dcm/physionet.org/files/mimic-cxr/2.0.0"

# metadata (report-based)
filename_metadata = join(jpg_fld, "mimic-cxr-2.0.0-metadata.csv")
# labels (image-based)
filename_chexpert = join(jpg_fld, "mimic-cxr-2.0.0-chexpert.csv") #labels
meta_fld = "/mnt/datasets/mimic-cxr/meta"

# pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# aux functions
def save_label_indexes(df, out_fld):
    cols = df.columns
    lab2idx = {}
    idx2lab = {}
    for i, c in enumerate(cols):
        lab2idx[c] = i
        idx2lab[i] = c
        print(f"{i}) {c}")
    with open(join(out_fld, "label2idx.yaml"), "w") as fout:
        yaml.safe_dump(lab2idx, fout)
    print(f"saved {join(out_fld, 'label2idx.yaml')}")
    print(f"lab2idx with {len(lab2idx)} labels")
    with open(join(out_fld, "idx2label.yaml"), "w") as fout:
        yaml.safe_dump(idx2lab, fout)
    print(f"saved {join(out_fld, 'idx2label.yaml')}")
#<

def encode_text(sentences, vocab):
    word_indexes = []
    for sent in sentences.split("."):
        tokens = sent.strip().split()
        enc_sent = []
        for t in tokens:
            enc_sent.append(vocab.word2idx.get(t, Vocabulary.OOV))
        word_indexes.append(enc_sent)

    return word_indexes
#< encode_text

def build_img_text_ds(ds, image_fld=None):
    rep_ids = []
    image_filenames = []
    texts = []
    for row in ds.reset_index().itertuples():
        for fn in row.image_filename:
            rep_ids.append(row.id)
            if image_fld is not None:
                image_filenames.append(join(image_fld, fn))
            image_filenames.append(join(image_fld, fn))  # already
            texts.append(row.text)
    img_text_ds = pd.DataFrame()
    img_text_ds["id"] = rep_ids
    img_text_ds["image_filename"] = image_filenames
    img_text_ds["text"] = texts
    pd.columns = ["id", "image_filename", "text"]
    # display(img_text_ds.head())



In [19]:
# standard exp -- FIX FOR IDX 2 LABEL



# create folder for the output, output files will be saved here
EXP_FLD = "/mnt/datasets/uc5/EXPS/mimic/std_exp_resized_2"
os.makedirs(EXP_FLD, exist_ok=True)
print("created exp folder:", EXP_FLD)
#<

# ---
# img_ds: img_filenames (relative) to 1-hot encoded labels
img_ds = pd.read_pickle(join(meta_fld, "img_dataset.pkl"))
# read dataset: all other info, including text
ds = pd.read_pickle(join(meta_fld, "mimic_ds.pkl"))

# print("IMG_DS")
# display(img_ds.head().T)
# print("MIMIC_DS")
# display(ds.head().T)
merged = img_ds.merge(ds, left_on="filename", right_on="path", how="left")
# print("MERGED")
# display(merged.head().T)

assert merged.shape[0] == img_ds.shape[0], "WRONG FINAL DIM"

# ! 1.
# ! IMAGE DATASET
# make paths absolute
img_ds.reset_index(inplace=True, drop=False)
img_ds["filename"] = img_ds["filename"].apply(lambda x: join(jpg_fld, x))
img_ds.set_index("filename")
img_ds.to_pickle(join(EXP_FLD, "img_dataset.pkl"))
print("saved IMG_DS")
display(img_ds.head().T)

# ! idx2label, label2idx
labels = img_ds.columns
idx2label = {}
label2idx = {}
for i, l in enumerate(labels):
    idx2label[i] = l
    label2idx[l] = i

with open(join(EXP_FLD, "label2idx.yaml"), "w") as fout:
        yaml.safe_dump(label2idx, fout)
print(f"saved {join(EXP_FLD, 'label2idx.yaml')}")
print(f"lab2idx with {len(label2idx)} labels")
with open(join(EXP_FLD, "idx2label.yaml"), "w") as fout:
    yaml.safe_dump(idx2label, fout)
print(f"saved {join(EXP_FLD, 'idx2label.yaml')}")


# ! 2.
# ! IMG TEXT DATASET
keep_cols = ["study_id", "path", "split", "text"]
merged.drop(columns=[c for c in merged.columns if c not in keep_cols], inplace=True)
merged.columns = ["id", "split", "image_filename", "text"]
print("MERGED (reduced)")
display(merged.head().T)

merged["text_len"] = merged.text.apply(lambda s: len(s))
print(merged.text_len.value_counts())

len1 = merged.text_len == 1
print("EMPTY ROWS:", nnz(len1))


# ! encode text
# read vocabulary
VOCAB_FN = join(meta_fld, "vocab_3000.pkl")
with open(VOCAB_FN, "rb") as fin:
    vocab = pickle.load(fin)
# save in exp folder
with open(join(EXP_FLD, "vocab.pkl"), "wb") as fout:
    pickle.dump(vocab, fout)
print(f"num words: {len(vocab.word2idx)}, including special tokens")
for i in range(5):
    print(f"word {i}: {vocab.idx2word[i]}")
print("VOCAB SAVED in exp fld:", EXP_FLD)

print("encoding ...")
enc_text = merged.text.apply(encode_text, args=(vocab,))
merged["enc_text"] = enc_text
print('encoded')
# display(merged.head().T)
max_tokens = 12
merged["target_text"] = enc_text.apply(lambda enc: collate_fn_one_s(enc, max_tokens=max_tokens))
print("COLLATED")
# absolute paths
merged["image_filename"] = merged["image_filename"].apply(lambda x: join(jpg_fld, x))

img_text_ds = merged[["image_filename", "id", "text", "enc_text", "target_text"]].copy(deep=True)
# img_text_ds.set_index("image_filename", inplace=True)
img_text_ds.to_pickle(join(EXP_FLD, "img_text_dataset.pkl"))
print("IMG_TEXT_DS")
display(img_text_ds.head().T)

print(f"img_text_ds saved: {join(EXP_FLD, 'img_text_dataset.pkl')}")


merged.loc[merged.split == "validate", "split"] = "valid"

# now prepare ecvl dataset

# # 3 training iterations

# # splits
# print( ds.split.value_counts() )

# ds.rename(columns = {"path":"filename"}, inplace=True)
# print( ds.split.value_counts() )

# df_split = ds[["path", "split"]]
# df_split.rename(columns = {"path":"filename"}, inplace=True)

# n_trainings = 3

# for i in range(n_trainings):
#     df_split.to_pickle(join(EXP_FLD, f"split_{i}.pkl"))
    
#     print(f"saved: {join(EXP_FLD, f'split_{i}.pkl')}")

# build_ecvl_dataset(img_ds, img_text_ds):
print(f"all done, exp data available in: {EXP_FLD}")


created exp folder: /mnt/datasets/uc5/EXPS/mimic/std_exp_resized_2
saved IMG_DS


Unnamed: 0,0,1,2,3,4
filename,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...
Atelectasis,0.0,0.0,0.0,0.0,0.0
Cardiomegaly,0.0,0.0,0.0,0.0,0.0
Consolidation,0.0,0.0,0.0,0.0,0.0
Edema,0.0,0.0,0.0,0.0,0.0
Enlarged Cardiomediastinum,0.0,0.0,0.0,0.0,0.0
Fracture,0.0,0.0,0.0,0.0,0.0
Lung Lesion,0.0,0.0,0.0,0.0,0.0
Lung Opacity,0.0,0.0,0.0,0.0,0.0
No Finding,1.0,1.0,1.0,1.0,1.0


saved /mnt/datasets/uc5/EXPS/mimic/std_exp_resized_2/label2idx.yaml
lab2idx with 15 labels
saved /mnt/datasets/uc5/EXPS/mimic/std_exp_resized_2/idx2label.yaml
MERGED (reduced)


Unnamed: 0,0,1,2,3,4
id,50414267,50414267,53189527,53189527,53911762
split,train,train,train,train,train
image_filename,files/p10/p10000032/s50414267/02aa804e-bde0afd...,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,files/p10/p10000032/s53189527/e084de3b-be89b11...,files/p10/p10000032/s53911762/68b5c4b1-227d048...
text,there is no focal consolidation pleural effusi...,there is no focal consolidation pleural effusi...,the cardiac mediastinal and hilar contours are...,the cardiac mediastinal and hilar contours are...,single frontal view of the chest provided. the...


266     3269
247     2901
185     2104
232     1171
274     1164
        ... 
1724       1
1353       1
1656       1
1766       1
1402       1
Name: text_len, Length: 1586, dtype: int64
EMPTY ROWS: 0
num words: 3004, including special tokens
word 0: <pad>
word 1: <oov>
word 2: <bos>
word 3: <eos>
word 4: the
VOCAB SAVED in exp fld: /mnt/datasets/uc5/EXPS/mimic/std_exp_resized_2
encoding ...
encoded
COLLATED
IMG_TEXT_DS


Unnamed: 0,0,1,2,3,4
image_filename,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...,/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physione...
id,50414267,50414267,53189527,53189527,53911762
text,there is no focal consolidation pleural effusi...,there is no focal consolidation pleural effusi...,the cardiac mediastinal and hilar contours are...,the cardiac mediastinal and hilar contours are...,single frontal view of the chest provided. the...
enc_text,"[[10, 5, 6, 30, 81, 12, 22, 17, 41], [68, 418,...","[[10, 5, 6, 30, 81, 12, 22, 17, 41], [68, 418,...","[[4, 44, 28, 8, 45, 37, 9, 40], [18, 152, 5, 4...","[[4, 44, 28, 8, 45, 37, 9, 40], [18, 152, 5, 4...","[[400, 120, 205, 7, 4, 24, 840], [10, 5, 6, 30..."
target_text,"[2, 10, 5, 6, 30, 81, 12, 22, 17, 41, 3, 0]","[2, 10, 5, 6, 30, 81, 12, 22, 17, 41, 3, 0]","[2, 4, 44, 28, 8, 45, 37, 9, 40, 3, 0, 0]","[2, 4, 44, 28, 8, 45, 37, 9, 40, 3, 0, 0]","[2, 400, 120, 205, 7, 4, 24, 840, 3, 0, 0, 0]"


img_text_ds saved: /mnt/datasets/uc5/EXPS/mimic/std_exp_resized_2/img_text_dataset.pkl
all done, exp data available in: /mnt/datasets/uc5/EXPS/mimic/std_exp_resized_2


In [20]:
def build_ecvl_dataset(img_ds, img_text_ds, name="na", description="na"):
    img_ds = img_ds.set_index("filename")
    pd.set_option('display.max_colwidth', None)
    display(img_ds)
    labels = list(range(img_ds.shape[1]))
    print("N CLASSES:", len(labels))
    
    d = {
        "name"        : name,
        "description" : description,
        "classes"     : labels, 
        "images"      : None,
        "split"       : None
    }
    imgs = []
    splits = ["train", "valid", "test"]
    counter = {}
    for s in splits:
        split_ds = img_text_ds[img_text_ds.split == s]
        n_split_samples = split_ds.shape[0]
        counter[s] = n_split_samples
        print(f"SPLIT {s}: samples {n_split_samples}")

        for row in split_ds.itertuples():
            filename = row.image_filename
            values = img_ds.loc[filename]
            classes = []
            for class_idx, v in enumerate(values):
                if v == 1:
                    classes.append(class_idx)
            imgs.append({
                "location": filename,
                "label": classes
            })
    d["images"] = imgs
    d["split"] = dict(training=list(range(counter["train"])),
                    validation = list(range(counter["train"], counter["train"] + counter["valid"])),
                    test = list(range(counter["train"] + counter["valid"], counter["train"] + counter["valid"] + counter["test"])))
    return d

ecvl_ds = build_ecvl_dataset(img_ds, merged)
os.makedirs(join(EXP_FLD, "run_0"), exist_ok=True)
ecvl_filename = join(EXP_FLD, "run_0", "ecvl_ds.yml")
print("saving ecvl dataset in yaml format...")
with open( ecvl_filename, "w") as fout:
            yaml.safe_dump(ecvl_ds, fout, default_flow_style=None)
print("saved:", ecvl_filename)


Unnamed: 0_level_0,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized/files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized/files/p10/p10000032/s53189527/2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized/files/p10/p10000032/s53189527/e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized/files/p10/p10000032/s53911762/68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized/files/p19/p19999733/s57132437/428e2c18-5721d8f3-35a05001-36f3d080-9053b83c.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized/files/p19/p19999733/s57132437/58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized/files/p19/p19999987/s55368167/58766883-376a15ce-3b323a28-6af950a0-16b793bd.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0_resized/files/p19/p19999987/s58621812/7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


N CLASSES: 14
SPLIT train: samples 354617
SPLIT valid: samples 2866
SPLIT test: samples 4710
saving ecvl dataset in yaml format...
saved: /mnt/datasets/uc5/EXPS/mimic/std_exp_resized_2/run_0/ecvl_ds.yml


In [None]:
# check files


<font color="yellow">NOTICE: cells below are taken from the pytorch pipeline and cannot be used with EDDL without modifications.</font>

In [None]:
# ! WARNING
# ! OLD CODE: for pytorch-based pipelines
#  EXPERIMENT - NORMAL vs REST (unbalanced)

EXP_FLD = "/opt/uc5/results/sicaai/mimic/normal_vs_rest_unbal"
os.makedirs(EXP_FLD, exist_ok=True)
print("created exp folder:", EXP_FLD)

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from numpy import count_nonzero as nnz
import os
from posixpath import join
from IPython.display import display
from IPython.display import Image
import pickle
from nltk import sent_tokenize
import yaml

from utils.vocabulary import Vocabulary

# folders with images
jpg_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0"
dcm_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-dcm/physionet.org/files/mimic-cxr/2.0.0"

# metadata (report-based)
filename_metadata = join(jpg_fld, "mimic-cxr-2.0.0-metadata.csv")
# labels (image-based)
filename_chexpert = join(jpg_fld, "mimic-cxr-2.0.0-chexpert.csv") #labels
# output files saved here
meta_fld = "/mnt/datasets/mimic-cxr/meta"



# ! process labels
img_ds = pd.read_pickle(join(meta_fld, "img_dataset.pkl"))
display(img_ds.head())
normal_col = "No Finding"

normal_idx = img_ds[normal_col] == 1
others_idx = img_ds[normal_col] == 0
print("normal:", nnz(normal_idx))
print("others:", nnz(others_idx))

img_ds.drop(columns=[c for c in img_ds.columns if c not in [normal_col, "filename"]], inplace=True)
img_ds["others"] = 0
img_ds.loc[others_idx, "others"] = 1
print("normal:", nnz(img_ds[normal_col] == 1))
print("others:", nnz(img_ds["others"] == 1))

img_ds.to_pickle(join(EXP_FLD, "img_dataset.pkl"))

# ! label to indexes
def save_label_indexes(df, out_fld):
    cols = df.columns
    lab2idx = {}
    idx2lab = {}
    for i, c in enumerate(cols):
        lab2idx[c] = i
        idx2lab[i] = c
        print(f"{i}) {c}")
    with open(join(out_fld, "label2idx.yaml"), "w") as fout:
        yaml.safe_dump(lab2idx, fout)
    print(f"saved {join(out_fld, 'label2idx.yaml')}")
    print(f"lab2idx with {len(lab2idx)} labels")
    with open(join(out_fld, "idx2label.yaml"), "w") as fout:
        yaml.safe_dump(idx2lab, fout)
    print(f"saved {join(out_fld, 'idx2label.yaml')}")
#<
save_label_indexes(img_ds, EXP_FLD)

# ! encode text
# read vocabulary
VOCAB_FN = join(meta_fld, "vocab_3000.pkl")
with open(VOCAB_FN, "rb") as fin:
    vocab = pickle.load(fin)
# save in exp folder
with open(join(EXP_FLD, "vocab.pkl"), "wb") as fout:
    pickle.dump(vocab, fout)
print(f"num words: {len(vocab.word2idx)}, including special tokens")

for i in range(5):
    print(f"word {i}: {vocab.idx2word[i]}")


# read dataset
ds = pd.read_pickle(join(meta_fld, "mimic_ds.pkl"))
display(ds.head().T)

#> ENCODING
def encode_text(sentences, vocab):
    word_indexes = []
    for sent in sentences.split("."):
        tokens = sent.strip().split()
        enc_sent = []
        for t in tokens:
            enc_sent.append(vocab.word2idx.get(t, Vocabulary.OOV))
        word_indexes.append(enc_sent)

    return word_indexes

print("encoding ...")
enc_text = ds.text.apply(encode_text, args=(vocab,))
ds["enc_text"] = enc_text
print('encoded')

display(ds.head().T)

text_ds = ds[["study_id", "path", "text", "enc_text"]]
text_ds = text_ds.rename(columns= {"study_id":"id", "path":"image_filename"})
display(text_ds.head().T)

text_ds.to_pickle(join(EXP_FLD, "img_text_dataset.pkl"))
#<


# 3 training iterations

# splits
print( ds.split.value_counts() )

ds.loc[ds.split == "validate", "split"] = "valid"
ds.rename(columns = {"path":"filename"}, inplace=True)
print( ds.split.value_counts() )

df_split = ds[["filename", "split"]]

n_iter = 3

for i in range(n_iter):
    df_split.to_pickle(join(EXP_FLD, f"split_{i}.pkl"))
    print(f"SPLIT {i}")
    display(df_split.head())
    df_split.filename
    print(f"saved: {join(EXP_FLD, f'split_{i}.pkl')}")

print(f"all done, exp data available in: {EXP_FLD}")


In [None]:
#  EXPERIMENT - NORMAL vs REST (balanced)

EXP_FLD = "/opt/uc5/results/sicaai/mimic/normal_vs_rest_bal"
os.makedirs(EXP_FLD, exist_ok=True)
print("created exp folder:", EXP_FLD)

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from numpy import count_nonzero as nnz
import os
from posixpath import join
from IPython.display import display
from IPython.display import Image
import pickle
from nltk import sent_tokenize
import yaml

from utils.vocabulary import Vocabulary

# folders with images
jpg_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0"
dcm_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-dcm/physionet.org/files/mimic-cxr/2.0.0"

# metadata (report-based)
filename_metadata = join(jpg_fld, "mimic-cxr-2.0.0-metadata.csv")
# labels (image-based)
filename_chexpert = join(jpg_fld, "mimic-cxr-2.0.0-chexpert.csv") #labels
# output files saved here
meta_fld = "/mnt/datasets/mimic-cxr/meta"

# ! process labels
img_ds = pd.read_pickle(join(meta_fld, "img_dataset.pkl"))
# display(img_ds.head())
normal_col = "No Finding"

normal_idx = img_ds[normal_col] == 1
others_idx = img_ds[normal_col] == 0
print("orig dataset, normal:", nnz(normal_idx))
print("full dataset, others:", nnz(others_idx))
print("full dataset, shape:", img_ds.shape)

img_ds.drop(columns=[c for c in img_ds.columns if c not in [normal_col, "filename"]], inplace=True)
img_ds["others"] = 0
img_ds.loc[others_idx, "others"] = 1

# save in exp folder
print("IMG_DS SAVED IN EXP:")
display(img_ds.head())
img_ds.to_pickle(join(EXP_FLD, "img_dataset.pkl"))

# ds needed for the column "split"
ds = pd.read_pickle(join(meta_fld, "mimic_ds.pkl"))

# join img_ds and ds
merged = img_ds.reset_index().merge(ds[["path", "split"]], left_on="filename", right_on="path", how="left")
merged.drop(columns=["path"], inplace=True)

n_iters = 3
for iter in range(n_iters):
    dfs = []
    for g in merged.groupby(["split"]):
        # print(g)
        sub = g[1]  # dataframe
        n_normals = int(sub[normal_col].sum())
        print(f"partition {g[0]}, n_normals: {n_normals}")
        sub = sub.groupby(normal_col).sample(n_normals)
        print(f"{g[0]}, n_normals: {sub[normal_col].sum()}")
        print(f"{g[0]}, n_others: {sub['others'].sum()}")
        print()
        dfs.append(sub)
        #n_normals = nnz(g[normal_col] == 1)
        #print(n_normals)
    data = pd.concat(dfs, axis=0)
    print("SPLIT:", iter)
    print("normal:", data[normal_col].sum())
    print("others:", data["others"].sum())
    print("shape:", data.shape)
    data.loc[data.split=="validate", "split"] = "valid"
    display(data)
    save_data = data[["filename", "split"]]
    print("DATAFRAME SPLIT:")
    display(save_data.head())
    save_data.to_pickle(join(EXP_FLD, f"split_{iter}.pkl"))

# ! label to indexes
def save_label_indexes(df, out_fld):
    cols = df.columns
    lab2idx = {}
    idx2lab = {}
    for i, c in enumerate(cols):
        lab2idx[c] = i
        idx2lab[i] = c
        print(f"{i}) {c}")
    with open(join(out_fld, "label2idx.yaml"), "w") as fout:
        yaml.safe_dump(lab2idx, fout)
    print(f"saved {join(out_fld, 'label2idx.yaml')}")
    print(f"lab2idx with {len(lab2idx)} labels")
    with open(join(out_fld, "idx2label.yaml"), "w") as fout:
        yaml.safe_dump(idx2lab, fout)
    print(f"saved {join(out_fld, 'idx2label.yaml')}")
#<
save_label_indexes(img_ds, EXP_FLD)

# ! encode text
# read vocabulary
VOCAB_FN = join(meta_fld, "vocab_3000.pkl")
with open(VOCAB_FN, "rb") as fin:
    vocab = pickle.load(fin)
# save in exp folder
with open(join(EXP_FLD, "vocab.pkl"), "wb") as fout:
    pickle.dump(vocab, fout)
print(f"num words: {len(vocab.word2idx)}, including special tokens")

for i in range(5):
    print(f"word {i}: {vocab.idx2word[i]}")


# read dataset
ds = pd.read_pickle(join(meta_fld, "mimic_ds.pkl"))
display(ds.head().T)

#> ENCODING
def encode_text(sentences, vocab):
    word_indexes = []
    for sent in sentences.split("."):
        tokens = sent.strip().split()
        enc_sent = []
        for t in tokens:
            enc_sent.append(vocab.word2idx.get(t, Vocabulary.OOV))
        word_indexes.append(enc_sent)

    return word_indexes

print("encoding ...")
enc_text = ds.text.apply(encode_text, args=(vocab,))
ds["enc_text"] = enc_text
print('encoded')

display(ds.head().T)

text_ds = ds[["study_id", "path", "text", "enc_text"]]
text_ds = text_ds.rename(columns= {"study_id":"id", "path":"image_filename"})
display(text_ds.head().T)

text_ds.to_pickle(join(EXP_FLD, "img_text_dataset.pkl"))
#<


# # 3 training iterations

# # splits
# print( ds.split.value_counts() )

# ds.loc[ds.split == "validate", "split"] = "valid"
# ds.rename(columns = {"path":"filename"}, inplace=True)
# print( ds.split.value_counts() )

# df_split = ds[["filename", "split"]]

# n_iter = 1

# for i in range(n_iter):
#     df_split.to_pickle(join(EXP_FLD, f"split_{i}.pkl"))
#     print(f"SPLIT {i}")
#     display(df_split.head())
#     print(f"saved: {join(EXP_FLD, f'split_{i}.pkl')}")

print(f"all done, exp data available in: {EXP_FLD}")
