### BRSET

* [Dataset](https://physionet.org/content/brazilian-ophthalmological/1.0.0/)

* [Original Paper](https://www.medrxiv.org/content/10.1101/2024.01.23.24301660v1)

In [1]:
%cd ..

/home/datascience/Data Fusion


### Setup Environment:

In [2]:
import os
import pandas as pd

from src.classifiers import preprocess_data, process_labels,split_data

from src.classifiers import VQADataset
from torch.utils.data import DataLoader

from src.classifiers_cpu_metrics import calculate_memory

In [3]:
# Directory (relative to the working directory) holding the BRSET embedding CSVs.
PATH = "Embeddings/brset/"
# Name of the serialized-embedding column, as found in the text CSV.
COLUMN = "embeddings"

In [4]:
# os.listdir() returns entries in arbitrary order and may include hidden
# entries (e.g. .ipynb_checkpoints), so unpacking its result positionally can
# silently swap the two files or fail. Identify each file by its header
# instead: the text CSV carries a COLUMN column, the image CSV does not.
embedding_files = sorted(
    name for name in os.listdir(PATH)
    if not name.startswith('.') and os.path.isfile(os.path.join(PATH, name))
)
if len(embedding_files) != 2:
    raise ValueError(
        f"Expected exactly two embedding files in {PATH!r}, found {embedding_files}"
    )

# Read only the header row (nrows=0) so this check stays cheap on large files.
text_files = [
    name for name in embedding_files
    if COLUMN in pd.read_csv(os.path.join(PATH, name), nrows=0).columns
]
if len(text_files) != 1:
    raise ValueError(
        f"Expected exactly one file with a {COLUMN!r} column in {PATH!r}, found {text_files}"
    )

text_path = text_files[0]
images_path = next(name for name in embedding_files if name != text_path)

## Get data

### Text

In [5]:
# Load the text table: one row per image with its caption, DR labels,
# train/test split flag and serialized embedding.
text = pd.read_csv(
    os.path.join(PATH, text_path)
)
text

Unnamed: 0,image_id,DR_ICDR,text,DR_2,DR_3,split,embeddings
0,img00001,0,"An image from the right eye of a male patient,...",0,0,train,"[-0.23097647726535797, -0.6493059992790222, 0...."
1,img00002,0,"An image from the left eye of a male patient, ...",0,0,test,"[-0.18894515931606293, -0.5165128707885742, 0...."
2,img00003,0,An image from the right eye of a female patien...,0,0,train,"[-0.6901693344116211, -0.7104458212852478, 0.5..."
3,img00004,0,An image from the left eye of a female patient...,0,0,train,"[-0.7486071586608887, -0.5078404545783997, 0.6..."
4,img00005,0,"An image from the right eye of a male patient,...",0,0,test,"[-0.5759348273277283, -0.6498719453811646, 0.7..."
...,...,...,...,...,...,...,...
16261,img16262,1,"An image from the left eye of a male patient, ...",1,1,test,"[-0.15599964559078217, -0.5760699510574341, 0...."
16262,img16263,0,"An image from the right eye of a male patient,...",0,0,train,"[-0.6628211736679077, -0.7985941767692566, 0.8..."
16263,img16264,0,"An image from the left eye of a male patient, ...",0,0,test,"[-0.710575520992279, -0.6339892745018005, 0.91..."
16264,img16265,0,"An image from the right eye of a male patient,...",0,0,train,"[0.040444035083055496, -0.4349224269390106, 0...."


### Images

In [6]:
# Load the image table: one row per image file, one numeric column per
# embedding dimension.
images = pd.read_csv(
    os.path.join(PATH, images_path)
)
images.head()

Unnamed: 0,ImageName,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,img09798.jpg,-0.294193,0.390751,2.643405,0.073184,-2.890928,-1.097695,1.91523,-1.107189,0.594483,...,-1.713115,-2.014417,-2.686043,-1.798464,2.518672,0.784248,-1.113031,-2.76622,0.831086,-1.603637
1,img05580.jpg,-0.367986,0.880974,4.144483,0.404111,-2.362875,-2.483494,1.619341,-0.234326,0.339506,...,-0.264772,-1.551748,-1.696318,-2.658066,2.74664,1.296471,-0.03117,-1.991332,0.483615,-3.028327
2,img04048.jpg,-1.067095,0.447412,2.681838,-0.720095,-1.41848,-1.689153,1.293048,-0.19382,-0.901702,...,-0.383299,-2.128386,-1.460208,-1.774976,2.094109,1.960315,-1.126977,-2.039704,0.200136,-2.508668
3,img03601.jpg,0.261033,0.724737,4.302428,-0.413854,-2.100041,-1.586142,1.285109,-0.815749,-0.267234,...,-1.517937,-1.591721,-1.075221,-1.612588,2.007281,-0.389522,-0.608352,-1.712191,-0.994813,-2.560976
4,img03469.jpg,-0.669364,0.454145,1.622608,-0.313759,-0.694529,-1.980817,2.837111,0.497876,-0.563879,...,0.930517,-3.166204,-1.988941,-2.530482,1.69323,2.151035,-0.600033,-2.071379,-0.978301,-3.103351


### Merge and preprocess the datasets

In [7]:
# Combine the text and image tables (presumably joined on "image_id" /
# "ImageName" — confirm in src.classifiers.preprocess_data), then discard the
# raw caption column so only labels, split and embedding features remain.
df = preprocess_data(text, images, "image_id", "ImageName").drop(columns='text')
df.head()

Unnamed: 0,DR_ICDR,DR_2,DR_3,split,text_1,text_2,text_3,text_4,text_5,text_6,...,image_758,image_759,image_760,image_761,image_762,image_763,image_764,image_765,image_766,image_767
0,0,0,0,train,-0.230976,-0.649306,0.351964,4.827357,-0.652423,-1.548302,...,0.353415,-2.146708,-1.749919,-1.179385,1.85987,2.4566,-0.001963,-1.731499,0.635826,-2.96702
1,0,0,0,test,-0.188945,-0.516513,0.458152,4.739528,-0.524881,-1.532104,...,-0.18476,-0.907411,-2.038369,-1.02167,1.608807,1.900826,-0.817061,-1.813032,0.36919,-2.460825
2,0,0,0,train,-0.690169,-0.710446,0.562674,4.909813,-0.266062,-1.765767,...,-0.754568,-1.555292,-2.267442,-2.331255,2.976459,-0.276913,-0.815998,-0.84656,0.259764,-1.921576
3,0,0,0,train,-0.748607,-0.50784,0.673932,4.859914,-0.07118,-1.733303,...,-0.992413,-1.933622,-1.661158,-1.502459,2.131441,0.62016,-0.455107,-1.403641,0.61773,-2.241285
4,0,0,0,test,-0.575935,-0.649872,0.722284,5.00232,-0.277846,-1.737345,...,-0.895002,-1.292191,-1.858146,-1.470159,2.455127,1.181771,-0.773389,-1.061603,-0.579806,-2.56962


## Data Preparation

In [8]:
def _embedding_columns(columns, prefix):
    """Return the columns named '<prefix>_<integer>', in their original order.

    A bare substring test such as `'text' in column` would also select any
    other column whose name merely contains the word (for example a raw
    'text' caption column), which would then be fed to the model as a feature.
    """
    stem = f'{prefix}_'
    return [
        column for column in columns
        if isinstance(column, str)
        and column.startswith(stem)
        and column[len(stem):].isdigit()
    ]


# Split the data
train_df, test_df = split_data(df)

# Select features and labels vectors
text_columns = _embedding_columns(df.columns, 'text')    # text_{i}
image_columns = _embedding_columns(df.columns, 'image')  # image_{i}
label_columns = 'DR_2'


# Process and one-hot encode labels for training set, then reuse the
# training columns for the test set.
# NOTE(review): the first call unpacks three return values while the second
# binds a single name — confirm what process_labels returns when
# train_columns is supplied.
train_labels, mlb, train_columns = process_labels(train_df, col=label_columns)
test_labels = process_labels(test_df, col=label_columns, train_columns=train_columns)


# Wrap each partition in a dataset exposing text features, image features and labels.
train_dataset = VQADataset(train_df, text_columns, image_columns, label_columns, mlb, train_columns)
test_dataset = VQADataset(test_df, text_columns, image_columns, label_columns, mlb, train_columns)


# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

Train Shape: (13012, 4868)
Test Shape: (3254, 4868)


### Models

In [9]:
# Input widths equal the number of embedding columns of each modality.
text_input_size, image_input_size = len(text_columns), len(image_columns)
# The 'DR_2' target uses a single output unit; any other label column gets
# one unit per distinct value seen in the training split.
output_size = 1 if label_columns == 'DR_2' else len(pd.unique(train_df[label_columns]))
multilabel = False

In [10]:
# Report per-batch / per-epoch data memory and model memory for the early- and
# late-fusion setups (results are printed by the helper).
calculate_memory(
    train_loader,
    test_loader,
    text_input_size,
    image_input_size,
    output_size,
)

Early fusion:
Average Memory per Batch in Train: 1.18 MB
Total Memory Usage per Epoch Train: 241.48 MB (excluding model parameters)
Test:
Average Memory per Batch in Test: 0.30 MB
Total Memory Usage per Epoch Test: 15.10 MB (excluding model parameters)
Model: 
Model Memory Usage: 2.38 MB

Late fusion:
Average Memory per Batch in Train: 1.18 MB
Total Memory Usage per Epoch Train: 241.48 MB (excluding model parameters)
Test:
Average Memory per Batch in Test: 0.30 MB
Total Memory Usage per Epoch Test: 15.10 MB (excluding model parameters)
Model: 
Model Memory Usage: 1.19 MB
