### HAM 10000

* [Dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T)

* [Original Paper](https://www.nature.com/articles/sdata2018161)

In [1]:
# Move the kernel's working directory one level up so that the relative paths and
# the `src.*` imports used below resolve (presumably this is the project root - confirm).
# NOTE(review): not idempotent - re-running this cell climbs one more directory each time.
%cd ..

/home/datascience/Data Fusion


### Setup Environment:

In [2]:
import os
import pandas as pd

from src.classifiers import preprocess_data, process_labels,split_data

from src.classifiers import VQADataset
from torch.utils.data import DataLoader

from src.classifiers_cpu_metrics import calculate_memory

In [3]:
# Directory (relative to the current working directory) that is listed below to
# find the text and image embedding CSV files.
PATH = 'Embeddings/ham10000/'
# Name of the embeddings column (matches the `embeddings` column of the text table).
# NOTE(review): not referenced in the cells visible here - confirm it is still needed.
COLUMN = 'embeddings'

In [4]:
# Locate the text and image embedding files inside PATH.
#
# os.listdir() returns entries in arbitrary order, and any extra entry (for
# example a `.ipynb_checkpoints` folder) would break a plain two-way unpacking.
# Each file is therefore identified by its CSV header instead of its position:
#   * the image embeddings table has an `ImageName` column,
#   * the text embeddings table has an `image_id` column.
text_path = images_path = None
for entry in sorted(os.listdir(PATH)):
    entry_path = os.path.join(PATH, entry)
    # Skip hidden entries and sub-directories.
    if entry.startswith('.') or not os.path.isfile(entry_path):
        continue
    try:
        # nrows=0 parses only the header, so the (large) files are not loaded here.
        header = pd.read_csv(entry_path, nrows=0).columns
    except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError):
        # Not a readable CSV file: it cannot be one of the two embedding tables.
        continue
    if 'ImageName' in header:
        images_path = entry
    elif 'image_id' in header:
        text_path = entry

# Fail early with an explicit message instead of a confusing error further down.
if text_path is None or images_path is None:
    raise FileNotFoundError(
        f"Expected one text and one image embeddings CSV in {PATH!r}; "
        f"found text={text_path!r}, images={images_path!r}"
    )

## Get data

### Text

In [5]:
# Read the text table (image id, label, text, split and embeddings columns)
# into a DataFrame and display it.
text_csv = os.path.join(PATH, text_path)
text = pd.read_csv(text_csv)
text

Unnamed: 0,image_id,dx,text,split,embeddings
0,ISIC_0033319,nv,Patient diagnosed via histo. Age: 35 years. Se...,train,"[3.236721992492676, -0.7394944429397583, 2.767..."
1,ISIC_0030823,nv,Patient diagnosed via follow_up. Age: 40 years...,train,"[3.1978533267974854, -1.3737833499908447, 3.03..."
2,ISIC_0028730,akiec,Patient diagnosed via histo. Age: 65 years. Se...,train,"[3.1016488075256348, -0.910774290561676, 2.712..."
3,ISIC_0027299,nv,Patient diagnosed via follow_up. Age: 40 years...,train,"[3.1515274047851562, -1.0320912599563599, 2.47..."
4,ISIC_0032444,nv,Patient diagnosed via histo. Age: 65 years. Se...,train,"[3.1069583892822266, -1.0400464534759521, 2.30..."
...,...,...,...,...,...
10010,ISIC_0034116,nv,Patient diagnosed via histo. Age: 35 years. Se...,test,"[3.2164361476898193, -0.7670844793319702, 3.04..."
10011,ISIC_0026453,bcc,Patient diagnosed via histo. Age: 55 years. Se...,test,"[3.0540246963500977, -0.9477623105049133, 2.34..."
10012,ISIC_0029885,mel,Patient diagnosed via histo. Age: 35 years. Se...,test,"[3.058117151260376, -0.9321731925010681, 2.385..."
10013,ISIC_0033226,mel,Patient diagnosed via histo. Age: 65 years. Se...,test,"[2.567990779876709, -0.786322295665741, 2.1221..."


### Images

In [6]:
# Read the image embeddings table (one `ImageName` column followed by the
# numbered embedding columns) and preview its first rows.
images_csv = os.path.join(PATH, images_path)
images = pd.read_csv(images_csv)
images.head()

Unnamed: 0,ImageName,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,ISIC_0034270.jpg,0.788414,-0.175732,0.57187,-3.324638,1.868513,-3.654095,1.18118,-0.911827,-1.607665,...,0.156191,-1.275873,-0.159357,0.137761,1.449755,1.265087,-0.351541,-2.545814,-0.105099,-2.420387
1,ISIC_0028607.jpg,0.779679,-1.515092,1.360622,-0.367401,-1.459929,-5.119742,-0.69475,-0.838915,-1.972777,...,-0.69541,-1.646969,-0.97541,-1.31818,1.0852,-2.449388,1.271104,-2.958441,-2.017804,-2.190779
2,ISIC_0033962.jpg,1.187722,-0.24272,2.727305,-0.350272,0.690375,-3.688381,0.674369,-1.261206,-2.15309,...,-0.567914,-1.451659,-1.796797,0.030802,1.300691,0.447621,0.420344,-1.75653,-0.35666,-3.559395
3,ISIC_0034205.jpg,1.792122,-1.577801,1.715291,-2.408758,0.636901,-4.870243,2.044212,-1.625948,-1.264762,...,0.660257,-2.85165,-1.287652,-0.034494,0.609628,2.412038,-1.139721,-4.104665,0.642429,-1.40891
4,ISIC_0033155.jpg,0.127771,0.93399,-0.058526,-3.397099,0.952787,-3.337882,1.368866,-1.483904,-1.112241,...,-0.126982,0.255524,0.010045,1.304554,2.6627,-0.100018,-0.582395,-1.632171,1.970989,-0.681773


### Merge and preprocess the datasets

In [7]:
# Combine the text and image tables, matching `image_id` (text) with
# `ImageName` (images), then discard the raw `text` column.
# NOTE(review): presumably preprocess_data also expands the embeddings into the
# `text_*` / `image_*` columns shown in the preview - confirm in src.classifiers.
df = preprocess_data(text, images, "image_id", "ImageName").drop(columns='text')
df.head()

Unnamed: 0,dx,split,text_1,text_2,text_3,text_4,text_5,text_6,text_7,text_8,...,image_758,image_759,image_760,image_761,image_762,image_763,image_764,image_765,image_766,image_767
0,nv,train,3.236722,-0.739494,2.767815,2.977734,0.151343,1.922289,-0.614234,-0.036614,...,0.15161,-0.41672,-0.741418,-0.19596,1.777167,1.339372,-1.264874,-2.068761,0.750921,-2.04784
1,nv,train,3.197853,-1.373783,3.037363,2.311507,1.628178,0.383299,-1.635776,1.009209,...,0.043114,-1.430739,-0.509843,0.542778,0.117624,1.338904,-0.256517,-1.047065,0.875867,-1.740106
2,akiec,train,3.101649,-0.910774,2.712828,3.315805,0.162183,1.690392,-0.627258,0.270939,...,0.967768,-3.906605,-0.074423,-0.327721,2.269238,0.344848,-1.42514,-2.678744,-1.339433,-3.752901
3,nv,train,3.151527,-1.032091,2.471391,3.638098,0.970964,1.238591,-0.291014,0.352654,...,-0.069756,-1.501771,-0.059097,1.069056,0.486885,1.278436,0.518448,-3.212569,-2.012066,-0.775845
4,nv,train,3.106958,-1.040046,2.301498,2.111029,1.11405,1.175306,-0.688185,0.991874,...,-0.300747,-2.016899,-1.806851,-1.276631,0.606535,-0.106882,-0.666254,-1.708952,-0.982433,-3.697665


## Data Preparation

In [8]:
# Split the merged frame into train and test partitions.
train_df, test_df = split_data(df)

# Select the feature and label columns.
# Match on the exact `text_` / `image_` prefixes rather than on a substring, so
# that a non-feature column that merely contains one of these words (such as
# `text` or `image_id`) can never be picked up as an embedding feature.
text_columns = [column for column in df.columns if column.startswith('text_')]
image_columns = [column for column in df.columns if column.startswith('image_')]
label_columns = 'dx'


# Process and one-hot encode labels for the training set, then process the test
# labels with the columns obtained from the training set.
# NOTE(review): passing `train_columns` presumably keeps the test encoding
# aligned with the training one - confirm in src.classifiers.process_labels.
train_labels, mlb, train_columns = process_labels(train_df, col=label_columns)
test_labels = process_labels(test_df, col=label_columns, train_columns=train_columns)


# Wrap each partition in a dataset built from the text features, image features
# and labels, reusing the label encoder (`mlb`) and columns from the training set.
train_dataset = VQADataset(train_df, text_columns, image_columns, label_columns, mlb, train_columns)
test_dataset = VQADataset(test_df, text_columns, image_columns, label_columns, mlb, train_columns)


# Batch the datasets; only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

Train Shape: (8012, 4866)
Test Shape: (2003, 4866)


### Models

In [9]:
# Input width of each modality = number of embedding columns selected for it.
text_input_size, image_input_size = len(text_columns), len(image_columns)

# The 'DR_2' label gets a single output; any other label column gets one output
# per distinct label value found in the training set.
output_size = 1 if label_columns == 'DR_2' else len(pd.unique(train_df[label_columns]))
multilabel = False

In [10]:
# Print the per-batch / per-epoch memory usage of both loaders and the model
# memory usage for the early-fusion and late-fusion setups.
calculate_memory(
    train_loader,
    test_loader,
    text_input_size,
    image_input_size,
    output_size,
)

Early fusion:
Average Memory per Batch in Train: 1.18 MB
Total Memory Usage per Epoch Train: 148.87 MB (excluding model parameters)
Test:
Average Memory per Batch in Test: 0.30 MB
Total Memory Usage per Epoch Test: 9.45 MB (excluding model parameters)
Model: 
Model Memory Usage: 2.38 MB

Late fusion:
Average Memory per Batch in Train: 1.18 MB
Total Memory Usage per Epoch Train: 148.87 MB (excluding model parameters)
Test:
Average Memory per Batch in Test: 0.30 MB
Total Memory Usage per Epoch Test: 9.45 MB (excluding model parameters)
Model: 
Model Memory Usage: 1.19 MB
