In [1]:
from exp.utils import *
from exp.models import *
from exp.losses import *
from tqdm.notebook import tqdm
from multiprocessing import Pool

import torch
import torch.nn as NN
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor

In [2]:
seed = 92
seed_everything(seed)

In [3]:
label = "Cardiomegaly"
model_name = f"balanced_dl_v1_{label}"
model_type = "densenet"
bs = 16
lr = 1e-3
epochs = 50
image_size = (224, 224)
device = get_device()
labels = get_labels()

Using the GPU!


In [4]:
train_df, valid_df, test_df = get_dataframes(include_labels=labels, small=True, small_fraction=0.05)
train_df.shape, valid_df.shape, test_df.shape



((3460, 24), (865, 24), (1279, 24))

In [5]:
train_df = get_binary_df(label, train_df)
valid_df = get_binary_df(label, valid_df)
test_df = get_binary_df(label, test_df)

In [6]:
train_label = train_df[[label]].values
neg_weights, pos_weights = compute_class_freqs(train_label)
neg_weights, pos_weights = torch.Tensor(neg_weights), torch.Tensor(pos_weights)
neg_weights, pos_weights

(tensor([0.0179]), tensor([0.9821]))

In [7]:
train_tfs, test_tfs = get_transforms(image_size=image_size)

In [8]:
train_ds = CRX8_Data(train_df, get_image_path(), label, image_size=image_size, transforms=train_tfs)
valid_ds = CRX8_Data(valid_df, get_image_path(), label, image_size=image_size, transforms=test_tfs)
test_ds  = CRX8_Data(test_df , get_image_path(), label, image_size=image_size, transforms=test_tfs)

In [9]:
def split_pos_neg(df, label):
    pos_df = df[df[label] >= 0.5]
    neg_df = df[df[label] < 0.5]
    return pos_df, neg_df

In [10]:
pos_train, neg_train = split_pos_neg(train_df, label)

In [11]:
pos_train_ds = CRX8_Data(pos_train, get_image_path(), label, image_size=image_size, transforms=train_tfs)
neg_train_ds = CRX8_Data(neg_train, get_image_path(), label, image_size=image_size, transforms=train_tfs)

valid_ds = CRX8_Data(valid_df, get_image_path(), label, image_size=image_size, transforms=test_tfs)
test_ds  = CRX8_Data(test_df , get_image_path(), label, image_size=image_size, transforms=test_tfs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df["Index_2"] = list(range(self.df.shape[0]))


In [12]:
class BalancedDataLoader:
    def __init__(self, pos_dataset, neg_dataset, image_size, batch_size=16, shuffle=False): 
        self.pos_dataset, self.neg_dataset = pos_dataset, neg_dataset
        self.batch_size, self.shuffle = batch_size, shuffle
        self.image_size = image_size
        self.pos_prob, self.neg_prob = self.calculate_frequencies()
        self.pos_dl = DataLoader(pos_dataset, batch_size=1, shuffle=shuffle)
        self.neg_dl = DataLoader(neg_dataset, batch_size=1, shuffle=shuffle)
        
        self.n_batches = math.ceil((len(neg_dataset) + len(pos_dataset)) / batch_size)
        self.batch_counter = 0

    def calculate_frequencies(self):
        n_pos = len(self.pos_dataset)
        n_neg = len(self.neg_dataset)
        return n_neg / (n_pos + n_neg), n_pos / (n_pos + n_neg)
    
    def sampler(self):
        return np.random.rand(self.batch_size) >= 0.5
    
    def get_data(self, v):
        if v > 0.5:
            rand_idx = np.random.choice(list(range(len(self.pos_dataset))))
            return self.pos_dataset.__getitem__(rand_idx)[0]
        else:
            rand_idx = np.random.choice(list(range(len(self.neg_dataset))))
            return self.neg_dataset.__getitem__(rand_idx)[0]
    
    def __next__(self):
        if self.batch_counter >= self.n_batches: raise StopIteration
        sample = self.sampler()
        X = torch.zeros((self.batch_size, 3, *self.image_size))
        y = torch.zeros((len(sample), 1))
        for idx, s in enumerate(sample): X[idx] = self.get_data(s)
        for idx, s in enumerate(sample): y[idx] = torch.Tensor([s])
        #y = torch.Tensor(sample).float()
        self.batch_counter += 1
        return X.float(), y.float()
    
    def __iter__(self): return self
    def __len__(self): return self.n_batches

In [13]:
train_dl = BalancedDataLoader(pos_train_ds, neg_train_ds, image_size)

#train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=bs, shuffle=False)
test_dl  = DataLoader(test_ds,  batch_size=bs, shuffle=False)

dataloaders = {
    "train": train_dl,
    "val": valid_dl,
    "test": test_dl
}

In [14]:
criterion = get_weighted_loss_with_logits(pos_weights.to(device), neg_weights.to(device))

In [15]:
model = pretrained_densenet121()
model = model.to(device)

In [16]:
model, fine_optimizer = fine_tune_setup(model, lr)

In [17]:
scheduler = EmptyScheduler()

In [18]:
model, history = fit(model, criterion, fine_optimizer, 
                     scheduler, dataloaders, model_name,
                     epochs, lr, patience=1, metric="loss")

Epoch 1:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=217.0), HTML(value='')))


Train: Loss: 0.044, Acc: 0.520, AUROC: 0.782


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=55.0), HTML(value='')))


Val: Loss: 0.054, Acc: 0.806, AUROC: 0.623
Saved model with loss 0.0539
Epoch 2:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=217.0), HTML(value='')))




ValueError: y_true takes value in {} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.

In [None]:
for X, y in train_dl:
    print(y.shape);break

In [None]:
i=3
print_image(X[i], y[i])

In [None]:
print_batch(X, y, label)

In [None]:
import numpy as np

In [None]:
np.random.rand(16) < 0.5

In [None]:
np.random.choice([1, 2, 3, 4, 5])