# CrossValidation

## Packages and helper functions

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pretrainedmodels
import pandas as pd
import os

from tools import *

def write_log(logfile, train_loss, test_loss, test_score, lr):
    """Append one tab-separated record to ``logfile``.

    The record holds, in order: train loss, test loss, test score and
    learning rate, followed by a newline. The file is opened in append
    mode, so it is created if it does not exist yet.
    """
    record = f"{train_loss}\t{test_loss}\t{test_score}\t{lr}\n"
    with open(logfile, "a+") as handle:
        handle.write(record)


### Set the device

In [2]:

# Select the compute device: prefer Apple MPS, then CUDA, otherwise fall back to the CPU
if torch.backends.mps.is_available():
    device_kind, device_note = 'mps', 'MPS is available'
elif torch.cuda.is_available():
    device_kind, device_note = 'cuda', 'CUDA is available'
else:
    device_kind, device_note = 'cpu', 'No acceleration available'

device = torch.device(device_kind)
print(device_note)


MPS is available


## Importing label dataset

To run the full cross-validation, remove `.head(30000)` from the cell below. Doing so increases the running time of this notebook roughly 20-fold.

In [3]:

# Dataset locations (root folder, then the train / test image folders)
source_dir = 'histopathologic-cancer-detection/'
train_im_source_dir = f'{source_dir}/train'
test_im_source_dir = f'{source_dir}/test'

# Read the label table, shuffle it with a fixed seed and keep the first 30000 rows
labels_csv = os.path.join(source_dir, 'train_labels.csv')
train_data = (
    pd.read_csv(labels_csv)
    .sample(frac=1, random_state=10)
    .head(30000)
)
train_data.shape



(30000, 2)

### Creating folds

In [4]:

def get_samples(dataset, n, etichetta):
    """Take the first ``n`` rows of ``dataset`` whose ``label`` equals ``etichetta``.

    Returns a pair ``(selected, remainder)``: ``selected`` holds the matching
    rows (in their original order, at most ``n`` of them) and ``remainder`` is
    ``dataset`` with those rows dropped. ``dataset`` itself is not modified.
    """
    is_match = dataset['label'] == etichetta
    picked = dataset.index[is_match][:n].tolist()
    selected = dataset.loc[picked]
    remainder = dataset.drop(index=picked)
    return selected, remainder


In [5]:

n_groups = 20 # number of cross-validation folds


In [6]:

# Fraction of each fold that is held out for validation.
# NOTE: this used to be declared as 0.2 but was never used -- the split below was
# hard-coded to 0.9 / 0.1. The value is now 0.1 and actually drives the split, so
# the declared proportion matches what is applied and the resulting folds are unchanged.
validataion_proportion = 0.1

# The rest of each fold is used for training
train_proportion = 1-validataion_proportion

# Number of samples of each class in the label table
zeros = len(train_data[train_data.label == 0])
ones = len(train_data[train_data.label == 1])

# Per-fold, per-class number of TRAINING samples
train_zeros = int(zeros/n_groups*train_proportion)
train_ones = int(ones/n_groups*train_proportion)

# Per-fold, per-class number of VALIDATION samples
val_zeros = int(zeros/n_groups*validataion_proportion)
val_ones = int(ones/n_groups*validataion_proportion)

# Shuffled working copy; rows are removed from it as they are assigned to a fold
df = train_data.sample(frac=1, random_state=10)

# Seed used to shuffle the rows inside each fold (the same value for every fold)
fold_shuffle_seed = 10+n_groups

# One entry per fold: arrays of image ids and of labels, for training and validation
folds_id_train = []
folds_label_train = []
folds_id_val = []
folds_label_val = []

for i in range(n_groups):

    # Pull the training rows of each class out of df; df keeps only what is left
    fold_train_zeros, df = get_samples(df, train_zeros, 0)
    fold_train_ones, df = get_samples(df, train_ones, 1)

    # Merge the two classes and shuffle so that labels are interleaved
    fold_train = pd.concat([fold_train_zeros, fold_train_ones], ignore_index=True).sample(frac=1, random_state=fold_shuffle_seed)

    folds_id_train.append(fold_train['id'].values)
    folds_label_train.append(fold_train['label'].values)

    # Same procedure for the validation rows, taken from what remains in df
    fold_val_zeros, df = get_samples(df, val_zeros, 0)
    fold_val_ones, df = get_samples(df, val_ones, 1)

    fold_val = pd.concat([fold_val_zeros, fold_val_ones], ignore_index=True).sample(frac=1, random_state=fold_shuffle_seed)

    folds_id_val.append(fold_val['id'].values)
    folds_label_val.append(fold_val['label'].values)








## Training with cross validation

In [7]:
# Directory that receives the per-fold log files and model weights
model_source_dir = 'cross_validation_logs/'
model_name = 'cv'

# Create the directory up front: the log writer and torch.save both open files
# inside it and fail with FileNotFoundError if it does not exist yet
os.makedirs(model_source_dir, exist_ok=True)

# Batch size
b_size = 96

# Number of fine-tuning epochs per fold
n_epochs = 15

# Optional cap on the number of training samples drawn per epoch (disabled)
#samples_per_epoch = 50000 

# Validation loss recorded for each fold, keyed by fold index
fold_loss = {}


### Training - loop on cv folds

In [8]:

for valid_idx in range(n_groups):

    # Announce which fold is being processed
    print('Training fold {}'.format(valid_idx))

    # Per-fold output files: metrics log, best-validation-loss weights, early-stop weights
    logfile =  model_source_dir+'/{}.fold{}.logfile.txt'.format(model_name, valid_idx)
    best_w_path = model_source_dir+'/{}.fold{}.best.pt'.format(model_name, valid_idx)
    es_w_path =  model_source_dir+'/{}.fold{}.es.pt'.format(model_name, valid_idx)

    # Augmentation pipelines for training and validation images
    training_aug = aug_train()
    validation_aug = aug_val()

    # Base learning rate for the fine-tuning epochs. It is the lower bound of the
    # cyclic schedule and is halved when the validation loss stops improving.
    curr_lr = 3e-3

    # Optional random sampler for the training loader (disabled)
    #train_sampler = torch.utils.data.RandomSampler(DataGenerator(folds_id_val[valid_idx],       # GENERATES DATASET FOR LOADING
    #                                                             folds_label_val[valid_idx], 
    #                                                             validation_aug, train_im_source_dir),
    #                                               replacement=True)
    #                                               #num_samples=samples_per_epoch) # remove this comment to limit the number of training samples per epoch

    # Data loaders for this fold's training and validation subsets
    train_loader = torch.utils.data.DataLoader(DataGenerator(folds_id_train[valid_idx],
                                                             folds_label_train[valid_idx],
                                                             training_aug, train_im_source_dir),
                                               pin_memory=False,
                                               num_workers=4,
                                               batch_size=b_size)
                                               #sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(DataGenerator(folds_id_val[valid_idx],
                                                           folds_label_val[valid_idx],
                                                           validation_aug, train_im_source_dir),
                                             pin_memory=False,
                                             num_workers=1,
                                             batch_size=b_size)

    # Binary cross-entropy on probabilities
    # NOTE(review): BCELoss expects inputs in [0, 1]; presumably Net ends with a sigmoid -- confirm
    loss_f = nn.BCELoss()

    # ImageNet-pretrained ResNet-34 backbone
    base_model = pretrainedmodels.resnet34(num_classes=1000,
                                           pretrained='imagenet').to(device)

    # Wrap the backbone with the project's classification head
    # NOTE(review): 512 is presumably the backbone feature size -- confirm against Net
    model = Net(base_model, 512).to(device)

    # Warm-up optimizer: the backbone layers get a learning rate of zero, so only
    # the classifier head is updated during the first pass
    optimizer = optim.SGD([{'params': model.layer0.parameters(), 'lr': 0},
                           {'params': model.layer1.parameters(), 'lr': 0},
                           {'params': model.layer2.parameters(), 'lr': 0},
                           {'params': model.layer3.parameters(), 'lr': 0},
                           {'params': model.layer4.parameters(), 'lr': 0},
                           {'params': model.classif.parameters()}], lr=0.05, momentum=0.9)

    # Warm-up pass over the training data (no scheduler)
    train_loss = train(model= model,
                       train_loader= train_loader,
                       optimizer= optimizer,
                       epoch= 0,
                       log_interval= 100,
                       loss_f= loss_f,
                       #samples_per_epoch= samples_per_epoch,
                       scheduler= None,
                       device=device)

    # Evaluate the warmed-up model on the validation subset
    test_loss, score = test(model= model,
                            test_loader= val_loader,
                            loss_f= loss_f,
                            device = device)

    # Log the warm-up metrics (no single learning rate applies here)
    write_log(logfile, train_loss, test_loss, score, lr = "not available")

    # Fine-tuning with all layers trainable:
    #  - the weights are saved every time the validation loss improves;
    #  - once the loss has failed to improve for more than 3 consecutive epochs,
    #    the best weights are reloaded and the learning rate is halved;
    #  - once it has failed to improve for more than 10 consecutive epochs,
    #    training of this fold stops.

    # State used to monitor the validation loss across epochs
    best_score = 0     # kept for compatibility; not updated by this loop
    best_loss = 1e5    # lowest validation loss seen so far
    idx_stop = 0       # consecutive epochs without improvement

    for epoch in range(n_epochs):

        print(f'we are in epoch {epoch}')

        # A fresh optimizer and triangular cyclic scheduler are built each epoch
        # so that any change to curr_lr takes effect
        optimizer = torch.optim.SGD(model.parameters(), lr=curr_lr, momentum=0.9)
        scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=curr_lr, max_lr=3*curr_lr, mode = 'triangular')

        # Train for one epoch
        train_loss = train(model= model,
                           train_loader= train_loader,
                           optimizer= optimizer,
                           epoch= epoch,
                           log_interval= 100,
                           loss_f= loss_f,
                           #samples_per_epoch= samples_per_epoch,
                           scheduler= scheduler,
                           device = device)

        # Evaluate on the validation subset
        test_loss, score = test(model= model,
                                test_loader= val_loader,
                                loss_f= loss_f,
                                device = device)

        # Log this epoch's metrics together with the base learning rate
        write_log(logfile, train_loss, test_loss, score, lr = curr_lr)

        if test_loss < best_loss:
            # New minimum: checkpoint the weights and reset the patience counter
            print('Test LOSS IMPROVED from {} to {}, saving'.format(best_loss, test_loss))
            best_loss = test_loss
            torch.save(model.state_dict(), best_w_path)
            idx_stop = 0
        else:
            # Count this epoch before reporting, so the message shows how many
            # epochs have actually passed without improvement
            idx_stop += 1
            print('Loss {}, did NOT improve from {} for {} epochs'.format(test_loss, best_loss, idx_stop))

        # Plateau: go back to the best weights and halve the learning rate
        # NOTE(review): the counter is not reset here, so this runs on every further
        # epoch without improvement -- confirm that repeated halving is intended
        if idx_stop>3:
            print('Reducing LR by two and reloading best model')
            model.load_state_dict(torch.load(best_w_path))
            curr_lr = curr_lr/2

        # Early stopping: save the current weights and leave the epoch loop.
        # Without the break the message was printed but training carried on.
        if idx_stop>10:
            print('Stopping the model')
            torch.save(model.state_dict(), es_w_path)
            break

    # Record the validation loss of the last epoch that was run for this fold
    # NOTE(review): this is the last loss, not best_loss -- confirm which one is wanted
    fold_loss[valid_idx] = test_loss


Training fold 0




Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.6708741188049316
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.644994, roc auc: 0.7315

we are in epoch 0
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.6235868334770203
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.510615, roc auc: 0.8854

Test LOSS IMPROVED from 100000.0 to 0.5106145441532135, saving
we are in epoch 1
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 1 : 0.5237208604812622
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.424325, roc auc: 0.9099

Test LOSS IMPROVED from 0.5106145441532135 to 0.42432481050491333, saving
we are in epoch 2
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 2 : 0.46702301502227783
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.335865, roc auc: 0.9200

Test LOSS IMPROVED from 0.42432481050491333 to 0.3358645588159561, saving
we are in epoch 3
Sett



Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.6771374940872192
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.626218, roc auc: 0.7903

we are in epoch 0
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.6029232740402222
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.509080, roc auc: 0.8734

Test LOSS IMPROVED from 100000.0 to 0.5090796500444412, saving
we are in epoch 1
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 1 : 0.5206272602081299
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.415037, roc auc: 0.9172

Test LOSS IMPROVED from 0.5090796500444412 to 0.4150373041629791, saving
we are in epoch 2
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 2 : 0.42236918210983276
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.368552, roc auc: 0.9272

Test LOSS IMPROVED from 0.4150373041629791 to 0.36855174601078033, saving
we are in epoch 3
Setti

In [None]:
fold_loss