In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("histopathologic-cancer-detection/train_labels.csv")
df.label.value_counts()

label
0    130908
1     89117
Name: count, dtype: int64

In [3]:
130908/89117

1.468945319074924

In [4]:

def split_dataframe(df:pd.DataFrame, num_of_splits:int) -> list[pd.DataFrame]:

    # Shuffle the rows
    df_shuffled = df.sample(frac=1, random_state=10) 

    rows_per_split = len(df_shuffled) // num_of_splits

    dfs = [df_shuffled.iloc[i * rows_per_split:(i + 1) * rows_per_split] for i in range(num_of_splits)]

    # Print to check line losses
    print(f'Original dataset has {len(df)} rows')
    print(f'Splitted has in total {sum([len(part) for part in dfs])} rows')

    return dfs

dfs = split_dataframe(df = df, num_of_splits = 5)

Original dataset has 220025 rows
Splitted has in total 220025 rows


In [5]:
import torch
import torch.nn as nn
#import torch.nn.functional as F
import torch.optim as optim
#from torchvision import datasets, transforms
#from torch.autograd import Variable
#from torch.optim import Optimizer
#from torch.utils import data
import pretrainedmodels
import pandas as pd
#import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import os
#import cv2
from skimage.io import imread
#from torch.utils.data.sampler import WeightedRandomSampler, BatchSampler
#from tqdm import tqdm
#from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold
#import pretrainedmodels.utils as utils
#from sklearn.metrics import roc_auc_score
from tools import *

def write_log(logfile, train_loss, test_loss, test_score, lr):
    with open(logfile, "a+") as log:
        log.write("{}\t{}\t{}\t{}\n".format(train_loss, test_loss, test_score, lr))

        


In [6]:

# PER IMPOSTARE IL DEVICE
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print('MPS is available')
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print('CUDA is available')
else:
    device = torch.device('cpu')
    print('No acceleration available')


MPS is available


In [7]:

# FOLDERS PATH
DIR = 'histopathologic-cancer-detection/'
train_im_dir = DIR+'/train'
test_im_dir = DIR+'/test'

# IMPORTING DATA
train_data = pd.read_csv(os.path.join(DIR,'train_labels.csv')) #labels for train data


train_data.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [8]:
# MODELS
# model_dir = os.path.join(DIR, 'resnet34')
# model_name = 'resnet34'
model_dir = 'this_model/'#os.path.join(DIR, 'this_model')
model_name = 'this_model'

In [9]:

n_groups = 15 # number of folds # CROSS-VALIDATION PER VALIDAZIONE
b_size = 96 # batch size


In [10]:

def get_samples(dataset, n, etichetta):
    label_matching_indexes = dataset.index[dataset['label'] == etichetta].tolist()
    label_matching_indexes = label_matching_indexes[:n]
    return dataset.loc[label_matching_indexes], dataset.drop(index=label_matching_indexes)


In [11]:

validataion_proportion = 0.1
train_proportion = 1-validataion_proportion

# Get number of elements for each class
zeros = len(train_data[train_data.label == 0])
ones = len(train_data[train_data.label == 1])

# Get the proportion of elements for training, splitted for each class
train_zeros = int(zeros/n_groups*0.9)
train_ones = int(ones/n_groups*0.9)

# Get the proportion of elements for training, splitted for each class
val_zeros = int(zeros/n_groups*0.1)
val_ones = int(ones/n_groups*0.1)

# Shuffle the training data, get df dataset
df = train_data.sample(frac=1, random_state=10)

# Initializate empty lists
folds_id_train = []
folds_label_train = []
folds_id_val = []
folds_label_val = []

# Loop on elements for validation
for i in range(n_groups):

    #print('---')
    #print(len(df))
    
    # Get first elements in df, return also df without elements
    fold_train_zeros, df = get_samples(df, train_zeros, 0)
    #print(len(df))
    fold_train_ones, df = get_samples(df, train_ones, 1)
    #print(len(df))

    # Merge zero and ones together
    fold_train = pd.concat([fold_train_zeros, fold_train_ones], ignore_index=True).sample(frac=1, random_state=10+n_groups)

    
    folds_id_train.append(fold_train['id'].values)
    folds_label_train .append(fold_train['label'].values)


    fold_val_zeros, df = get_samples(df, val_zeros, 0)
    #print(len(df))
    fold_val_ones, df = get_samples(df, val_ones, 1)
    #print(len(df))

    fold_val = pd.concat([fold_val_zeros, fold_val_ones], ignore_index=True).sample(frac=1, random_state=10+n_groups)

    folds_id_val.append(fold_val['id'].values)    
    folds_label_val.append(fold_val['label'].values)

    #print(fold_train.head())
    #print(fold_val.head())






In [12]:

# INIZIO DEL TRAINING
samples_per_epoch = 50000 #define number of samples per epoch, since dataset is big
# CICLO SUI GRUPPI DELLA C.V.
for valid_idx in range(n_groups):

    logfile =  model_dir+'/{}.fold{}.logfile.txt'.format(model_name, valid_idx)
    best_w_path = model_dir+'/{}.fold{}.best.pt'.format(model_name, valid_idx)
    es_w_path =  model_dir+'/{}.fold{}.es.pt'.format(model_name, valid_idx)
    
    
    print('Training fold {}'.format(valid_idx))
    
    #with open(logfile, "w") as log:
    #    pass    
    
    training_aug = aug_train() # FUNZIONE IN UTILS
    validation_aug = aug_val() # FUNZIONE IN UTILS
    
    curr_lr = 3e-3#3e-4
    
    train_sampler = torch.utils.data.RandomSampler(DataGenerator(folds_id_val[valid_idx],       # GENERATES DATASET FOR LOADING
                                                                 folds_label_val[valid_idx], 
                                                                 validation_aug, train_im_dir),
                                                   replacement=True, 
                                                   num_samples=samples_per_epoch)
    
    train_loader = torch.utils.data.DataLoader(DataGenerator(folds_id_train[valid_idx], 
                                                             folds_label_train[valid_idx], 
                                                             training_aug, train_im_dir),
                                               pin_memory=False,
                                               num_workers=4,
                                               batch_size=b_size, 
                                               sampler=train_sampler)
    
    val_loader = torch.utils.data.DataLoader(DataGenerator(folds_id_val[valid_idx], 
                                                       folds_label_val[valid_idx], 
                                                       validation_aug, train_im_dir),
                                             pin_memory=False,
                                             num_workers=1,
                                             batch_size=b_size)
    
    # DEFINIAMO LA FUNZIONE DI LOSS - 
    loss_f = nn.BCELoss() # BINARY CROSS ENTROPY

    best_score = 0
    best_loss = 1e5
    idx_stop = 0

    ##############################################
    # OK QUI BISOGNA LAVORARCI

    # LOAD RESNET34 - PRETRAINED  # questo andrebbe sostituito cazzo!
    base_model = pretrainedmodels.resnet34(num_classes=1000, 
                                           pretrained='imagenet').to(device) 
    
    #model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)  # Esempio di ResNet-18 preaddestrata

    
    # DEFINISCO IL MODEL! # OK QUI 
    model = Net(base_model, 512).to(device)

    ##############################################
    
    # "TRAINING WITH FROZEN LAYERS EXCEPT FOR CLASSIFICATION HEAD"
    # OK, ALCUNI LAYERS VENGONO BLOCCATI CON UN LEARNING RATE PARI A ZERO, PER CUI NON VENGONO RIADDESTRATI!
    optimizer = optim.SGD([{'params': model.layer0.parameters(), 'lr': 0},
                           {'params': model.layer1.parameters(), 'lr': 0},
                           {'params': model.layer2.parameters(), 'lr': 0},
                           {'params': model.layer3.parameters(), 'lr': 0},
                           {'params': model.layer4.parameters(), 'lr': 0},
                           {'params': model.classif.parameters()}], lr=0.05, momentum=0.9)
    

    
    # QUI INIZIA GIà A TRAINARE, QUESTO è IL PRIMO GIRO, CHISSà PERCHè LO FA ESTERNO
    # LAUNCH TRAIN, TEST AND WRITE LOG
    train_loss = train(model= model,
                           train_loader= train_loader, 
                           optimizer= optimizer, 
                           epoch= 0, 
                           log_interval= 100, 
                           loss_f= loss_f, 
                           samples_per_epoch= samples_per_epoch,
                           scheduler= None)
    
    test_loss, score = test(model= model, 
                                test_loader= val_loader, 
                                loss_f= loss_f)
    
    write_log(logfile, train_loss, test_loss, score, lr = "not available")
    
    '''
    start training the model with all layers
    Training scheme : train while validation loss decreases, save model at each improvement of test loss. 
    if loss does not decreases for 3 epochs, reload last best model, reduce lr by factor of 2. 
    If loss still doesn't decrease for 10 epochs, stop the model. 
    '''
    for epoch in range(15):
        print(f'we are in epoch {epoch}')
        optimizer = torch.optim.SGD(model.parameters(), lr=curr_lr, momentum=0.9)
        scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=curr_lr, max_lr=3*curr_lr, mode = 'triangular')
        #scheduler = CyclicLR(optimizer, max_lr=3*curr_lr)

        # LAUNCH TRAIN, TEST AND WRITE LOG
        train_loss = train(model= model,
                           train_loader= train_loader, 
                           optimizer= optimizer, 
                           epoch= epoch, 
                           log_interval= 100, 
                           loss_f= loss_f, 
                           samples_per_epoch= samples_per_epoch,
                           scheduler= scheduler)
        
        test_loss, score = test(model= model, 
                                test_loader= val_loader, 
                                loss_f= loss_f)
        
        write_log(logfile, train_loss, test_loss, score, lr = curr_lr)
        
        if test_loss<best_loss:
            print('Test loss improved from {} to {}, saving'.format(best_loss, test_loss))
            best_loss = test_loss
            torch.save(model.state_dict(), best_w_path)
            idx_stop = 0
        else:
            print('Loss {}, did not improve from {} for {} epochs'.format(test_loss, best_loss, idx_stop))
            idx_stop += 1
        if idx_stop>3:
            print('Reducing LR by two and reloading best model')
            model.load_state_dict(torch.load(best_w_path))
            curr_lr = curr_lr/2
        if idx_stop>10:
            print('Stopping the model')
            torch.save(model.state_dict(), es_w_path)


Training fold 0




Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.5534087651471297
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.474496, roc auc: 0.8538

we are in epoch 0
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.24058662698293728
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.377712, roc auc: 0.9604

Test loss improved from 100000.0 to 0.3777122087776661, saving
we are in epoch 1
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 1 : 0.07357502089347691
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.413329, roc auc: 0.9610

Loss 0.41332932841032743, did not improve from 0.3777122087776661 for 0 epochs
we are in epoch 2
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 2 : 0.04155347629857715
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.506190, roc auc: 0.9628

Loss 0.5061897467821836, did not improve from 0.3777122087776661 for 1 epochs
we are in e



Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.5364695736765861
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.467883, roc auc: 0.8573

we are in epoch 0
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.23220080010592933
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.277977, roc auc: 0.9666

Test loss improved from 100000.0 to 0.2779768304899335, saving
we are in epoch 1
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 1 : 0.07765357266568267
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.358727, roc auc: 0.9601

Loss 0.3587268954142928, did not improve from 0.2779768304899335 for 0 epochs
we are in epoch 2
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 2 : 0.047812771912431336
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.362387, roc auc: 0.9668

Loss 0.3623871607705951, did not improve from 0.2779768304899335 for 1 epochs
we are in e



Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.5311208716531596
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.416078, roc auc: 0.8917

we are in epoch 0
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 0 : 0.2442487515571217
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.318981, roc auc: 0.9610

Test loss improved from 100000.0 to 0.31898069474846125, saving
we are in epoch 1
Setting DEVICE:
	 MPS is available
Mean train loss on epoch 1 : 0.08099237957969307
Setting DEVICE:
	 MPS is available
OK!
OK!

Test set: Average loss: 0.278727, roc auc: 0.9784

Test loss improved from 0.31898069474846125 to 0.2787269074469805, saving
we are in epoch 2
Setting DEVICE:
	 MPS is available


In [None]:
device = torch.device("cuda" if use_cuda else "cpu")

DIR = './'
train_im_dir = DIR+'/train'
test_im_dir = DIR+'/test'

model_dir = os.path.join(DIR,'resnet34')
model_name = 'resnet34'

train_data = pd.read_csv(os.path.join(DIR,'train_labels.csv'))
test_data = pd.read_csv(os.path.join(DIR,'sample_submission.csv'))
patch_ids = pd.read_csv(os.path.join(DIR,'patch_id_wsi.csv'))
train_data = pd.merge(train_data, patch_ids, on='id')

n_groups = 15

# CROSS VALIDATION OBJECT
skf = GroupKFold(n_splits=n_groups)

# CONTENITORI DEI FOLD DELLA CROSS-VALIDATION
folds_id_train = []
folds_label_train = []
folds_id_val = []
folds_label_val = []

# POPOLANDO I CONTENITORI
for train_index, test_index in skf.split(train_data['id'].values, train_data['label'].values, train_data['wsi'].values):
    folds_id_train.append(train_data['id'].values[train_index])
    folds_id_val.append(train_data['id'].values[test_index])
    folds_label_train.append(train_data['label'].values[train_index])
    folds_label_val.append(train_data['label'].values[test_index])

# .....
test_id = test_data['id'].values
test_label = test_data['label'].values

# CONTENGONO INFO SULLA CROSS-VALIDATION
val_preds = []
val_labels = []
test_preds = []
scores_CV = []

# ESEGUE LA CROSS-VALIDATION
for valid_idx in range(n_groups):
    # IMPORT IL MODELLO BASE
    base_model = pretrainedmodels.resnet34(num_classes=1000,pretrained='imagenet').to(device) #load pretrained as base
    
    # CREO IL MODELLO
    model = Net(base_model, 512).to(device) # create model

    # CARICO I PARAMETRI DEL MODELLO TRAINATO  
    model.load_state_dict(torch.load(model_dir+'/resnet34.fold{}.best.pt'.format(valid_idx))) #loading weights
    
    # MODALITA' INFERENZA
    model.eval()
    
    
    valid_preds_idx = np.zeros((len(folds_id_val[valid_idx])))
    valid_target_idx = np.zeros((len(folds_id_val[valid_idx])))
    test_preds_idx = np.zeros((len(test_label)))
    
    val_loader = torch.utils.data.DataLoader(DataGenerator(folds_id_val[valid_idx], folds_label_val[valid_idx], validation_aug, train_im_dir), 
                                             shuffle=False, pin_memory=False, num_workers=1,batch_size=1)
    test_loader = torch.utils.data.DataLoader(DataGenerator(test_id, test_label, validation_aug, test_im_dir), 
                                              shuffle=False, pin_memory=False, num_workers=1,batch_size=1)  
    
    #predction for validation data
    with torch.no_grad():
        for batch_idx, (x, target) in enumerate(tqdm_notebook(val_loader)):
            #output = protein_model(x.to(device, dtype=torch.float))
            image = np.rollaxis(x.numpy()[0], 0, 3)
            images = make_tta_heavy(image,n_images=8) #create 8 images for random augmentations to take mean prediciton of each
            output = model(torch.from_numpy(images).to(device, dtype=torch.float))
            output = output.mean()
            valid_preds_idx[batch_idx] = output
            valid_target_idx[batch_idx] = target
    
    val_preds.append(valid_preds_idx)
    val_labels.append(valid_target_idx)
    
    score_CV_idx = roc_auc_score(valid_target_idx, valid_preds_idx)
    scores_CV.append(score_CV_idx)
    
    print('fold {}, score {}'.format(valid_idx, score_CV_idx))
    
    #predction for test data
    with torch.no_grad():
        for batch_idx, (x, target) in enumerate(tqdm_notebook(test_loader)):
            image = np.rollaxis(x.numpy()[0], 0, 3)
            images = make_tta_heavy(image,n_images=8) # TTA
            output = model(torch.from_numpy(images).to(device, dtype=torch.float))
            output = output.mean()            
            #output = protein_model(x.to(device, dtype=torch.float))
            test_preds_idx[batch_idx] = output
    test_preds.append(test_preds_idx)    
    
    
    
val_preds_combined = np.hstack(val_preds)
val_labels_combined = np.hstack(val_labels)
#average test predctions over each fold
test_preds_combined = np.vstack(test_preds)
test_preds_combined = np.mean(test_preds_combined, axis=0)
cv_rocauc = roc_auc_score(val_labels_combined, val_preds_combined)
print('Total roc auc'.format(cv_rocauc))
d = dict({'oof_preds':val_preds_combined,'oof_labels':val_labels_combined,'test_preds':test_preds_combined})
with open(os.path.join(model_dir, "resnet34.tta.Preds.pickle"), "wb") as output_file:
    pickle.dump(d, output_file)
#create sample submission
test_data['label'] = d['test_preds']
test_data.to_csv(os.path.join(model_dir, 'resnet34.prediction.tta.csv') ,sep=',',index=False)
