In [9]:
import pandas as pd
import numpy as np

In [7]:
# Load the training labels and show how many samples each class has.
df = pd.read_csv("histopathologic-cancer-detection/train_labels.csv")
df["label"].value_counts()

label
0    130908
1     89117
Name: count, dtype: int64

In [8]:
# Class-imbalance ratio: number of label-0 rows per label-1 row.
# Computed from the loaded labels so it cannot drift from the data
# (the counts used to be retyped by hand from the output above).
label_counts = df["label"].value_counts()
float(label_counts.loc[0] / label_counts.loc[1])

1.468945319074924

In [23]:

def split_dataframe(df:pd.DataFrame, num_of_splits:int) -> list[pd.DataFrame]:
    """Shuffle ``df`` and cut it into ``num_of_splits`` consecutive parts.

    The shuffle uses a fixed seed (``random_state=10``), so the result is
    reproducible. Every row ends up in exactly one part: when the number of
    rows is not a multiple of ``num_of_splits``, the first
    ``len(df) % num_of_splits`` parts receive one extra row each.

    Args:
        df: DataFrame to split; it is not modified.
        num_of_splits: Number of parts to produce, must be at least 1.

    Returns:
        A list with ``num_of_splits`` DataFrames, in shuffled row order.

    Raises:
        ValueError: If ``num_of_splits`` is smaller than 1.
    """
    if num_of_splits < 1:
        raise ValueError(f'num_of_splits must be >= 1, got {num_of_splits}')

    # Shuffle the rows
    df_shuffled = df.sample(frac=1, random_state=10)

    # Base size of every part plus the number of leftover rows to distribute
    rows_per_split, leftover = divmod(len(df_shuffled), num_of_splits)

    dfs = []
    start = 0
    for i in range(num_of_splits):
        # The first `leftover` parts take one extra row so nothing is dropped
        stop = start + rows_per_split + (1 if i < leftover else 0)
        dfs.append(df_shuffled.iloc[start:stop])
        start = stop

    # Print to check line losses
    print(f'Original dataset has {len(df)} rows')
    print(f'Splitted has in total {sum(len(part) for part in dfs)} rows')

    return dfs

# Cut the shuffled label table into 5 parts.
dfs = split_dataframe(df, 5)

Original dataset has 220025 rows
Splitted has in total 220025 rows


Original dataset has 220025 rows
Splits has in total 220020 rows


220020

In [24]:
import torch
import torch.nn as nn
#import torch.nn.functional as F
import torch.optim as optim
#from torchvision import datasets, transforms
#from torch.autograd import Variable
#from torch.optim import Optimizer
#from torch.utils import data
import pretrainedmodels
import pandas as pd
#import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import os
#import cv2
from skimage.io import imread
#from torch.utils.data.sampler import WeightedRandomSampler, BatchSampler
#from tqdm import tqdm
#from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold
#import pretrainedmodels.utils as utils
#from sklearn.metrics import roc_auc_score
from tools import *



In [25]:

# SELECT THE COMPUTE DEVICE (MPS first, then CUDA, otherwise CPU)
if torch.backends.mps.is_available():
    device_name, status_msg = 'mps', 'MPS is available'
elif torch.cuda.is_available():
    device_name, status_msg = 'cuda', 'CUDA is available'
else:
    device_name, status_msg = 'cpu', 'No acceleration available'

device = torch.device(device_name)
print(status_msg)


MPS is available


In [27]:

# FOLDERS PATH
DIR = 'study/Histopathologic-Cancer-Detection-master/' # './' #folder with train and test data
# Build the image folders with os.path.join (as for the CSV files below):
# DIR already ends with a separator, so DIR + '/train' produced a double slash.
train_im_dir = os.path.join(DIR, 'train')
test_im_dir = os.path.join(DIR, 'test')

# IMPORTING DATA
train_labels = pd.read_csv(os.path.join(DIR, 'train_labels.csv')) # labels for train data
patch_ids = pd.read_csv(os.path.join(DIR, 'patch_id_wsi.csv')) # slide ids, used for a correct split

# MERGING DATASETS
# Default (inner) merge on 'id': only ids present in both tables are kept.
train_data = pd.merge(train_labels, patch_ids, on='id')


train_data.head()

Unnamed: 0,id,label,wsi
0,f38a6374c348f90b587e046aac6079959adf3835,0,camelyon16_train_normal_033
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1,camelyon16_train_tumor_054
2,755db6279dae599ebb4d39a9123cce439965282d,0,camelyon16_train_tumor_008
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0,camelyon16_train_tumor_077
4,acfe80838488fae3c89bd21ade75be5c34e66be7,0,camelyon16_train_tumor_036


In [None]:


# MODEL: architecture name and the folder (under DIR) named after it
model_name = 'resnet34'
model_dir = os.path.join(DIR, model_name)



In [None]:


n_groups = 15 # number of cross-validation folds
b_size = 96 # batch size

# DEVICE
# Same priority as the device-selection cell (MPS, then CUDA, then CPU).
# The former "cuda else cpu" choice here replaced an available MPS device with the CPU.
use_cuda = torch.cuda.is_available()
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cuda" if use_cuda else "cpu")

# K-FOLD CROSS-VALIDATION, GROUPED BY THE 'wsi' COLUMN
skf = GroupKFold(n_splits=n_groups)

# CONTAINERS FOR THE PER-FOLD IDS AND LABELS
folds_id_train = []
folds_label_train = []
folds_id_val = []
folds_label_val = []

# FILL THE CONTAINERS: ONE ENTRY PER FOLD
for train_index, test_index in skf.split(train_data['id'].values, train_data['label'].values, train_data['wsi'].values):
    folds_id_train.append(train_data['id'].values[train_index])
    folds_id_val.append(train_data['id'].values[test_index])
    folds_label_train.append(train_data['label'].values[train_index])
    folds_label_val.append(train_data['label'].values[test_index])

# START OF TRAINING
samples_per_epoch = 50000 # number of samples drawn per epoch, since the dataset is big

# Log and weight files are written into model_dir, so make sure it exists
# before the first open(); otherwise the very first fold fails.
os.makedirs(model_dir, exist_ok=True)

# LOOP OVER THE CROSS-VALIDATION FOLDS
for valid_idx in range(n_groups):
    logfile =  model_dir+'/{}.fold{}.logfile.txt'.format(model_name, valid_idx)
    best_w_path = model_dir+'/{}.fold{}.best.pt'.format(model_name, valid_idx)
    es_w_path =  model_dir+'/{}.fold{}.es.pt'.format(model_name, valid_idx)
    print('Training fold {}'.format(valid_idx))

    # Create (or truncate) the log file of this fold
    with open(logfile, "w") as log:
        pass

    # Augmentation pipelines; presumably provided by `from tools import *` - verify
    traing_aug = aug_train()
    validation_aug = aug_val()

    curr_lr = 3e-4

    # Datasets of this fold; both read their images from train_im_dir
    train_dataset = DataGenerator(folds_id_train[valid_idx],
                                  folds_label_train[valid_idx],
                                  traing_aug, train_im_dir)
    val_dataset = DataGenerator(folds_id_val[valid_idx],
                                folds_label_val[valid_idx],
                                validation_aug, train_im_dir)

    # The sampler must be built on the dataset it is used with: RandomSampler draws
    # indices in range(len(data_source)). It used to be built on the validation set,
    # which restricted training to the first len(validation set) training samples.
    train_sampler = torch.utils.data.RandomSampler(train_dataset,
                                                   replacement=True,
                                                   num_samples=samples_per_epoch)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               pin_memory=False,
                                               num_workers=4,
                                               batch_size=b_size,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             pin_memory=False,
                                             num_workers=1,
                                             batch_size=b_size)

    # LOSS FUNCTION: BINARY CROSS ENTROPY
    loss_f = nn.BCELoss()

    best_score = 0
    best_loss = 1e5
    idx_stop = 0 # consecutive epochs without an improvement of the validation loss

    # LOAD RESNET34 - PRETRAINED ON IMAGENET
    base_model = pretrainedmodels.resnet34(num_classes=1000,
                                           pretrained='imagenet').to(device)

    # BUILD THE MODEL ON TOP OF THE PRETRAINED BACKBONE
    model = Net(base_model, 512).to(device)

    # WARM-UP: learning rate 0 for layer0..layer4, so only the classification head
    # (model.classif) is updated, with lr=0.05
    optimizer = optim.SGD([{'params': model.layer0.parameters(), 'lr': 0},
                           {'params': model.layer1.parameters(), 'lr': 0},
                           {'params': model.layer2.parameters(), 'lr': 0},
                           {'params': model.layer3.parameters(), 'lr': 0},
                           {'params': model.layer4.parameters(), 'lr': 0},
                           {'params': model.classif.parameters()}], lr=0.05, momentum=0.9)

    # LAUNCH TRAIN, TEST AND WRITE LOG
    train_loss = train(model, train_loader, optimizer, 0, 100, loss_f, samples_per_epoch, device)
    test_loss, score = test(model, val_loader, loss_f, 0, device)
    write_log(logfile, train_loss, test_loss, score, lr = "not available")

    # Start training the model with all layers.
    # Training scheme: train while the validation loss decreases and save the model at
    # each improvement of the test loss. If the loss does not decrease for 3 epochs,
    # reload the last best model and reduce lr by a factor of 2. If the loss still
    # does not decrease for 10 epochs, stop training this fold.
    for epoch in range(50):
        # The optimizer is rebuilt every epoch, which is how a changed curr_lr takes effect
        optimizer = torch.optim.SGD(model.parameters(), lr=curr_lr, momentum=0.9)
        # NOTE(review): the scheduler is not passed to train()/test() and is never stepped
        # in this cell - confirm whether constructing it is all that is intended.
        scheduler = CyclicLR(optimizer, max_lr=3*curr_lr)

        # LAUNCH TRAIN, TEST AND WRITE LOG
        # Pass the device explicitly, exactly like the warm-up calls above.
        train_loss = train(model, train_loader, optimizer, epoch, 100, loss_f, samples_per_epoch, device)
        test_loss, score = test(model, val_loader, loss_f, epoch, device)
        write_log(logfile, train_loss, test_loss, score)

        if test_loss<best_loss:
            print('Test loss improved from {} to {}, saving'.format(best_loss, test_loss))
            best_loss = test_loss
            torch.save(model.state_dict(), best_w_path)
            idx_stop = 0
        else:
            print('Loss {}, did not improve from {} for {} epochs'.format(test_loss, best_loss, idx_stop))
            idx_stop += 1
        if idx_stop>3:
            # NOTE(review): idx_stop is not reset here, so this fires again on every
            # further epoch without improvement - confirm that is intended.
            print('Reducing LR by two and reloading best model')
            model.load_state_dict(torch.load(best_w_path))
            curr_lr = curr_lr/2
        if idx_stop>10:
            print('Stopping the model')
            torch.save(model.state_dict(), es_w_path)
            # Actually stop: without this the loop kept training for all 50 epochs
            break
