# Summary

Improved model.

## Import libs

In [None]:
import os
import math
import random

# For data manipulation
import pandas as pd

from matplotlib import pyplot as plt

# Pytorch Imports
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# Utils
from tqdm import tqdm

from sklearn.model_selection import GroupKFold

In [None]:
from hcd_3_common import (
    fetch_scheduler_optimizer,
    generate_submit_csv,
    get_optim_params,
    load_weights,
    prepare_loaders,
    plot_history,
    run_training,
    set_seed,
    HCDDataset,
    data_transforms,
    HCDModel_Res50,
    CONFIG,
    IS_INTERACTIVE,
)

## Configurations

In [None]:
# Root of the competition dataset: train_labels.csv plus the train/ and test/ image folders.
ROOT_FOLDER = "/kaggle/input/histopathologic-cancer-detection"
# Patch id -> WSI (whole-slide image) mapping, taken from
# https://github.com/azkalot1/Histopathologic-Cancer-Detection/blob/master/patch_id_wsi.csv
# Patches cut from the same WSI end up on both sides of a naive train/validation split,
# which leaks information: the validation set then looks like the training set.
# This mapping is used to pick a validation set without that leak.
PATCH_ID_WSI_PATH = "/kaggle/input/hcd-patch-id-wsi/patch_id_wsi.csv"

# Pseudo-labelling inputs: a submission CSV whose "label" column is read as a score per
# test image, and the folder that holds those test images.
PSUDO_CSV_PATH = "/kaggle/input/hcd-3-inference-result/submission.csv"
PSUDO_FOLDER = "/kaggle/input/histopathologic-cancer-detection/test"
# Confidence cut-off: only rows with label >= PSUDO_THR or label <= 1 - PSUDO_THR are kept.
PSUDO_THR = 0.999

# Presumably a checkpoint from an earlier run; it is not referenced in the cells shown here.
PREVIOUS_BEST_WEIGHT = "/kaggle/input/hcd-3-improve-model/AUROC0.92_Loss0.1925_epoch8.bin"

In [None]:
# Notebook-specific overrides of the shared CONFIG; these are the knobs changed most often.
CONFIG.update(
    {
        "weights_name": "IMAGENET1K_V1",  # None means no pretrained weights
        "dropout_p": 0.8,
        "val_iterations": 50,  # validate every this many training iterations
        "epochs": 32,
        "stop_epochs": 4,  # stop when the loss has not improved for this many epochs
        "train_batch_size": 256,
        "valid_batch_size": 256,
        "train_size": 153600,
        "val_size": 1280,
        "psudo_train_size": 0,  # 0 skips pseudo-labelled data, None uses all of it
        "tta_size": 4,
        "learning_rate": 1e-4,
        "min_lr": 1e-6,
        "img_size": 96,
        "k_fold": 5,
    }
)

# Without an explicit interval, validate once per epoch.
if CONFIG["val_iterations"] is None:
    CONFIG["val_iterations"] = CONFIG["train_size"] // CONFIG["train_batch_size"]

# Alternative scheduler: ReduceLROnPlateau
# CONFIG["scheduler"] = "ReduceLROnPlateau"
# CONFIG["rlrop_patience"] = 1
# CONFIG['rlrop_thr'] = 1e-3

# Active scheduler: cosine annealing, with T_max set to the total iteration count
# (iterations per epoch times the number of epochs).
CONFIG["scheduler"] = "CosineAnnealingLR"
CONFIG["T_max"] = CONFIG["train_size"] * CONFIG["epochs"] // CONFIG["train_batch_size"]

# Alternative scheduler: CyclicLR, one full cycle every 2 epochs (1 epoch up, 1 epoch down)
# CONFIG["scheduler"] = "CyclicLR"
# CONFIG["cl_base_lr"] = 1e-6
# CONFIG["cl_max_lr"] = 1e-4
# CONFIG["cl_step_size"] = int(CONFIG["train_size"] / CONFIG["train_batch_size"])

print("CONFIG:", CONFIG)

In [None]:
# Seed from the shared CONFIG before any data is loaded or sampled. The DataFrame.sample
# calls in later cells pass no random_state, so their reproducibility presumably relies
# on set_seed seeding the global RNGs — TODO confirm in hcd_3_common.
set_seed(CONFIG['seed'])

## Load data

In [None]:

def _train_image_path(image_id):
    """Return the path of the training .tif image that belongs to ``image_id``."""
    return os.path.join(ROOT_FOLDER, "train", f"{image_id}.tif")


# Training labels, extended with the on-disk location of every labelled image.
df = pd.read_csv(os.path.join(ROOT_FOLDER, "train_labels.csv"))
df["file_path"] = df["id"].map(_train_image_path)

In [None]:
# Attach the source WSI of every patch, then shuffle the rows.
patch_id_wsi_df = pd.read_csv(PATCH_ID_WSI_PATH)
# Number of distinct WSI values in the mapping (a missing value counts as one).
print(patch_id_wsi_df["wsi"].nunique(dropna=False))
# Inner join on "id": only patches present in both frames are kept.
df = df.merge(patch_id_wsi_df, on="id").sample(frac=1)

In [None]:
# Optional pseudo-labelled training data, built from the scores in PSUDO_CSV_PATH.
# Stays None when CONFIG["psudo_train_size"] is 0, in which case nothing is read.
psudo_df = None

if CONFIG["psudo_train_size"] != 0:
    psudo_df = pd.read_csv(PSUDO_CSV_PATH)
    psudo_df["file_path"] = psudo_df["id"].apply(lambda image_id: os.path.join(PSUDO_FOLDER, f"{image_id}.tif"))

    # Keep only confident rows: a score at least PSUDO_THR, or at most 1 - PSUDO_THR.
    is_confident = (psudo_df["label"] >= PSUDO_THR) | (psudo_df["label"] <= (1 - PSUDO_THR))
    # .copy() detaches the filtered rows from the frame read above, so the label rewrite
    # below is a plain column assignment rather than a write into a slice
    # (which pandas flags with SettingWithCopyWarning).
    psudo_df = psudo_df[is_confident].copy()

    # Turn the scores into hard 0/1 labels.
    psudo_df["label"] = (psudo_df["label"] > 0.5).astype(int)
    print("total psudo len:", len(psudo_df))

    if CONFIG["psudo_train_size"] is not None:
        # Cap the request at the rows available: sample() raises ValueError when asked
        # for more rows than the frame holds (sampling without replacement).
        psudo_df = psudo_df.sample(n=min(CONFIG["psudo_train_size"], len(psudo_df)))
    

In [None]:
# K-fold split grouped by WSI: GroupKFold never puts the same group in both the train
# and the test side of a split, so patches of one WSI stay together.
group_fold = GroupKFold(n_splits=CONFIG["k_fold"])

sample_ids = df["id"].values
folds_id_train = []
folds_id_val = []

fold_splits = group_fold.split(sample_ids, df["label"].values, df["wsi"].values)
for fold_idx, (train_index, test_index) in enumerate(fold_splits):
    print("fold", fold_idx, "total train:", len(train_index))
    # Store image ids rather than positional indices for each side of the split.
    folds_id_train.append(sample_ids[train_index])
    folds_id_val.append(sample_ids[test_index])

# fold_idx ends up holding the number of folds produced.
fold_idx = len(folds_id_train)

## Run training

In [None]:
# Train one model per fold and collect the model, its history and its best AUROC.
models = []
histories = []
best_epoch_aurocs = []

for fold_idx, (fold_train_ids, fold_val_ids) in enumerate(zip(folds_id_train, folds_id_val)):
    # Draw the validation subset first, then the training subset. The two id sets come
    # from opposite sides of the same split, so validation images never appear in training.
    in_val_fold = df["id"].isin(fold_val_ids)
    val_df = df[in_val_fold].sample(n=CONFIG["val_size"])
    in_train_fold = df["id"].isin(fold_train_ids)
    train_df = df[in_train_fold].sample(n=CONFIG["train_size"])

    if psudo_df is not None:
        # Appended at the end without shuffling; the dataloader is expected to shuffle.
        train_df = pd.concat([train_df, psudo_df])

    train_dataloader, val_dataloader = prepare_loaders(
        train_df,
        val_df,
        tta_size=CONFIG["tta_size"],
    )

    # Fresh model, optimizer and scheduler for every fold.
    model = HCDModel_Res50(
        num_classes=CONFIG["num_classes"],
        weights=CONFIG["weights_name"],
        dropout_p=CONFIG["dropout_p"],
    )
    model.to(CONFIG["device"])

    scheduler, optimizer = fetch_scheduler_optimizer(get_optim_params(model))

    model, history, best_epoch_loss, best_epoch_auroc = run_training(
        model,
        optimizer,
        scheduler,
        device=CONFIG["device"],
        num_epochs=CONFIG["epochs"],
        stop_epochs=CONFIG["stop_epochs"],
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        k_fold=fold_idx,
        val_iterations=CONFIG["val_iterations"],
    )

    models.append(model)
    histories.append(history)
    best_epoch_aurocs.append(best_epoch_auroc)


print("Average best AUROCS:", sum(best_epoch_aurocs)/len(best_epoch_aurocs))

## Plot results

In [None]:
# Save and plot the training history of every fold.
for k_folder, history in enumerate(histories):
    print("History of folder", k_folder)
    history = pd.DataFrame.from_dict(history)
    # One CSV per fold, so that every fold's history survives on disk.
    history.to_csv(f"history_fold{k_folder}.csv", index=False)
    # "history.csv" is still written for anything that already reads it. It is
    # overwritten on each iteration and therefore ends up holding the last fold only.
    history.to_csv("history.csv", index=False)

    plot_history(history, "Loss")
    plot_history(history, "AUROC")