# Summary

In general, I plan to use transfer learning: load different pre-trained models as a base, and then train my own model on top of them.

As a baseline, I start with a small, fast model, "tf_efficientnet_b0", trained on the original images.


## Import libs

In [1]:
import os
import math

# For data manipulation
import pandas as pd

# Pytorch Imports
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# Utils
from tqdm import tqdm

In [2]:
from hcd_2_common import (
    fetch_scheduler,
    generate_submit_csv,
    load_weights,
    prepare_loaders,
    plot_history,
    run_training,
    set_seed,
    HCDDataset,
    data_transforms,
    HCDModel,
    CONFIG,
    IS_INTERACTIVE,
)

## Configurations

In [3]:
# Root of the competition input data; train_labels.csv and the train/ image
# folder are resolved relative to it in the "Load data" section.
ROOT_FOLDER = "/kaggle/input/histopathologic-cancer-detection"
# Checkpoint from an earlier run, handed to load_weights() before training.
# NOTE(review): the file name suggests AUROC 0.94 / loss 0.1720 at epoch 13 — confirm.
PREVIOUS_BEST_WEIGHT = "/kaggle/input/hcd-2-baseline-model-original-images/AUROC0.94_Loss0.1720_epoch13.bin"

In [4]:
# Per-run overrides: these are the settings most often changed between runs.
# Everything else keeps the default that hcd_2_common put into CONFIG.
CONFIG.update(
    {
        "epochs": 8,
        "train_size": 8000,
        "val_size": 800,
        "generate_submit": True,
        "force_retrain": True,
    }
)

print("CONFIG:", CONFIG)

CONFIG: {'seed': 42, 'epochs': 8, 'img_size': 96, 'model_name': 'tf_efficientnet_b0', 'num_classes': 2, 'train_batch_size': 32, 'valid_batch_size': 32, 'learning_rate': 0.0001, 'scheduler': 'CosineAnnealingLR', 'min_lr': 1e-06, 'T_max': 500, 'weight_decay': 1e-06, 'train_size': 8000, 'val_size': 200, 'n_accumulate': 1, 'device': device(type='cpu'), 'generate_submit': True, 'force_retrain': True}


In [5]:
# Seed the run from the configured seed before any sampling or training.
# NOTE(review): which RNGs set_seed covers is defined in hcd_2_common — confirm.
set_seed(CONFIG['seed'])

## Load data

In [6]:

# Read the label table and derive, for every image id, the path of its .tif
# file inside the train/ folder of the competition data.
df = pd.read_csv(os.path.join(ROOT_FOLDER, "train_labels.csv")).assign(
    file_path=lambda labels: labels["id"].map(
        lambda image_id: os.path.join(ROOT_FOLDER, "train", f"{image_id}.tif")
    )
)

In [7]:
# Hold out the validation rows first, then draw the training rows from what is
# left. Passing random_state pins both draws to the configured seed, so
# re-running this cell always yields the same split instead of depending on
# whatever state the global RNG happens to be in.
val_df = df.sample(n=CONFIG["val_size"], random_state=CONFIG["seed"])
val_ids = val_df["id"].values
# Make sure validation images are not in train data set
train_df = df[~df["id"].isin(val_ids)].sample(
    n=CONFIG["train_size"], random_state=CONFIG["seed"]
)


In [10]:
# Wrap the sampled train/validation frames into the two loaders used by run_training.
# NOTE(review): batch sizes and transforms are presumably taken from CONFIG /
# data_transforms inside prepare_loaders — confirm in hcd_2_common.
train_loader, valid_loader = prepare_loaders(train_df, val_df)

## Run training

In [None]:
# T_max (the max iterations) for the whole run:
# (number of training rows * epochs) floor-divided by the training batch size.
CONFIG["T_max"] = len(train_df) * CONFIG["epochs"] // CONFIG["train_batch_size"]
CONFIG["T_max"]

In [None]:
# Instantiate the model from the configured backbone name and class count.
model = HCDModel(CONFIG['model_name'], CONFIG['num_classes'])

# Try to start from the earlier checkpoint; the result is kept because the
# training cell uses it to decide whether training can be skipped.
# NOTE(review): presumably falsy when the checkpoint could not be loaded — confirm.
weights_loaded = load_weights(model, PREVIOUS_BEST_WEIGHT)

# Move the model to the configured device; the trailing ';' hides the cell output.
model.to(CONFIG['device']);

In [None]:
# history stays None when training is skipped; the plotting cell checks for that.
history = None

# Training is skipped only when a checkpoint was loaded AND a retrain is not forced.
if not CONFIG['force_retrain'] and weights_loaded:
    print("Skip retrain model!")
else:
    # Adam over all model parameters, with learning rate and weight decay from CONFIG.
    optimizer = optim.Adam(
        model.parameters(),
        lr=CONFIG['learning_rate'],
        weight_decay=CONFIG['weight_decay'],
    )
    scheduler = fetch_scheduler(optimizer)

    # run_training returns the model together with the per-epoch history.
    model, history = run_training(
        model,
        optimizer,
        scheduler,
        device=CONFIG['device'],
        num_epochs=CONFIG['epochs'],
        train_loader=train_loader,
        valid_loader=valid_loader,
    )

 48%|████▊     | 119/250 [01:51<01:58,  1.10it/s, Epoch=1, Epoch_Time=111, LR=9.91e-5, Train_AUROC=0.928, Train_Acc=0.933, Train_Loss=0.177]

## Plot results

In [None]:
# Only available when training actually ran in this session.
if history is not None:
    # Convert the history to a DataFrame and persist it next to the notebook.
    history = pd.DataFrame.from_dict(history)
    history.to_csv("history.csv", index=False)

    # One plot per tracked metric, in the same order as before.
    for _metric_name in ("Loss", "AUROC", "Accuracy"):
        plot_history(history, _metric_name)

## Inference

In [None]:
# Submission generation is opt-in through CONFIG["generate_submit"].
if not CONFIG["generate_submit"]:
    print("Skip generating submit csv")
else:
    # Hand the current model, the data root and the transforms to the helper.
    generate_submit_csv(model, ROOT_FOLDER, data_transforms)