# Library

In [None]:
from typing import List, Dict, Any, Tuple, Callable, Union, Optional

import re
import torch
from torch import nn

from scripts.default_setting import *

from GGUtils.utils.path import do_or_load, GetAbsolutePath
from GGDL.utils import get_verbose, get_process_id, code_test, set_seed_everything, make_basic_directory, GetDevice, Option
from GGDL.idx_dict.key_df import make_basic_key_df, binary_label_convertor
from GGDL.idx_dict.make_dict import StratifiedIndexdict
from GGDL.data_loader.classification import BasicDataset
from GGDL.model.vision import TimmHelper
from GGDL.model.optimzer import OptimHelper, SchedulerHelper
from GGDL.data_loader.utils import LabelDtype
from GGDL.pipeline.log import Log
from GGDL.metrics.classification import MetricsClassification

from GGImgMorph.scenario import sample_augment      # 증강 알고리즘

from GGStatify.descript import frequency_table

# Option
### Option 1. basic option

In [None]:
# Check the GPU status
GET_DEVICE = GetDevice()
GET_DEVICE.summary()

In [None]:
# Basic process settings
#######################################
VERBOSE = True                  # print the process status
CODE_TEST = False               # whether to run as a code test
SAVE_LOG = True                 # whether to save the log
GPU = 0                         # GPU number
# Whether to re-initialize the default directories that hold the files
# created while the process runs
MAKE_NEW_DEFAULT_DIR = True
MAKE_NEW_IDX_DICT = True        # whether to build a new idx_dict
MODEL_TYPE = "classification"   # kind of model
IS_BINARY = True                # whether this is binary classification


# Data settings
#######################################
IMG_SIZE = 224                  # image size

# How the index dictionary is built
K_FOLD = 5                      # k-fold size (stratified sampling)
TEST_RATIO = 0.2                # test dataset ratio
VALID_RATIO = 0.1               # validation dataset ratio; not created when None


# Automatic settings (derived from the options above)
#######################################
_VERBOSE = get_verbose(verbose=VERBOSE)
_PROCESS_ID = get_process_id(verbose=_VERBOSE)
_CODE_TEST = code_test(do=CODE_TEST, test_ratio=0.1, verbose=_VERBOSE)

# Device settings
_DEVICE = GET_DEVICE(GPU)
# Pin the current CUDA device only when CUDA is usable: torch.cuda.set_device
# raises on a CPU-only machine, which would abort the notebook here.
# NOTE(review): assumes GET_DEVICE(GPU) yields a CUDA device whenever CUDA is
# available - confirm against GetDevice.
if torch.cuda.is_available():
    torch.cuda.set_device(_DEVICE)

### Option 2. model

In [None]:
# Search timm for the name of the model to use.
model_name_ptn = r"efficient.+_b3"
TimmHelper.search(model_name_ptn)

In [None]:
# Model settings
#######################################
# Model name
MODEL_NAME = 'efficientnet_b3.ra2_in1k'
# Whether to use pre-trained weights
PRETRAINED = True
# Fine-tuning method (the 'how' entry):
#     0: FullFineTuning
#     1: FixedFeatureExtractor
#     2: PartialLayerFreezing
#     3: FreezeNUnfreeze
TUNER_DICT = {
    'how': 2,
    'freezing_ratio': 0.9,
}
# Number of image channels
IMG_CHANNEL = 3
# Number of classes the model infers
CLASS_SIZE = 0
# Metrics used for model inference.
# Driven by IS_BINARY so it stays in sync with the index-dictionary builder,
# which reads the same flag, instead of hard-coding True here.
METRICS = MetricsClassification(is_binary=IS_BINARY, threshold=0.5)

# header 관련 설정
#######################################
from scripts.header import custom_binary_header, EXTRA_ACTIVATION_FN

# New header for the backbone model
CUSTOM_HEAD_FN = custom_binary_header
# Loss function
LOSS_FN = nn.BCEWithLogitsLoss()
# Set the label dtype to match the loss function
LABEL_DTYPE_INS = LabelDtype(loss_fn=LOSS_FN)

### Option 3. Pipeline setting

In [None]:
# Settings used inside the pipeline
#######################################
USE_AMP = True          # whether to use AMP
USE_CLIPPING = True     # whether to use gradient clipping

# Early-stopping settings
EARLY_STOPPING = dict(
    patience=10,
    delta=0.0,
    target="loss",
    auto_remove=True,
    best_model_dir=f"{RESULT}/{_PROCESS_ID}",
)

# Data-loader settings
#######################################
# Dataset class
DATASET_CLASS = BasicDataset

# Keyword arguments for the training dataset
TRAINSET_KWARGS = dict(
    augments=sample_augment,
    resize=IMG_SIZE,
    resize_how=0,                       # resize method
    resize_how_list=[2, 3, 4],          # candidate methods when resizing at random
    resize_padding_color="random",      # pixel color used when resize pads
)

# Keyword arguments for the validation dataset
VALIDSET_KWARGS = dict(
    resize=IMG_SIZE,
    resize_how=2,                       # resize method
    resize_padding_color="black",       # pixel color used when resize pads
)

# num_worker of the DataLoader
WORKER = 4

### Option 4. Hyper Parameter

In [None]:
# Hyper-parameter settings
#######################################
HP_DICT = {
    "epochs": 1000,
    "batch_size": 64,

    # Optimizer
    "lr": 1e-5,
    "betas": (0.9, 0.999),
    "eps": 1e-8,

    # Scheduler
    "T_0": 20,
    "T_mult": 1,
    "eta_min": 1e-6,

    # Gradient clipping
    "clipping_max_norm": 5,
}

# Define the optimizer helper
OPTIM_HELPER = OptimHelper(name='Adam', hp_dict=HP_DICT)
# Define the scheduler helper
SCHEDULE_HELPER = SchedulerHelper(name='CosineAnnealingWarmRestarts', hp_dict=HP_DICT)

# Process
### Process 1. make key_df
* key_df는 이미지의 절대 경로(`path`)와 레이블(`label`) 두 개의 컬럼으로 구성된 DataFrame이다.

In [None]:
# Path information
TRAIN_SET = "/mnt/d/rawdata/dogs-vs-cats/train/"        # path to the train set
TEST_SET = "/mnt/d/rawdata/dogs-vs-cats/test1/"         # path to the test set


# Build key_df
path_list = GetAbsolutePath(None).get_all_path(parents_path=TRAIN_SET)
key_df = make_basic_key_df(
    paths=path_list,
    # Label = file name (text after the last '/') up to its first '.'
    labels=[
        re.split(r".+/", path, maxsplit=1)[1].partition('.')[0]
        for path in path_list
    ],
)
# Convert the label to a binary label, with 'dog' as the positive class
key_df['label'] = binary_label_convertor(array=key_df['label'], positive_class='dog')

# Display key_df to aid understanding
key_df

### Process 2. make idx_dict

In [None]:
# Path of the idx_dict
IDX_DICT_PATH = f"{SOURCE}/idx_dict.pickle"

# Create the basic directories
make_basic_directory(
    source=SOURCE,
    estop_dir=ESPOINT_DIR,
    log=LOG,
    result=RESULT,
    make_new=MAKE_NEW_DEFAULT_DIR,
)

# Set up the process log
LOG_INS = Log(
    log_dir=LOG,
    process_id=_PROCESS_ID,
    model_type=MODEL_TYPE,
    save_log=SAVE_LOG,
)

# Build the index dictionary
make_index_dict = StratifiedIndexdict(columns=['label'], is_binary=IS_BINARY)
IDX_DICT = do_or_load(
    savepath=IDX_DICT_PATH,
    makes_new=MAKE_NEW_IDX_DICT,
    fn=make_index_dict,
    # Remaining keyword arguments are passed exactly as in the do_or_load call
    key_df=key_df,
    k_fold_size=K_FOLD,
    test_ratio=TEST_RATIO,
    valid_ratio=VALID_RATIO,
    code_test=_CODE_TEST,
)

### Process 3. make option instance

In [None]:
# Bundle every setting defined above into a single Option instance
option = Option(
    # process / logging
    process_id=_PROCESS_ID,
    verbose=_VERBOSE,
    log_ins=LOG_INS,

    # model
    model_name=MODEL_NAME,
    pretrained=PRETRAINED,
    device=_DEVICE,
    use_amp=USE_AMP,
    use_clipping=USE_CLIPPING,
    img_channel=IMG_CHANNEL,
    class_size=CLASS_SIZE,
    custom_header=CUSTOM_HEAD_FN,
    extra_activation_fn=EXTRA_ACTIVATION_FN,
    metrics=METRICS,

    # optimization
    optim_helper=OPTIM_HELPER,
    scheduler_helper=SCHEDULE_HELPER,
    loss_fn=LOSS_FN,
    label_dtype_fn=LABEL_DTYPE_INS,
    tuner_dict=TUNER_DICT,
    hp_dict=HP_DICT,

    # data loading
    dataset_class=DATASET_CLASS,
    trainset_kwargs=TRAINSET_KWARGS,
    validset_kwargs=VALIDSET_KWARGS,
    worker=WORKER,

    # index dictionary and output locations
    idx_dict=IDX_DICT,
    results_parents=RESULT,
    espoint_parents=f"{SOURCE}/{ESPOINT_DIR}",

    # early stopping
    early_stopping=EARLY_STOPPING,
)

### Process 4. model training

In [None]:
from GGDL.pipeline.pipeline import BackPropagation, EarlyStopping
from GGDL.model.fine_tuning import Tuner
from GGDL.model.vision import Classification as Model
from GGDL.data_loader.classification import GetLoader
from GGDL.pipeline.fit.classification import Classification as Fit
from GGUtils.utils.path import new_dir_maker

In [None]:
# Fix every seed before training
set_seed_everything(seed=SEED)

# k-fold cross validation: this walkthrough uses only the first fold.
# Select it explicitly and fail fast on an empty idx_dict, rather than
# leaving `k` / `k_idx_dict` unbound (or stale from an earlier notebook run).
fold_keys = list(option.idx_dict.keys())
if not fold_keys:
    raise ValueError("option.idx_dict is empty; there is no fold to train on.")
k = fold_keys[0]
k_idx_dict = option.idx_dict[k]     # idx_dict of the selected fold

In [None]:
# Set the fold key on the log
option.log_ins.k = k

# Create the directory where the results are stored
new_dir_maker(dir_path=f"{option.results_parents}/{option.process_id}")

# Define the data loaders
loader = GetLoader(
    idx_dict=k_idx_dict,
    dataset_class=option.dataset_class,
    batch_size=option.hp_dict['batch_size'],
    workers=option.worker,
)
loader(key="train", **option.trainset_kwargs)
loader(key="test", **option.validset_kwargs)
# The validation loader is built only when the fold has a "valid" split
if "valid" in k_idx_dict:
    loader(key="valid", **option.validset_kwargs)

# Define the model
model = Model(
    model_name=option.model_name,
    pretrained=option.pretrained,
    channel=option.img_channel,
    class_size=option.class_size,
    custom_head_fn=option.custom_header,
).to(option.device)

# Set up the optimizer
optimizer = option.optim_helper(param=model.parameters())

# Clipping norm is optional in the hyper-parameter dictionary
if 'clipping_max_norm' in option.hp_dict:
    clipping_max_norm = option.hp_dict['clipping_max_norm']
else:
    clipping_max_norm = None

back_propagation = BackPropagation(
    optimizer=optimizer,
    use_amp=option.use_amp,
    use_clipping=option.use_clipping,
    max_norm=clipping_max_norm,
    device=option.device,
)

# Set up the scheduler
option.scheduler_helper(optimizer)

# Set up early stopping (stays None when it is not configured)
early_stopping = None
if option.early_stopping is not None:
    early_stopping = EarlyStopping(
        model=model,
        path=f"{option.espoint_parents}/{option.process_id}.pt",
        **option.early_stopping
    )
    early_stopping.k = k

# Define the fine-tuning method
tuner = Tuner(model, **option.tuner_dict)
tuner(epoch=0)      # initial change of the model parameters

In [None]:
from GGDL.pipeline.fit.utils import get_lr

In [None]:
# Build the fit instance from the objects prepared in the previous cells
TEST_INS = Fit(
    model=model, loader=loader, optimizer=optimizer, 
    back_propagation=back_propagation, early_stopping=early_stopping, option=option
)

In [None]:
# Manual epoch loop: train, validate, apply early stopping, step the scheduler
# and log, once per epoch.
epoch_log_txt = None
for bar_txt, epoch in TEST_INS.pbar_ins(range(option.hp_dict['epochs'])):

    # Reset every epoch so the log call below never hits an unbound name when
    # there is no validation loader or early stopping is inactive.
    # NOTE(review): assumes _epoch_log accepts None for es_log_txt - confirm.
    es_log_txt = None

    # Log text at the start of the epoch
    epoch_log_txt = TEST_INS.option.log_ins.epoch_log_txt(epoch_log_txt=epoch_log_txt, bar_txt=bar_txt)

    # Model training
    train_loss, train_acc = TEST_INS._fit_iterator(epoch, epoch_log_txt)

    # Validation and early stopping
    if TEST_INS.loader.valid is not None:
        valid_loss, valid_acc, _ = TEST_INS.inference(loader=TEST_INS.loader.valid)

        # Early stopping
        if TEST_INS.early_stopping_mask:
            valid_score = TEST_INS.early_stopping.validation_score(loss=valid_loss, acc=valid_acc)
            es_log_txt = TEST_INS.early_stopping(epoch, score=valid_score)
    else:
        valid_loss, valid_acc = (None, None)

    # Step the scheduler; fall back to the train loss without a validation loss
    scheduler_score = valid_loss if valid_loss is not None else train_loss
    TEST_INS.option.scheduler_helper.epoch_step(score=scheduler_score)

    # Print the log for this epoch
    epoch_log_txt = TEST_INS._epoch_log(
        bar_txt, es_log_txt, epoch,
        train_acc, train_loss, valid_acc, valid_loss
    )
    # Stop when early stopping asks for it; early stopping may be None when it
    # is not configured, so guard the attribute access.
    if TEST_INS.early_stopping is not None and TEST_INS.early_stopping.stop:
        break

# After training, build the metrics for the test set and print the log
test_predict_dict = TEST_INS.end_of_fit(epoch, bar_txt, train_acc, train_loss, valid_acc, valid_loss)