In [17]:
import errno
import glob
import json
import os
import re
import torch
from torch.utils.tensorboard import SummaryWriter
from common_utils import read_csv, create_holdout_loader, stratify_split, make_holdout_df

from logger import init_logger
from train_utilities.trainer import Trainer
import config

In [None]:
def main(experiment_name, resume, settings, train_fold=None):
    """Set up an experiment (fresh or resumed) and run the trainer on it.

    A fresh run creates the experiment directory, splits the training CSV
    into folds plus a holdout set, and saves the folds, the holdout set and
    the settings into that directory. A resumed run reads those three files
    back instead of recomputing them.

    Args:
        experiment_name: sub-directory of ``settings.save_dir`` holding the
            experiment; also used as the TensorBoard run name.
        resume: forwarded to ``get_checkpoint_params``; truthy to continue an
            existing experiment.
        settings: experiment configuration, read through attribute access
            (``save_dir``, ``data_dir``, ``debug``, ``fold_num``, ``seed``,
            ``target_col``).
        train_fold: optional fold index in ``range(settings.fold_num)``,
            forwarded to ``get_checkpoint_params``.

    Raises:
        ValueError: if ``train_fold`` is given but is not a valid fold index.
    """
    # Raise explicitly instead of asserting: asserts vanish under ``python -O``.
    if train_fold is not None and train_fold not in range(settings.fold_num):
        raise ValueError(
            f'train_fold must be None or in range({settings.fold_num}), got {train_fold!r}')
    experiment_dir = settings.save_dir + f'/{experiment_name}'

    # Bound before the try so the cleanup below is safe even when setup fails
    # before the writer is created.
    tb_writer = None
    try:
        # -------- SETUP --------
        # if resuming, get checkpoint parameters
        checkpoint_params = get_checkpoint_params(experiment_dir, resume, train_fold)
        logger = init_logger()
        tb_writer = SummaryWriter(f'./runs/{experiment_name}')
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        if not checkpoint_params:
            make_experiment_directory(experiment_dir)

            # -------- LOAD DATA FROM TRAIN FILE --------
            data_df = read_csv(settings.data_dir + '/train.csv', settings.debug)
            holdout_df = make_holdout_df(data_df)
            folds_df = stratify_split(data_df, settings.fold_num, settings.seed, settings.target_col)

            # -------- SAVE FILES (for experiment state) --------
            folds_df.to_csv(experiment_dir + '/folds.csv', index=False)
            # save holdout to a csv file for final inference (so we don't run inference on training examples)
            holdout_df.to_csv(experiment_dir + '/holdout.csv', index=False)
            # save the settings for this experiment to its directory
            # NOTE(review): this only works if `settings` is JSON-serializable
            # (e.g. a dict subclass) - confirm against config.settings.
            with open(experiment_dir + '/settings.json', 'w') as f:
                json.dump(settings, f)
        else:
            # LOAD DATA FROM SAVED FILES
            folds_df = read_csv(experiment_dir + '/folds.csv', settings.debug)
            holdout_df = read_csv(experiment_dir + '/holdout.csv', settings.debug)
            with open(experiment_dir + '/settings.json', 'r') as f:
                saved_settings = json.load(f)
            # json.load returns a plain dict, which would break the
            # attribute-style reads below (settings.data_dir).
            settings = _restore_settings(saved_settings, settings)

        # The loader's second return value (holdout targets) is not needed here.
        holdout_loader, _ = create_holdout_loader(holdout_df, settings.data_dir + '/train_images')

        trainer = Trainer(folds_df=folds_df, holdout_loader=holdout_loader,
                          logger=logger, tensorboard_writer=tb_writer,
                          device=device, checkpoint_params=checkpoint_params,
                          settings=settings, experiment_dir=experiment_dir)
        trainer.fit()
    finally:
        try:
            # Close the writer so pending TensorBoard events reach disk.
            if tb_writer is not None:
                tb_writer.close()
        finally:
            torch.cuda.empty_cache()


def _restore_settings(saved, template):
    """Return settings reloaded from JSON in an attribute-accessible form.

    Args:
        saved: the object produced by ``json.load`` on ``settings.json``.
        template: the in-memory settings object the caller was given.

    Returns:
        ``saved`` unchanged when it is not a dict; an instance of
        ``template``'s own class when that is a dict subclass; otherwise a
        ``types.SimpleNamespace`` built from the saved keys.
    """
    from types import SimpleNamespace

    if not isinstance(saved, dict):
        return saved
    if isinstance(template, dict) and type(template) is not dict:
        # Rebuild the same dict subclass so both item and attribute access
        # keep working the way they did on the original object.
        return type(template)(saved)
    return SimpleNamespace(**saved)

def make_experiment_directory(basename):
    """Create the experiment directory, including missing parent directories.

    If the path already exists, a warning is printed and the function returns
    normally. Any other ``OSError`` (for example a permission problem)
    propagates to the caller without the "already exists" message, which
    would be misleading for those failures.

    Args:
        basename: path of the experiment directory to create.

    Raises:
        OSError: if the directory cannot be created for a reason other than
            it already existing.
    """
    try:
        os.makedirs(basename)
    except FileExistsError:
        # FileExistsError is the OSError subclass for errno.EEXIST.
        print('Experiment already exists. Be sure to resume training appropriately or start a new experiment.')


def get_checkpoint_params(basename, resume, train_fold):
    """
    Build the parameters that tell the trainer where to pick up an experiment.

    We can restart from the middle of a fold or start from the beginning of a fold.

    checkpoint_params: {"restart_from": fold, "start_beginning_of": fold, "checkpoint_file_path": file}
        restart_from (int): start from middle of a fold - typically used when a training session was cancelled mid fold
            checkpoint_file_path (str) is required in this case
        start_beginning_of (int): train a particular fold

    Args:
        basename: experiment directory that is searched for ``*.pth`` files.
        resume: when falsy, nothing is looked up and ``None`` is returned.
        train_fold: fold to train from its beginning, or ``None`` to restart
            from the highest-numbered fold that has a ``.pth`` file.

    Returns:
        ``None`` when not resuming, otherwise a dict with either
        ``start_beginning_of`` or ``restart_from`` + ``checkpoint_file_path``.

    Raises:
        ValueError: if ``train_fold`` already has a checkpoint file, or if a
            restart is requested but no fold checkpoint exists.
    """
    if not resume:
        return None

    # Read fold numbers from the file name only (a directory in the path may
    # itself contain "fold<N>") and as ints, so membership tests against an
    # int train_fold and max() are numeric rather than lexicographic.
    trained_folds = set()
    for model_path in glob.glob(basename + '/*.pth'):
        fold_match = re.search(r'fold(\d+)', os.path.basename(model_path))
        if fold_match:  # skip .pth files that carry no fold number
            trained_folds.add(int(fold_match.group(1)))

    if train_fold is not None:
        if train_fold in trained_folds:
            raise ValueError(
                f'fold {train_fold} already has a checkpoint in {basename}')
        return {'start_beginning_of': train_fold}

    if not trained_folds:
        raise ValueError(f'cannot resume: no fold checkpoints (*.pth) found in {basename}')

    most_recent_fold = max(trained_folds)
    return {
        'restart_from': most_recent_fold,
        'checkpoint_file_path': basename + f'/{config.settings.model_arch}_fold{most_recent_fold}.pth',
    }



In [None]:
if __name__ == '__main__':
    # Script entry point: start a fresh (non-resumed) run with the settings
    # from the config module.
    try:
        print('Running in debug mode:', config.settings.debug)
        # main() names this parameter `experiment_name`; passing any other
        # keyword raises TypeError before training starts.
        main(experiment_name='', resume=False, settings=config.settings)
    except KeyboardInterrupt:
        # Ctrl-C is a deliberate way to stop the run; exit without a traceback.
        pass
