# Cassava Leaf Disease Classification

This notebook builds and trains a model for cassava leaf disease classification for the [Kaggle competition](https://www.kaggle.com/c/cassava-leaf-disease-classification/overview).

## Ideas

1. Cross-entropy loss, stratified CV, no fmix, cutmix, or mixup, with gradient scaling & accumulation [done]
2. Add hyperparameter tuning with Ray Tune [done]
3. Add smoothed cross-entropy loss
4. Add \*mixes (fmix, cutmix, mixup)
5. External data
6. Ensemble of models - train a model for each fold and then average their predictions during inference [done]
7. Train 15-20 epochs [done]
8. Test-time augmentation
9. Better ensemble prediction - majority vote [done], other...?
10. Train a ResNet model
11. Balanced classes instead of stratified?
12. Verify per-class accuracy
13. AdaBound - "as good as SGD and as fast as Adam"

In [None]:
import os
from datetime import datetime, timedelta
import time
import random
import warnings
import joblib
import warnings
import gc
import errno
import shutil

# My modules
from config import Config
from logger import init_logger
from common_utils import (set_seeds, read_csvs, stratify_split, setup_model_optimizer, 
                          get_data_dfs, get_loaders, create_holdout_loader, get_schd_crit)
from model import Model
from train_loop_functions import train_epoch, valid_epoch
from cassava_dataset import CassavaDataset
from early_stopping import EarlyStopping
from trainer import Trainer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

import torch
from torch import nn
from torch.cuda.amp import GradScaler
from torch.utils.tensorboard import SummaryWriter

In [None]:
%load_ext autoreload
%autoreload 2

## Preliminaries

In [None]:
# Read label_num_to_disease_map.json from the data directory; orient='index'
# turns every top-level JSON key into one row of the resulting DataFrame.
label_map = pd.read_json(f'{Config.data_dir}/label_num_to_disease_map.json', orient='index')
# Bare expression at the end of the cell so the notebook renders the table.
label_map

In [None]:
# Apply the configured seed before anything else runs (set_seeds is imported from common_utils).
set_seeds(Config.seed)
# Notebook-wide logger; the training functions below read this global directly.
LOGGER = init_logger() # uses Python's logging framework

## Train
Background reading on structuring a deep learning training loop: https://towardsdatascience.com/deep-learning-model-training-loop-e41055a24b73

In [1]:
# Work in progress: wrap the training loops in an encapsulating class that owns
# all of the state variables, so they no longer have to be threaded through
# every function as parameters.
# NOTE(review): none of the lowercase names passed below are assigned in the
# cells visible here, so this cell cannot run as-is - confirm where they come from.
trainer = Trainer(
    logger,
    tensorboard_writer,
    device,
    train_loader,
    valid_loader,
    holdout_loader,
)
trainer.fit()

NameError: name 'Trainer' is not defined

In [None]:
def train_valid_test(train_folds_df, fold, 
                     device, basename, 
                     holdout_dataloader, holdout_targets, 
                     tb_writer, checkpoint=None):
    """
    Train the model for up to Config.epochs epochs on one fold, then score the
    best saved checkpoint on the holdout set.

    Args:
        train_folds_df: the dataset with a column for fold number.
        fold: an integer representing the fold used for validation.
        device: torch device the model is placed on and checkpoints are loaded to.
        basename: experiment directory; the checkpoint is written to
            '<basename>/<Config.model_arch>_fold<fold>.pth'.
        holdout_dataloader: loader over the holdout examples.
        holdout_targets: true labels of the holdout examples, in loader order.
        tb_writer: TensorBoard writer that receives the per-epoch scalars.
        checkpoint: optional checkpoint forwarded to setup_model_optimizer.

    Returns:
        A tuple (valid_df, best_val_accuracy, holdout_accuracy) where valid_df
        holds only the rows used for validation plus a 'prediction' column with
        the predictions stored in the best checkpoint.
    """
    model_checkpoint_name = basename + f'/{Config.model_arch}_fold{fold}.pth'
    
    # -------- DATASETS AND LOADERS --------
    # select one of the folds, create train & validation set loaders
    train_df, valid_df = get_data_dfs(train_folds_df, fold)
    train_dataloader, valid_dataloader = get_loaders(train_df, valid_df,
                                                     Config.train_bs, 
                                                     Config.data_dir + '/train_images')
    
    # make model and optimizer
    model, optimizer = setup_model_optimizer(Config.model_arch, 
                                           Config.lr, 
                                           Config.is_amsgrad, 
                                           num_labels=train_folds_df.label.nunique(), 
                                           weight_decay=Config.weight_decay,
                                           momentum=Config.momentum,
                                           fc_layer={"middle_fc": False, "middle_fc_size": 0},
                                           device=device,
                                           checkpoint=checkpoint)
    
    scheduler, criterion = get_schd_crit(optimizer)

    # One scaler for the whole fold: GradScaler adapts its loss scale across
    # iterations, and building a new one per epoch would throw that state away.
    grad_scaler = GradScaler()
    
    # NOTE(review): this starts at +inf even when `checkpoint` is given, so the
    # first epoch of a resumed fold always overwrites the checkpoint on disk.
    best_val_loss = float('inf')
    
    early_stop = EarlyStopping('val_loss', LOGGER, patience=Config.loss_patience)

    for e in range(Config.epochs):
        epoch_start_time = time.time()
        LOGGER.info(f'Training epoch {e+1}/{Config.epochs}')
        
        # -------- TRAIN --------
        avg_training_loss = train_epoch(train_dataloader, model, 
                                      criterion, optimizer, 
                                      scheduler, grad_scaler, 
                                      Config.accum_iter, LOGGER,
                                      device, tb_writer, fold, e)

        # -------- VALIDATE --------
        avg_validation_loss, preds = valid_epoch(valid_dataloader, model, 
                                                 criterion, LOGGER, device, 
                                                 tb_writer, fold, e)

        # -------- SCORE METRICS & LOGGING FOR THIS EPOCH --------
        validation_labels = valid_df[Config.target_col].values
        accuracy = accuracy_score(y_true=validation_labels, y_pred=preds)
       
        epoch_elapsed_time = time.time() - epoch_start_time
        
        tb_writer.add_scalar(f'Avg Epoch Train Loss Fold {fold}', avg_training_loss, e)
        tb_writer.add_scalar(f'Avg Epoch Val Loss Fold {fold}', avg_validation_loss, e)
        tb_writer.add_scalar(f'Epoch Val Accuracy Fold {fold}', accuracy, e)
        
        LOGGER.info(f'\nEpoch training summary:\n Fold {fold+1}/{Config.fold_num} | ' + \
                    f'Epoch: {e+1}/{Config.epochs} | ' + \
                    f'Epoch time: {epoch_elapsed_time} sec\n' + \
                    f'Training loss: {avg_training_loss} | ' + \
                    f'Validation loss: {avg_validation_loss} | ' + \
                    f'Accuracy: {accuracy}')
        
        # Checked before saving: an epoch that triggers the stop is not checkpointed.
        early_stop(avg_validation_loss)
        if early_stop.stop: break
            
        # --------SAVE MODEL --------
        # Keep only the checkpoint with the lowest validation loss seen so far.
        if avg_validation_loss < best_val_loss: 
            best_val_loss = avg_validation_loss
            torch.save({'model_state': model.state_dict(), 
                        'optimizer_state': optimizer.state_dict(),
                        'accuracy': accuracy, 
                        'preds': preds,
                        'val_loss': best_val_loss,
                        'fold': fold,
                        'epochs_no_improve': early_stop.counter,
                        'epoch_stopped_at': e
                       }, model_checkpoint_name)
            LOGGER.info(f'Saved model!')
        LOGGER.info('----------------')
        
        # -------- UPDATE LR --------
        # Only ReduceLROnPlateau is stepped here, and only once e > 2; any other
        # scheduler is handed to train_epoch (presumably stepped there - confirm).
        if scheduler and e > 2:
            if Config.scheduler == 'ReduceLROnPlateau':
                scheduler.step(avg_validation_loss)
        gc.collect()

    # -------- TEST ON HOLDOUT SET --------
    # load best model; map_location lets a checkpoint saved on GPU be restored
    # when this run is on a different device (e.g. CPU only)
    checkpoint = torch.load(model_checkpoint_name, map_location=device)
    model.load_state_dict(checkpoint['model_state']) 
    holdout_loss, holdout_preds = valid_epoch(holdout_dataloader, model, 
                                              criterion, LOGGER, device, 
                                              tb_writer, fold, holdout=True)
    holdout_accuracy = accuracy_score(y_true=holdout_targets, y_pred=holdout_preds)
    
    tb_writer.add_scalar(f'Fold {fold} holdout accuracy', holdout_accuracy, fold)
    tb_writer.add_scalar(f'Fold {fold} holdout loss', holdout_loss, fold)
    
    # Predictions come from the best checkpoint, not from the last epoch run.
    valid_df['prediction'] = checkpoint['preds']
    
    # Drop references so the memory can be reclaimed before the next fold.
    del model
    del optimizer
    del train_dataloader
    del valid_dataloader
    return valid_df, checkpoint['accuracy'], holdout_accuracy

In [None]:
def train_folds(device, basename, tb_writer, 
                folds=None, holdout_dataloader=None, holdout_targets=None, checkpoint=None, train_fold=None):
    """
    Train one model per fold, from the starting fold through Config.fold_num - 1,
    and write the aggregated validation predictions to
    '<basename>/aggregated_output_df.csv'.

    Args:
        device: torch device forwarded to train_valid_test.
        basename: experiment directory holding folds.csv, holdout.csv and the
            per-fold checkpoints.
        tb_writer: TensorBoard writer forwarded to train_valid_test.
        folds: DataFrame with a fold column; read from '<basename>/folds.csv'
            when omitted.
        holdout_dataloader, holdout_targets: holdout loader and its labels;
            rebuilt from '<basename>/holdout.csv' when either is omitted.
        checkpoint: optional checkpoint dict to resume from; its 'fold' entry
            selects the starting fold and it is applied to that fold only.
        train_fold: optional fold index to start from when no checkpoint is given.
    """
    LOGGER.info('========== Running training ==========\n')
    aggregated_output_df = pd.DataFrame()
    accuracies = []

    if checkpoint: # resume from middle of a fold
        starting_fold = checkpoint['fold']
    elif train_fold is not None:
        starting_fold = train_fold
    else:
        starting_fold = 0

    # Use what the caller supplied; fall back to the files saved with the
    # experiment only for inputs that were not passed in.
    if folds is None:
        folds = pd.read_csv(basename + '/folds.csv', engine='python')
    if holdout_dataloader is None or holdout_targets is None:
        holdout_df = pd.read_csv(basename + '/holdout.csv', engine='python')
        # NOTE(review): other cells build this path as Config.data_dir + '/train_images';
        # confirm Config.train_img_dir points to the same directory.
        holdout_dataloader, holdout_targets = create_holdout_loader(holdout_df, Config.train_img_dir)

    time_training_start = time.time()

    for fold in range(starting_fold, Config.fold_num):
        # The checkpoint belongs to the fold it was saved for. Later folds must
        # start from scratch, otherwise they just continue training that model.
        fold_checkpoint = checkpoint if fold == starting_fold else None

        # _df is the validation prediction output that we will save to file
        # _df.columns: ['image_id', 'label', 'fold', 'prediction']
        _df, val_accuracy, holdout_accuracy = train_valid_test(
                                                               folds, fold, device,
                                                               basename,
                                                               holdout_dataloader, 
                                                               holdout_targets, 
                                                               tb_writer, fold_checkpoint)

        # NOTE(review): the id/label columns come from the first fold trained only.
        # Column assignment aligns on the index, so predictions of later folds
        # presumably match few or none of these rows - confirm this is intended.
        if aggregated_output_df.empty:
            aggregated_output_df[['image_id', 'label']] = _df[['image_id', 'label']]
        aggregated_output_df[['prediction_fold' + str(fold)]] = _df['prediction']

        accuracies.append((val_accuracy, holdout_accuracy))

        # train_valid_test does not return the holdout loss (it is written to
        # TensorBoard there), so only the two accuracies are logged here.
        LOGGER.info(f'========== fold: {fold+1}/{Config.fold_num} result ==========')
        LOGGER.info(f'Best Validation Accuracy: {val_accuracy}')
        LOGGER.info(f'Holdout Accuracy: {holdout_accuracy}')

    # Cross validation
    time_elapsed_training = time.time() - time_training_start 
    LOGGER.info(f"Training time: {str(timedelta(seconds=time_elapsed_training))}")
    LOGGER.info(f"Accuracy (best val, holdout): {accuracies}")

    # Save result
    aggregated_output_df.to_csv(basename + '/aggregated_output_df.csv', index=False)
    LOGGER.info('========== Training complete ==========\n')

## Main loop

In [None]:
"""
Entry point to training.
experiment_name_dir (required): a name for a directory in ./trained-models 
"""
def run_training(experiment_name_dir, resume, train_fold=None):
    assert train_fold is None or train_fold in range(Config.fold_num)
    basename = Config.save_dir + f'/{experiment_name_dir}'
    
    try:
        # -------- SETUP --------
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        tb_writer = SummaryWriter(f'./runs/{experiment_name_dir}')
        
        # resume training 
        if resume and train_fold:
            train_folds(device=device, basename=basename, tb_writer=tb_writer, checkpoint=None, train_fold=train_fold)
        elif resume and train_fold is None:
            last_fold = np.max([int(f[-5]) for f in os.listdir(basename) if f[-3:] == 'pth'])
            if last_fold >= 0: 
                print(f'Experiment exists. Resuming training from latest fold ({last_fold}).')
                checkpoint = torch.load(basename + f'/{Config.model_arch}_fold{last_fold}.pth')
                train_folds(device=device, basename=basename, tb_writer=tb_writer, checkpoint=checkpoint)
        else:
            # MAKE DIRECTORY FOR EXPERIMENT MODELS AND FILES
            try:
                os.makedirs(Config.save_dir + f'/{experiment_name_dir}')
            except OSError as e:
                print('Experiment already exists and resume flag is not set. Abort training.')
                if e.errno != errno.EEXIST: raise
            
            # -------- LOAD DATA FROM TRAIN FILE --------
            data_df, sample_df, holdout_df = read_csvs(Config.data_dir, Config.debug, test_proportion=0.15)
            folds = stratify_split(data_df, Config.fold_num, Config.seed, Config.target_col)

            
            # create holdout dataloader to validate each fold on unseen data
            holdout_dataloader, holdout_targets = create_holdout_loader(holdout_df, Config.data_dir + '/train_images')
            
            # -------- SAVE CONFIG AND HOLDOUT --------
            # save folds to file
            folds.to_csv(basename + f'/folds.csv', index=False)
            # save holdout to a csv file for final inference (so we don't run inference on training examples)
            holdout_df.to_csv(basename + f'/holdout.csv', index=False)
            # copy the config file for this experiment to this directory
            shutil.copy2('./config.py', basename)
            train_folds(folds=folds, device=device, basename=basename, tb_writer=tb_writer,
                       holdout_dataloader=holdout_dataloader, holdout_targets=holdout_targets)        
    finally: 
        torch.cuda.empty_cache()

Note: in earlier runs, fold 1 was accidentally just the fold 0 model trained further, because the model state from the previous fold was reused when loading the model. This has been fixed.

In [None]:
if __name__ == '__main__':
    # Interrupting the run (Ctrl-C / kernel interrupt) is an expected way to
    # stop training, so it ends quietly instead of printing a traceback.
    try:
        print('Running in debug mode:', Config.debug)
        # Resume the 'exp6_sgd' experiment, starting at fold index 4.
        run_training('exp6_sgd', True, train_fold=4)
    except KeyboardInterrupt:
        pass