# Cassava Leaf Disease Classification

This notebook builds and trains a model for cassava leaf disease classification for the [Kaggle competition](https://www.kaggle.com/c/cassava-leaf-disease-classification/overview).

## Ideas

1. Cross entropy loss, stratified CV, no fmix/cutmix/mixup, with gradient scaling & accumulation [done]
2. Add hyperparameter tuning with Ray Tune [done]
3. Add smoothed cross entropy loss
4. Add *mixes (fmix, cutmix, mixup)
5. External data
6. Ensemble of models - train a model for each fold and then average their predictions during inference [done]

In [None]:
import os
from datetime import datetime
import time
import random
import warnings
import joblib
import warnings
import gc

# My modules
from config import Config
from logger import init_logger
from common_utils import (set_seeds, read_csvs, stratify_split, setup_model_optimizer, 
                          get_data_dfs, get_loaders, create_holdout_loader, get_schd_crit)
from model import Model
from train_loop_functions import train_epoch, valid_epoch, ensemble_inference
from cassava_dataset import CassavaDataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

import torch
from torch import nn
from torch.cuda.amp import GradScaler

In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local project modules imported above) are reloaded before each
# cell executes, picking up edits without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Preliminaries

In [None]:
# Read the label-number -> disease-name mapping from the data directory.
# orient='index' turns each top-level JSON key into one row of the DataFrame.
label_map = pd.read_json(Config.data_dir + '/label_num_to_disease_map.json', orient='index')
# Bare expression as the last line of the cell so the notebook renders the table.
label_map

In [None]:
# Seed the random number generators through the project helper using the
# configured seed (which generators are covered is defined in common_utils).
set_seeds(Config.seed)
# Module-level logger used by the training and inference functions below.
LOGGER = init_logger() # uses Python's logging framework

## Training and Validation Functions

Gradient scaling: https://pytorch.org/docs/stable/notes/amp_examples.html

Gradient accumulation: https://towardsdatascience.com/what-is-gradient-accumulation-in-deep-learning-ec034122cfa

Training loop structure: https://towardsdatascience.com/deep-learning-model-training-loop-e41055a24b73

In [None]:
def train_valid_test(train_folds_df, fold, resultsStore, device, 
                     experiment_name_dir, holdout_dataloader, holdout_targets):
    """Train the model over Config.epochs epochs for a given fold.

    After every epoch the model is validated on the held-out fold; the
    checkpoint with the lowest validation loss is saved to disk. Once training
    finishes, that best checkpoint is reloaded and scored on the holdout set.

    Args:
        train_folds_df: the dataset with a column for fold number.
        fold: an integer representing the fold used for validation.
        resultsStore: not used by this function; kept so existing callers
            do not have to change.
        device: torch device handed to the model setup and epoch helpers.
        experiment_name_dir: sub-directory of Config.save_dir in which the
            fold checkpoint is written.
        holdout_dataloader: loader over the holdout set.
        holdout_targets: ground-truth labels of the holdout set, passed as
            y_true to accuracy_score.

    Returns:
        A 5-tuple ``(valid_df, val_accuracy, holdout_accuracy, train_losses,
        val_losses)`` where ``valid_df`` holds only the rows used for
        validation plus a 'prediction' column taken from the best checkpoint,
        ``val_accuracy`` is the validation accuracy stored in that checkpoint,
        ``holdout_accuracy`` is the best checkpoint's accuracy on the holdout
        set, and the two loss lists contain one average loss per epoch.
    """
    # -------- DATASETS AND LOADERS --------
    # select one of the folds, create train & validation set loaders
    train_df, valid_df = get_data_dfs(train_folds_df, fold)
    train_dataloader, valid_dataloader = get_loaders(train_df, valid_df,
                                                     Config.train_bs, 
                                                     Config.data_dir+'/train_images')

    # make model and optimizer
    model, optimizer = setup_model_optimizer(Config.model_arch, 
                                           Config.lr, 
                                           Config.is_amsgrad, 
                                           num_labels=train_folds_df.label.nunique(), 
                                           weight_decay=Config.weight_decay,
                                           fc_layer={"middle_fc": False, "middle_fc_size": 0},
                                           device=device,
                                           checkpoint=None)

    scheduler, criterion = get_schd_crit(optimizer)

    # One scaler for the whole fold: GradScaler adapts its loss scale over
    # successive steps, so re-creating it every epoch would discard that state.
    scaler = GradScaler()

    # Single source of truth for where this fold's best checkpoint lives.
    checkpoint_path = Config.save_dir + f'/{experiment_name_dir}/{Config.model_arch}_fold{fold}.pth'

    accuracy = 0.
    best_val_loss = float('inf')
    train_losses, val_losses = [], []
    # The validation labels do not change between epochs.
    validation_labels = valid_df[Config.target_col].values

    for e in range(Config.epochs):
        epoch_start_time = time.time()
        LOGGER.info(f'Training epoch {e+1}/{Config.epochs}')

        # -------- TRAIN --------
        avg_training_loss = train_epoch(train_dataloader, model, 
                                      criterion, optimizer, 
                                      scheduler, scaler, 
                                      Config.accum_iter, LOGGER,
                                      device)

        # -------- VALIDATE --------
        avg_validation_loss, preds = valid_epoch(valid_dataloader, model, 
                                                 criterion, LOGGER, device)

        train_losses.append(avg_training_loss)
        val_losses.append(avg_validation_loss)

        # -------- SCORE METRICS & LOGGING FOR THIS EPOCH --------
        accuracy = accuracy_score(y_true=validation_labels, y_pred=preds)

        epoch_elapsed_time = time.time() - epoch_start_time

        LOGGER.info(f'\nEpoch training summary:\n Fold {fold+1}/{Config.fold_num} | '
                    f'Epoch: {e+1}/{Config.epochs} | '
                    f'Epoch time: {epoch_elapsed_time} sec\n'
                    f'Training loss: {avg_training_loss} | '
                    f'Validation loss: {avg_validation_loss} | '
                    f'Accuracy: {accuracy}\n')

        # --------SAVE MODEL --------
        # Keep only the checkpoint with the lowest validation loss so far.
        if avg_validation_loss < best_val_loss: 
            best_val_loss = avg_validation_loss
            torch.save({'model': model.state_dict(), 
                        'accuracy': accuracy, 
                        'preds': preds,
                        'val_loss': best_val_loss,
                        'fold': fold
                       },
                      checkpoint_path)
            LOGGER.info('Saved model!')

        # -------- UPDATE LR --------
        # The scheduler is not stepped during the first three epochs (e = 0, 1, 2).
        if scheduler and e > 2:
            if Config.scheduler == 'ReduceLROnPlateau':
                # Plateau scheduling needs the monitored metric.
                scheduler.step(avg_validation_loss)
            elif Config.scheduler in ('CosineAnnealingLR', 'CosineAnnealingWarmRestarts'):
                scheduler.step()
        gc.collect()

    # -------- TEST ON HOLDOUT SET --------
    # load best model
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model']) 
    # test
    _, holdout_preds = valid_epoch(holdout_dataloader, model, criterion, LOGGER, device)
    holdout_accuracy = accuracy_score(y_true=holdout_targets, y_pred=holdout_preds)

    # Attach the predictions made at the best epoch, not those of the last epoch.
    valid_df['prediction'] = checkpoint['preds']

    # Drop references to the large objects before the next fold is set up.
    del model
    del optimizer
    del train_dataloader
    del valid_dataloader
    return valid_df, checkpoint['accuracy'], holdout_accuracy, train_losses, val_losses

## Main loop

In [None]:
class Results:
    """Mutable store for per-fold outcomes of a training run.

    Both attributes start out as empty dicts that are meant to be keyed by
    fold number and filled in by the caller.
    """

    def __init__(self):
        # fold number -> that fold's prediction data
        # fold number -> that fold's accuracy figures
        self.fold_to_predictions, self.fold_to_accuracy = dict(), dict()

"""
Entry point to training and inference. 
experiment_name_dir (required): a name for a directory in ./trained-models 
"""
def main(experiment_name_dir, kaggle):
    base_experiment_filename = Config.save_dir + f'/{experiment_name_dir}/{Config.model_arch}_fold'
    
    try:
        # -------- SETUP --------
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        resultsStore = Results()
        

        # -------- LOAD DATA FROM FILE --------
        data_df, sample_df, holdout_df = read_csvs(Config.data_dir, Config.debug, test_proportion=0.15)
        folds = stratify_split(data_df, Config.fold_num, Config.seed, Config.target_col)
        test_df, test_loader = None, None
        
        # create holdout dataloader to test on totally unseen data
        holdout_dataloader, holdout_targets = create_holdout_loader(holdout_df, Config.data_dir + '/train_images')   

        experiment_list = os.listdir(Config.save_dir)
        if experiment_name_dir in experiment_list: # resume training from the last fold's checkpoint
            last_fold = len(os.listdir(Config.save_dir + f'/{experiment_name_dir}')) - 1
            if last_fold >= 0: 
                print(f'Experiment exists. Resuming training from latest fold ({last_fold}).')

                checkpoint = torch.load(base_experiment_filename + f'{last_fold}.pth')

                #resume(checkpoint, fold, model, optimizer)
        else: # -------- START TRAINING --------
            # make directory for experiment
            try:
                os.makedirs(Config.save_dir + f'/{experiment_name_dir}')
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            
            
            if Config.train:
                LOGGER.info('\n========== Running training ==========\n')

                aggregated_output_df = pd.DataFrame()

                for fold in range(Config.fold_num):    
                    # _df is the validation prediction output
                    # _df.columns: ['image_id', 'label', 'fold', 'prediction']
                    _df, val_accuracy, holdout_accuracy, train_losses, val_losses = train_valid_test(
                                                                        folds, fold, 
                                                                        resultsStore, device,
                                                                        experiment_name_dir,
                                                                        holdout_dataloader, 
                                                                        holdout_targets)
                    
                    if aggregated_output_df.empty:
                        aggregated_output_df[['image_id', 'label']] = _df[['image_id', 'label']]
                    aggregated_output_df[['prediction_fold'+str(fold)]] = _df['prediction']

                    resultsStore.fold_to_predictions[fold] = _df[['image_id', 'label', 'prediction']]
                    resultsStore.fold_to_accuracy[fold] = (val_accuracy, holdout_accuracy)

                    LOGGER.info(f'========== fold: {fold} result ==========')
                    LOGGER.info(f'Validation Accuracy: {val_accuracy}')
                    LOGGER.info(f'Holdout Accuracy: {holdout_accuracy}')

                # Cross validation
                LOGGER.info(f"========== CV ==========") # best results across all folds
                LOGGER.info(f"{resultsStore.fold_to_accuracy}")

                # Save result
                aggregated_output_df.to_csv(Config.save_dir + '/aggregated_output_df.csv', index=False)

        if Config.inference: # runs inference on all trained models, averages result
            LOGGER.info('\n========== Running inference ==========\n')
            
            model_states = [torch.load(base_experiment_filename + f'{fold}.pth')['model']
                            for fold in range(Config.fold_num)]
            assert len(model_states) == Config.fold_num
            
            
            if not kaggle: 
                loader = holdout_dataloader
                num_samples = len(holdout_df)
            else: 
                loader = test_dataloader 
                num_samples = len(test_df)
                
            predictions = ensemble_inference(states, Config.model_arch, 
                                    data_df.label.nunique(), loader, num_samples, device)
            
            if not kaggle:
                holdout_accuracy = accuracy_score(y_true=holdout_targets, y_pred=predictions)
                LOGGER.info(f"Ensemble model holdout accuracy: {holdout_accuracy}")
            
                
            
            # submission
            submission = pd.DataFrame()
            submission['image_id'] = test['image_id']
            submission['label'] = predictions
            submission.to_csv(Config.save_dir + '/submission.csv', index=False)
    finally: 
        torch.cuda.empty_cache()

In [None]:
# Script / notebook entry point: run the full pipeline for experiment 'exp0'
# against the holdout split (kaggle=False).
if __name__ == '__main__':
    try:
        print('Training in debug mode: ', Config.debug)
        main(experiment_name_dir='exp0', kaggle=False)
        
    except KeyboardInterrupt:
        # Interrupting a long run is silently ignored here, so no traceback is shown.
        pass