# Cassava Leaf Disease Classification

This notebook builds and trains a model for cassava leaf disease classification for the [Kaggle competition](https://www.kaggle.com/c/cassava-leaf-disease-classification/overview).

## Ideas

1. Cross entropy loss, stratified CV, no fmix/cutmix/mixup, with gradient scaling & accumulation [done]
2. Add hyperparameter tuning with Ray Tune
3. Add smoothed cross entropy loss
4. Add *mixes (fmix, cutmix, mixup)
5. External data
6. Ensemble of models - train a model for each fold and then average their predictions during inference

In [None]:
# Make vendored third-party packages importable from this notebook.
import sys

package_paths = [
    # this is a project by Ross Wightman (https://github.com/rwightman/pytorch-image-models)
    '../pytorch-image-models',
]

for pth in package_paths:
    # Skip paths that are already registered so that re-running this cell
    # does not keep growing sys.path with duplicates.
    if pth not in sys.path:
        sys.path.append(pth)

In [None]:
import os
import random
import time
import warnings
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torch import nn
from torch.cuda.amp import autocast, GradScaler
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# My modules
from config import Config
from logger import init_logger
from train_loop_functions import train_epoch, valid_epoch, inference
from common_utils import set_seeds, read_csvs, get_image, stratify_split, get_train_transforms, get_valid_transforms, setup
# NOTE(review): confirm the module really is named `modes` (not `models`).
from modes import Model

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to the local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Preliminary data loading and setup

In [None]:
# Load the train and test tables; Config.debug is forwarded to read_csvs.
train, test = read_csvs(Config.data_dir, Config.debug)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Read label_num_to_disease_map.json from the data directory; orient='index'
# makes each top-level JSON key a row of the resulting frame.
label_map = pd.read_json(Config.data_dir + '/label_num_to_disease_map.json', orient='index')
# Display the mapping as the cell output.
label_map

Since the class distribution is uneven, we use stratified k-fold cross validation so that the train and validation splits of every fold keep class proportions representative of the full dataset.

In [None]:
# Count how many training rows carry each label.
train.label.value_counts()

In [None]:
# Apply Config.seed through set_seeds (presumably seeds the RNGs for
# reproducibility — see common_utils) and create the notebook-wide logger.
set_seeds(Config.seed)
LOGGER = init_logger() # uses Python's logging framework

# Sanity check: load one training image by file name and display it.
sample_img = get_image(Config.train_img_dir + '/1000015157.jpg')
plt.imshow(sample_img)
plt.show()

## Dataset class

#### Test it out

In [None]:
# Smoke-test the dataset: fetch the first few samples and show them side by side.
# NOTE(review): CassavaDataset is not defined or imported in the visible part
# of this notebook — confirm where it comes from.
cassava_dataset = CassavaDataset(train, Config.train_img_dir, output_label=True)

num_preview = 2
# squeeze=False always yields a 2-D array of axes, so this also works if
# num_preview is changed to 1.
fig, axes = plt.subplots(1, num_preview, squeeze=False)
for i, ax in enumerate(axes[0]):
    img, target = cassava_dataset[i]
    print(i, img.shape, target)

    ax.set_title('Class: {}'.format(target))
    ax.axis('off')
    ax.imshow(img)

# Lay out and render once, after every panel has been drawn, so the figure
# is always shown regardless of how many samples are previewed.
fig.tight_layout()
plt.show()

## Stratified Cross Validation Folds

In [None]:
# Assign every training row to one of Config.fold_num folds, using the
# stratify_split helper imported from common_utils.
train_folds = stratify_split(train, Config.fold_num, Config.seed, Config.target_col)
# Rows per (fold, label) pair — a quick check of how the classes are spread
# across the folds.
train_folds.groupby(['fold', 'label']).count()

## Training and Validation Functions

Gradient scaling: https://pytorch.org/docs/stable/notes/amp_examples.html

Gradient accumulation: https://towardsdatascience.com/what-is-gradient-accumulation-in-deep-learning-ec034122cfa

Training loop structure: https://towardsdatascience.com/deep-learning-model-training-loop-e41055a24b73

In [None]:
def train_fold(train_folds_df, fold, model, optimizer, scheduler, criterion, resultsStore):
    '''
    Trains the model over epochs for a given fold.

    train_folds_df: the dataset with a column for fold number ('fold')
    fold: an integer representing the fold used for validation
    model, optimizer, scheduler, criterion: training objects forwarded to
        train_epoch / valid_epoch; a falsy scheduler disables LR updates here
    resultsStore: accepted for interface compatibility; not used in this function

    Returns a tuple (valid_df, best_accuracy): a DataFrame consisting of only
    the rows used for validation along with the best epoch's predictions in a
    'prediction' column, and the best validation accuracy seen for this fold.

    Raises RuntimeError if no epoch ran (Config.epochs < 1), because there are
    then no predictions to return.
    '''
    # -------- DATASETS AND LOADERS --------
    train_dataloader, valid_dataloader = get_loaders(train_folds_df, fold, Config.train_bs)

    # Rows held out for validation in this fold. The original index is kept so
    # callers can still line the rows up with train_folds_df.
    # NOTE(review): assumes get_loaders builds the validation loader from these
    # same rows, in this order, without shuffling — confirm against get_loaders.
    valid_df = train_folds_df[train_folds_df['fold'] == fold].copy()
    validation_labels = valid_df[Config.target_col].values

    # One scaler for the whole fold, so its loss-scale state carries over
    # between epochs instead of being reset at the start of every epoch.
    scaler = GradScaler()

    checkpoint_path = Config.save_dir + f'/{Config.model_arch}_fold{fold}.pth'
    best_accuracy = 0.
    best_preds = None  # predictions of the best epoch so far; None until an epoch has run
    for e in range(Config.epochs):
        epoch_start_time = time.time()
        LOGGER.info(f'Training epoch {e+1}/{Config.epochs}')

        # -------- TRAIN --------
        # NOTE(review): the scheduler is passed to train_epoch and is also
        # stepped below — confirm it is not stepped twice per epoch.
        training_losses = train_epoch(train_dataloader, model, criterion, optimizer, scheduler, scaler)
        avg_training_loss = sum(training_losses) / len(train_dataloader)

        # -------- VALIDATE --------
        validation_losses, preds = valid_epoch(valid_dataloader, model, criterion)
        avg_validation_loss = sum(validation_losses) / len(valid_dataloader)

        epoch_elapsed_time = time.time() - epoch_start_time

        # -------- SCORE METRICS & LOGGING FOR THIS EPOCH --------
        accuracy = accuracy_score(y_true=validation_labels, y_pred=preds)

        LOGGER.info(f'\nEpoch training summary:\n Fold {fold}/{Config.fold_num} | ' + \
                    f'Epoch: {e+1}/{Config.epochs} | ' + \
                    f'Epoch time: {epoch_elapsed_time} sec | ' + \
                    f'Training loss: {avg_training_loss} | ' + \
                    f'Validation loss: {avg_validation_loss} | ' + \
                    f'Accuracy: {accuracy}\n')

        # SAVE MODEL (keeps only the best model for this fold)
        # The first epoch is always kept as the baseline, so a checkpoint and
        # a set of predictions exist even if accuracy never rises above 0.
        if best_preds is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            # Assumes valid_epoch returns a fresh predictions object per call.
            best_preds = preds
            torch.save({'model': model.state_dict(), 'preds': preds, 'accuracy': best_accuracy, 'fold': fold},
                       checkpoint_path)
            LOGGER.info(f'Saved model on epoch {e+1}, fold {fold}, and accuracy score {accuracy:.3f}')

        # -------- UPDATE LR (POTENTIALLY) --------
        if scheduler:
            if Config.scheduler == 'ReduceLROnPlateau':
                # This scheduler needs the monitored metric.
                scheduler.step(avg_validation_loss)
            elif Config.scheduler in ('CosineAnnealingLR', 'CosineAnnealingWarmRestarts'):
                scheduler.step()

    if best_preds is None:
        raise RuntimeError('train_fold ran no epochs; Config.epochs must be at least 1')

    # Use the in-memory predictions of the best epoch rather than reloading
    # the checkpoint that was just written to disk.
    valid_df['prediction'] = best_preds
    return valid_df, best_accuracy

## Main loop

In [None]:
# Pick the CUDA device when PyTorch reports one as available, otherwise the CPU,
# and echo the choice.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(device)

class Results:
    '''Per-fold bookkeeping filled in by the main loop.'''

    def __init__(self):
        # fold number -> value stored for that fold; both start out empty
        # and are independent dict objects.
        self.fold_to_predictions: dict = dict()
        self.fold_to_accuracy: dict = dict()
        
def main():
    '''
    Runs cross-validated training and/or test-set inference, as selected by
    Config.train and Config.inference.

    Training: calls train_fold once per fold, records each fold's validation
    predictions and best accuracy in a Results object, and writes
    aggregated_output_df.csv to Config.save_dir. That file has one row per
    validated image with columns image_id, label and prediction_fold<k>;
    a row is empty in the columns of folds where it was not validated.

    Inference: predicts on the test table and writes submission.csv
    (image_id, label) to Config.save_dir.

    The model reference is dropped and the CUDA cache emptied on exit,
    whether or not an error occurred.
    '''
    # Bound before the try block so the cleanup in `finally` cannot itself
    # fail with UnboundLocalError (hiding the real error) if setup() raises.
    model = None
    try:
        resultsStore = Results()
        model, optimizer, scheduler, criterion = setup()

        if Config.train:
            LOGGER.info('\n========== Running training ==========\n')

            aggregated_output_df = None

            # NOTE(review): the same model/optimizer/scheduler objects are
            # passed to every fold, so later folds start from weights already
            # trained on their validation rows — confirm this is intended.
            for fold in range(Config.fold_num):
                # _df is the validation prediction output
                # _df.columns: ['image_id', 'label', 'fold', 'prediction']
                _df, best_fold_accuracy = train_fold(train_folds, fold, model, optimizer, scheduler, criterion, resultsStore)

                # Folds validate on different rows, so combine them by key
                # (image_id, label) rather than by positional/index alignment.
                fold_column = 'prediction_fold' + str(fold)
                fold_predictions = _df[['image_id', 'label', 'prediction']].rename(
                    columns={'prediction': fold_column})
                if aggregated_output_df is None:
                    aggregated_output_df = fold_predictions
                else:
                    aggregated_output_df = aggregated_output_df.merge(
                        fold_predictions, on=['image_id', 'label'], how='outer')

                resultsStore.fold_to_predictions[fold] = _df[['image_id', 'label', 'prediction']]
                resultsStore.fold_to_accuracy[fold] = best_fold_accuracy

                LOGGER.info(f'========== fold: {fold} result ==========')
                LOGGER.info(f'Accuracy: {best_fold_accuracy}')

            # Cross validation
            LOGGER.info("========== CV ==========") # best results across all folds
            LOGGER.info(f"{resultsStore.fold_to_accuracy}")

            # Save result (an empty table if no fold was run)
            if aggregated_output_df is None:
                aggregated_output_df = pd.DataFrame()
            aggregated_output_df.to_csv(Config.save_dir + '/aggregated_output_df.csv', index=False)

        if Config.inference:
            LOGGER.info('\n========== Running inference ==========\n')
            # NOTE(review): `model` carries whatever weights it has at this
            # point; the per-fold best checkpoints written during training are
            # not reloaded here — confirm this is intended.
            test_dataset = CassavaDataset(test, Config.test_img_dir, output_label=True,
                                          transform=get_valid_transforms())

            test_dataloader = DataLoader(test_dataset, batch_size=Config.valid_bs,
                                         pin_memory=True, shuffle=False,
                                         num_workers=Config.num_workers)

            predictions = inference(model, test_dataloader)

            # submission
            submission = pd.DataFrame()
            submission['image_id'] = test['image_id']
            submission['label'] = predictions
            submission.to_csv(Config.save_dir + '/submission.csv', index=False)
    finally:
        # Drop the local model reference, then release cached GPU memory.
        del model
        torch.cuda.empty_cache()

Open question: should hyperparameter tuning (Ray Tune) be run separately for each fold? If so, each fold gets `num_samples` trials.

In [None]:
# Entry point: report the debug flag, then run the pipeline.
if __name__ == '__main__':
    try:
        print('Training in debug mode: ', Config.debug)
        main()
    except KeyboardInterrupt:
        # Stopping a long run by hand is expected, so no traceback is shown;
        # say so explicitly so it is clear the run did not finish.
        print('Interrupted by user; run stopped before completion.')