# validation.ipynb

Validation implementation.

Author: Connacher Murphy

In [23]:
# Libraries
import pest_classification as pest

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold
import timm
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from types import SimpleNamespace

In [24]:
# Use a super small sample for quick testing
quickly = False

In [25]:
# Set random seed
pest.set_seed(123)

In [26]:
# Configuration
config = SimpleNamespace(**{})

# Grab training observations from images df
df_all = pest.df

if quickly:  # select small sample
    config.total_size = 256
    config.batch_size = 32
    config.num_epochs = 2

    df = df_all[df_all["set"] == "train"].sample(config.total_size)
else:  # "full" sample
    config.batch_size = 512
    config.num_epochs = 4

    df = df_all[df_all["set"] == "train"]

df = df.reset_index(drop=True)

In [27]:
# Dataset and dataloader
# config.image_dir = os.path.expanduser("~/data/ccmt/CCMT Dataset-Augmented")
# config.image_size = 256

In [28]:
# Add folds to the dataframe
config.n_folds = 4

skf = StratifiedKFold(n_splits=config.n_folds)
# CM: should I add a shuffle here?
# skf = StratifiedKFold(n_splits=config.n_folds, shuffle=True)

In [29]:
# Partition into folds
for fold, (train_index, val_index) in enumerate(skf.split(df, df.label)):
    df.loc[val_index, "fold"] = fold

In [30]:
# Specify architecture parameters
config.num_classes = len(pest.crop_descriptions["Maize"])
# config.num_classes = 2
config.backbone = "resnet18"

# Specify optimizer parameters
config.lr = 1e-4

In [31]:
# Select GPU if available
print(torch.cuda.is_available())
config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

False


In [32]:
# Training function
def train(train_dataloader, valid_dataloader, model, optimizer, config):
    train_loss_history = []
    train_accuracy_history = []
    valid_loss_history = []
    valid_accuracy_history = []

    for epoch in range(config.num_epochs):
        print(f"Epoch {epoch + 1}")
        print("Training...")
        model, train_loss, train_accuracy, train_tab = pest.train_epoch(
            train_dataloader, model, optimizer, config
        )

        train_loss_history.append(train_loss)
        train_accuracy_history.append(train_accuracy)

        train_tab_sum = np.sum(train_tab, axis=1, keepdims=True)

        print(f"Training: loss = {train_loss}, accuracy = {train_accuracy}")
        print("Label versus predicted label tabulation:")
        print(np.round(train_tab / train_tab_sum, 2))
        print(train_tab_sum)
        print("\n")

        print("Validating...")
        valid_loss, valid_accuracy, valid_tab = pest.validate_epoch(
            valid_dataloader, model, config
        )

        valid_loss_history.append(valid_loss)
        valid_accuracy_history.append(valid_accuracy)

        valid_tab_sum = np.sum(train_tab, axis=1, keepdims=True)

        print(f"Validation: loss = {valid_loss}, accuracy = {valid_accuracy}")
        print("Label versus predicted label tabulation:")
        print(np.round(valid_tab / valid_tab_sum, 2))
        print(valid_tab_sum)
        print("\n")

    return (
        train_loss_history,
        train_accuracy_history,
        valid_loss_history,
        valid_accuracy_history,
    )

In [33]:
for fold in range(config.n_folds):
    print(f"Fold {fold}")

    # Split into training and validation sets
    train_df = df[df["fold"] != fold].reset_index(drop=True)
    valid_df = df[df["fold"] == fold].reset_index(drop=True)

    train_dataset = pest.AugmentedCCMT(config, train_df, transform=pest.transform_train)
    valid_dataset = pest.AugmentedCCMT(config, valid_df)

    # Dataloaders
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=0,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=0,
    )

    # Initialize (pre-trained) model
    model = timm.create_model(
        config.backbone, pretrained=True, num_classes=config.num_classes
    )
    model.to(config.device)

    # Specify loss function (CM: move this to outer loop?)
    config.criterion = nn.CrossEntropyLoss()

    # Initialize optimizer
    optimizer = optim.Adam(model.parameters(), lr=config.lr, weight_decay=0.0)

    # Call training function
    (
        train_loss_history,
        train_accuracy_history,
        valid_loss_history,
        valid_accuracy_history,
    ) = train(train_dataloader, valid_dataloader, model, optimizer, config)

    plt.plot(train_loss_history, label="Training")
    plt.plot(valid_loss_history, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    plt.plot(train_accuracy_history, label="Training")
    plt.plot(valid_accuracy_history, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

    print("\n")

Fold 0
Epoch 1
Training...


100%|██████████| 7/7 [02:45<00:00, 23.63s/it]


Training: loss = 1.9087098155702864, accuracy = 0.2013358778625954
Label versus predicted label tabulation:
[[0.19 0.06 0.05 0.   0.22 0.48 0.01]
 [0.33 0.24 0.02 0.02 0.19 0.18 0.02]
 [0.02 0.   0.13 0.   0.61 0.21 0.02]
 [0.17 0.11 0.11 0.06 0.29 0.23 0.02]
 [0.14 0.03 0.06 0.01 0.45 0.29 0.01]
 [0.12 0.01 0.19 0.   0.43 0.23 0.01]
 [0.03 0.04 0.1  0.02 0.41 0.38 0.03]]
[[172.]
 [394.]
 [122.]
 [549.]
 [595.]
 [737.]
 [575.]]


Validating...


100%|██████████| 3/3 [00:22<00:00,  7.46s/it]


Validation: loss = 1.9162999391555786, accuracy = 0.24236641221374045
Label versus predicted label tabulation:
[[0.01 0.01 0.   0.   0.05 0.28 0.  ]
 [0.06 0.08 0.   0.   0.01 0.17 0.  ]
 [0.   0.   0.01 0.   0.12 0.2  0.  ]
 [0.01 0.05 0.   0.02 0.05 0.19 0.  ]
 [0.   0.   0.01 0.   0.06 0.26 0.  ]
 [0.   0.   0.01 0.   0.09 0.23 0.  ]
 [0.   0.01 0.01 0.   0.06 0.25 0.  ]]
[[172.]
 [394.]
 [122.]
 [549.]
 [595.]
 [737.]
 [575.]]


Epoch 2
Training...


100%|██████████| 7/7 [02:55<00:00, 25.07s/it]


Training: loss = 1.8368864400046212, accuracy = 0.29834605597964375
Label versus predicted label tabulation:
[[0.1  0.04 0.01 0.01 0.35 0.49 0.01]
 [0.11 0.32 0.01 0.07 0.19 0.27 0.03]
 [0.   0.01 0.02 0.01 0.54 0.41 0.02]
 [0.06 0.07 0.02 0.24 0.29 0.29 0.03]
 [0.01 0.   0.01 0.01 0.5  0.46 0.01]
 [0.01 0.   0.04 0.01 0.47 0.43 0.04]
 [0.   0.01 0.02 0.01 0.42 0.46 0.08]]
[[172.]
 [394.]
 [122.]
 [549.]
 [595.]
 [737.]
 [575.]]


Validating...


100%|██████████| 3/3 [00:22<00:00,  7.37s/it]


Validation: loss = 1.8393666744232178, accuracy = 0.32729007633587787
Label versus predicted label tabulation:
[[0.   0.01 0.   0.   0.03 0.29 0.  ]
 [0.01 0.15 0.   0.   0.03 0.14 0.  ]
 [0.   0.   0.   0.   0.11 0.22 0.  ]
 [0.   0.04 0.   0.05 0.06 0.18 0.  ]
 [0.   0.   0.   0.   0.1  0.23 0.  ]
 [0.   0.   0.   0.   0.08 0.25 0.  ]
 [0.   0.   0.   0.   0.09 0.22 0.02]]
[[172.]
 [394.]
 [122.]
 [549.]
 [595.]
 [737.]
 [575.]]


Epoch 3
Training...


 29%|██▊       | 2/7 [00:58<02:27, 29.55s/it]

: 