In [1]:
%matplotlib inline

In [2]:
# Colab-only setup: every line is a no-op unless $COLAB_GPU is a non-empty string.
! [ ! -z "$COLAB_GPU" ] && pip install torch torchvision ray timm
! [ ! -z "$COLAB_GPU" ] && mkdir -p datasets
# TLS certificate verification is deliberately left enabled for the download
# (no --no-check-certificate), so a tampered archive is rejected by wget.
! [ ! -z "$COLAB_GPU" ] && wget -nc https://download.pytorch.org/tutorial/hymenoptera_data.zip -P datasets
# The archive is unpacked under datasets/.
! [ ! -z "$COLAB_GPU" ] && unzip -u datasets/hymenoptera_data.zip -d datasets


Transfer Learning for Computer Vision Tutorial
==============================================
**Author**: `Sasank Chilamkurthy <https://chsasank.github.io>`_

In this tutorial, you will learn how to train a convolutional neural network for
image classification using transfer learning. You can read more about transfer
learning in the `cs231n notes <https://cs231n.github.io/transfer-learning/>`__.

Quoting these notes,

    In practice, very few people train an entire Convolutional Network
    from scratch (with random initialization), because it is relatively
    rare to have a dataset of sufficient size. Instead, it is common to
    pretrain a ConvNet on a very large dataset (e.g. ImageNet, which
    contains 1.2 million images with 1000 categories), and then use the
    ConvNet either as an initialization or a fixed feature extractor for
    the task of interest.

These two major transfer learning scenarios look as follows:

-  **Finetuning the convnet**: Instead of random initialization, we
   initialize the network with a pretrained network, like the one that is
   trained on the ImageNet 1000-class dataset. The rest of the training
   proceeds as usual.
-  **ConvNet as fixed feature extractor**: Here, we will freeze the weights
   for all of the network except that of the final fully connected
   layer. This last fully connected layer is replaced with a new one
   with random weights and only this layer is trained.




In [3]:
# License: BSD
# Author: Sasank Chilamkurthy

from __future__ import print_function, division

import os
from urllib import request
from zipfile import ZipFile

import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from torch.utils.data import random_split
import timm
from functools import partial

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

plt.ion()   # interactive mode

<matplotlib.pyplot._IonContext at 0x7f96261dd160>

Load Data
---------

We will use torchvision and torch.utils.data packages for loading the
data.

The problem we're going to solve today is to train a model to classify
**ants** and **bees**. We have about 120 training images each for ants and bees.
There are 75 validation images for each class. A dataset this small would
usually be too little to generalize from if we trained from scratch. Since we
are using transfer learning, we should be able to generalize reasonably
well.

This dataset is a very small subset of ImageNet.

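.. Note ::
   Download the data from
   `here <https://download.pytorch.org/tutorial/hymenoptera_data.zip>`_
   and extract it to the current directory. (On Colab, the setup cell at the
   top of this notebook downloads and extracts it into ``datasets/`` instead.)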



In [4]:
# def download_and_extract_data(dataset_dir='datasets'):
#     data_zip = os.path.join(dataset_dir, 'hymenoptera_data.zip')
#     data_path = os.path.join(dataset_dir, 'hymenoptera_data')
#     url = "https://download.pytorch.org/tutorial/hymenoptera_data.zip"

#     if not os.path.exists(data_path):
#         if not os.path.exists(data_zip):
#             print("Starting to download data...")
#             data = request.urlopen(url, timeout=15).read()
#             with open(data_zip, 'wb') as f:
#                 f.write(data)

#         print("Starting to extract data...")
#         with ZipFile(data_zip, 'r') as zip_f:
#             zip_f.extractall(dataset_dir)
        
#     print("Data has been downloaded and extracted to {}.".format(dataset_dir))
    
# download_and_extract_data()

In [5]:
# Data augmentation and normalization for training
# Just normalization for validation


def load_data(data_dir='./hymenoptera_data'):
    """Build the ``ImageFolder`` datasets for the ``train`` and ``val`` splits.

    The training split gets random augmentation (random resized crop to 224
    and horizontal flip); the validation split is only resized to 256 and
    centre-cropped to 224. Both are converted to tensors and normalized with
    the same per-channel mean/std.

    Args:
        data_dir: Directory that contains ``train`` and ``val`` sub-folders.
            ``None`` is treated as the default ``'./hymenoptera_data'``.

    Returns:
        A ``(trainset, testset)`` tuple built from ``<data_dir>/train`` and
        ``<data_dir>/val`` respectively.

    Raises:
        FileNotFoundError: If either split folder does not exist.
    """
    if data_dir is None:
        data_dir = './hymenoptera_data'

    train_dir = os.path.join(data_dir, 'train')
    val_dir = os.path.join(data_dir, 'val')

    # Check up front so a wrong path produces one clear message naming the
    # missing folder instead of an error from deep inside the dataset code.
    for split_dir in (train_dir, val_dir):
        if not os.path.isdir(split_dir):
            raise FileNotFoundError(
                "Dataset folder not found: {}".format(split_dir))

    # Shared by both pipelines; presumably the ImageNet channel statistics
    # matching the pretrained backbone -- confirm if the model changes.
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]),
        'val': transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]),
    }

    trainset = datasets.ImageFolder(train_dir, data_transforms['train'])
    testset = datasets.ImageFolder(val_dir, data_transforms['val'])

    return trainset, testset



In [6]:
def train(config, checkpoint_dir=None, data_dir=None):
    """Ray Tune trainable: fit a new 2-class head on a frozen pretrained DeiT-tiny.

    All pretrained parameters are frozen; only the replacement ``head`` linear
    layer is optimized with SGD. After every epoch the function writes a
    checkpoint through ``tune.checkpoint_dir`` and reports the validation loss
    and accuracy through ``tune.report``.

    Args:
        config: Hyperparameter mapping. Reads ``config["lr"]`` (SGD learning
            rate) and ``config["batch_size"]`` (used for both data loaders).
        checkpoint_dir: Directory holding a file named ``checkpoint`` with
            ``"model"`` and ``"optimizer"`` state dicts to resume from, or a
            falsy value to start fresh.
        data_dir: Dataset root forwarded to ``load_data``. ``None`` uses
            ``load_data``'s own default.

    Returns:
        None. Results are delivered through ``tune.report``.
    """
    seed = 99
    # torch.random.manual_seed is the same function as torch.manual_seed, so a
    # single call is enough.
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    model = torch.hub.load("facebookresearch/deit:main", "deit_tiny_patch16_224", pretrained=True)

    # Freeze every pretrained parameter; the layer created below is new and
    # therefore trainable by default.
    for param in model.parameters():
        param.requires_grad = False

    # Keep a direct reference to the new head. Once the model is wrapped in
    # nn.DataParallel, `model.head` no longer resolves (the original module
    # lives under `model.module`), so the optimizer must not look it up there.
    head = nn.Linear(model.head.in_features, 2)
    model.head = head

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(head.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        # map_location lets a checkpoint written on one device be restored on
        # whichever device this trial is running on.
        checkpoints = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"), map_location=device)
        model.load_state_dict(checkpoints["model"])
        optimizer.load_state_dict(checkpoints["optimizer"])

    ##### Data loaders
    if data_dir is None:
        trainset, _ = load_data()
    else:
        trainset, _ = load_data(data_dir)

    # 80/20 split of the training folder. NOTE(review): the held-out subset
    # shares `trainset`'s transform, so validation images are still randomly
    # augmented -- confirm whether that is intended.
    train_size = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [train_size, len(trainset) - train_size])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    # Shuffling adds nothing for evaluation, so the validation order is fixed.
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=False,
        num_workers=8)

    num_epochs = 10
    for epoch in range(num_epochs):  # loop over the dataset multiple times
        model.train()
        running_loss = 0.0
        epoch_steps = 0
        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                # Reset both accumulators together so the next printed value
                # is the mean over the next window only.
                running_loss = 0.0
                epoch_steps = 0

        # Validation loss
        model.eval()
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                val_loss += criterion(outputs, labels).item()
                val_steps += 1

        # A distinct name avoids shadowing the `checkpoint_dir` parameter.
        with tune.checkpoint_dir(epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save({"model": model.state_dict(), "optimizer": optimizer.state_dict()}, path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [7]:
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    """Run a Ray Tune search over learning rate and batch size for ``train``.

    Args:
        num_samples: Number of trials sampled from the search space.
        max_num_epochs: ``max_t`` for the ASHA scheduler, i.e. the most
            training iterations the scheduler lets a trial run.
        gpus_per_trial: GPUs requested from Ray for each trial (each trial
            also requests 2 CPUs).

    Returns:
        None. The best trial's config, validation loss and validation
        accuracy are printed.
    """
    # The notebook's setup cell extracts the archive under ./datasets, while
    # the original location is the working directory. Use whichever exists,
    # preferring the original so existing setups behave exactly as before.
    candidates = ("./hymenoptera_data", "./datasets/hymenoptera_data")
    found = next((path for path in candidates if os.path.isdir(path)), candidates[0])
    data_dir = os.path.abspath(found)

    # Build the datasets once in the driver so a bad path fails here rather
    # than separately inside every trial; the result itself is not needed.
    load_data(data_dir)

    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    if best_trial is None:
        # No trial produced a usable "loss" result (e.g. every trial failed),
        # so there is nothing to summarise.
        print("No trial reported a validation loss; no best trial available.")
        return

    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

#     best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
#     device = "cpu"
#     if torch.cuda.is_available():
#         device = "cuda:0"
#         if gpus_per_trial > 1:
#             best_trained_model = nn.DataParallel(best_trained_model)
#     best_trained_model.to(device)

#     best_checkpoint_dir = best_trial.checkpoint.value
#     model_state, optimizer_state = torch.load(os.path.join(
#         best_checkpoint_dir, "checkpoint"))
#     best_trained_model.load_state_dict(model_state)

#     test_acc = test_accuracy(best_trained_model, device)
#     print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
    # Entry point: launch the hyperparameter search with 10 sampled trials and
    # a scheduler limit of 10 epochs per trial.
    # You can change the number of GPUs per trial here; gpus_per_trial=0
    # requests no GPU from Ray for each trial.
    main(num_samples=10, max_num_epochs=10, gpus_per_trial=0)

2021-12-07 23:07:17,612	INFO registry.py:69 -- Detected unknown callable for trainable. Converting to class.


TypeError: ray.cloudpickle.dumps(<class 'ray.tune.function_runner.wrap_function.<locals>.ImplicitFunc'>) failed.
To check which non-serializable variables are captured in scope, re-run the ray script with 'RAY_PICKLE_VERBOSE_DEBUG=1'. Other options: 
-Try reproducing the issue by calling `pickle.dumps(trainable)`. 
-If the error is typing-related, try removing the type annotations and try again.