In [28]:
import os

import cv2
import numpy as np
from scipy.spatial.distance import pdist

import torch
import torchvision.models as models
from torchvision import datasets, transforms
import torch.optim as optim
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

%load_ext autoreload
%autoreload 2
from utils.data import get_data_loaders
from utils.train_eval import train, train_curriculum
from utils.misc import get_features, cluster_features

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the global NumPy and PyTorch RNGs so that repeated Restart & Run All
# executions start from the same random state.
# NOTE(review): whether the helpers in `utils` (clustering, data loading)
# draw from these global RNGs is not visible here -- confirm, and pass an
# explicit seed/random_state to them if they do not.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Dataset to train on: either "cifar10" or "flowers102"
dataset = "cifar10"
data_config = {
    "batch_size": 64,
    "num_workers": 2,
}

# With return_dataset=True the helper's result is unpacked into three values:
# the loaders, the number of classes, and the underlying dataset objects.
# The dataset pair is bound to `dataset_splits` (not `datasets`) so it does
# not shadow the `torchvision.datasets` module imported in the first cell.
loaders, num_classes, dataset_splits = get_data_loaders(
    dataset=dataset,
    batch_size=data_config["batch_size"],
    num_workers=data_config["num_workers"],
    return_dataset=True,
)

train_loader, test_loader = loaders
train_dataset, test_dataset = dataset_splits

Files already downloaded and verified
Files already downloaded and verified


In [4]:
# Report how many mini-batches each loader yields per pass
num_train_batches = len(train_loader)
num_test_batches = len(test_loader)
print(f"Number of training batches: {num_train_batches}")
print(f"Number of testing batches: {num_test_batches}")

Number of training batches: 782
Number of testing batches: 157


In [5]:
# Model used for feature extraction: a pretrained VGG16, of which only the
# `features` sub-module is kept as the extractor.
vgg16 = models.vgg16(pretrained=True)
extractor = vgg16.features

# Features for all the samples in the train dataloader.
# NOTE(review): `feats` follows the order in which get_features iterates
# train_loader. Downstream cells use positions in this array as sample
# indices alongside train_dataset, so if train_loader shuffles, those indices
# may not line up with the dataset -- confirm in utils.data / utils.misc.
feats = get_features(extractor, train_loader, device)



In [6]:
"""
NOTE: If there are too many samples, training KMeans can take a 
lot of time. To prevent that from happening, you can limit the number
of samples being used using this cell (for debugging purposes). 
E.g., uncomment the last line to only use the first 1000 features.
"""
features = feats
# features = feats[:1000]

In [7]:
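# (Disabled) pairwise-distance sanity check. `get_pairwise_distance` is not
# among the names imported in the first cell, so import it before
# uncommenting the lines below.
# dists = get_pairwise_distance(feats)
# print(dists.shape)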

In [8]:
# Number of clusters to split the input samples into using KMeans
num_clusters = 5
# NOTE(review): the next cell enumerates `c_labels` and treats each position
# as a sample index, so this presumably returns one cluster label per entry
# of `features`, in the same order -- confirm in utils.misc.
c_labels = cluster_features(features, num_clusters=num_clusters)

  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
# Group sample positions by the cluster label they were assigned.
# NOTE(review): a "position" is the index into `c_labels`; it only matches a
# train_dataset index if features were extracted in dataset order -- confirm.
from collections import defaultdict

clustered_data = defaultdict(list)

for sample_idx, cluster_id in enumerate(c_labels):
    clustered_data[cluster_id].append(sample_idx)

In [18]:
# Cluster sizes as (cluster label, number of samples) pairs,
# ordered from the largest cluster to the smallest.
c_sizes = sorted(
    ((label, len(members)) for label, members in clustered_data.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
print(c_sizes)

[(2, 19180), (3, 13136), (4, 8251), (1, 5887), (0, 3546)]


In [11]:
# The network to be trained: ResNet-18 without pretrained weights
model = models.resnet18(pretrained=False)

# Change the output of the last FC layer as per the number of classes
fc_input = model.fc.in_features
model.fc = nn.Linear(fc_input, num_classes)

# Place the model on the target device *before* the optimizer is built, so
# the optimizer is constructed over parameters that already live where they
# will be trained. Calling .to(device) again later is harmless.
model = model.to(device)

learning_rate = 1e-3

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



In [26]:
model = model.to(device)

# Epochs spent on each cluster before moving on to the next one
num_epochs = 1

"""
We experimented with two different ways to present data to the model:
1. Present clusters in increasing order of cluster size (mode: S2L)
2. Present clusters in decreasing order of cluster size (mode: L2S)
"""
mode = "L2S"

# Everything train_curriculum needs, gathered as keyword arguments
params = dict(
    model=model,
    dataset=dataset,
    train_dataset=train_dataset,
    test_loader=test_loader,
    clustered_data=clustered_data,
    c_sizes=c_sizes,
    optimizer=optimizer,
    criterion=criterion,
    mode=mode,
    num_epochs=num_epochs,
    data_config=data_config,
    device=device,
)

train_curriculum(**params)

Training on cluster 2 (38.36% data) done Test Acc: 65.410
Training on cluster 3 (26.27% data) done Test Acc: 69.050
Training on cluster 4 (16.50% data) done Test Acc: 65.680
Training on cluster 1 (11.77% data) done Test Acc: 67.530
Training on cluster 0 (7.09% data) done Test Acc: 68.050


In [27]:
# Residual training: Fine-tune the model on the entire dataset for a few epochs

num_epochs_res = 2

# TensorBoard log directory; the name records the dataset, curriculum mode,
# per-cluster epochs, cluster count and residual epochs of this run
log_dir = f"./logs/{dataset}/vgg16_{mode}_{num_epochs}_c{num_clusters}_residual{num_epochs_res}"
writer = SummaryWriter(log_dir)

# Settings handed to train(): the optimizer and loss already used during the
# curriculum phase, plus the logging frequencies
config = {
    "opt": optimizer,
    "crit": criterion,
    "log_freq_test": 250,
    "log_freq_tr": 150,
}

try:
    train(model, train_loader, test_loader, num_epochs_res, config, device, writer)
finally:
    # Push any buffered TensorBoard events to disk even if training is
    # interrupted; flush() (rather than close()) keeps the writer usable.
    writer.flush()

[Epoch: 1, Batch: 100] Loss: 0.687
[Epoch: 1, Batch: 200] Loss: 0.735
[Epoch: 1, Batch: 300] Loss: 0.717
[Epoch: 1, Batch: 400] Loss: 0.703
[Epoch: 1, Batch: 500] Loss: 0.704
[Epoch: 1, Batch: 600] Loss: 0.718
[Epoch: 1, Batch: 700] Loss: 0.738
[Epoch: 2, Batch: 100] Loss: 0.592
[Epoch: 2, Batch: 200] Loss: 0.599
[Epoch: 2, Batch: 300] Loss: 0.595
[Epoch: 2, Batch: 400] Loss: 0.610
[Epoch: 2, Batch: 500] Loss: 0.617
[Epoch: 2, Batch: 600] Loss: 0.627
[Epoch: 2, Batch: 700] Loss: 0.622
Training finished.


In [29]:
# Save the model to <log_dir>/checkpoints/<num_epochs_res>.pth
ckpt_dir = os.path.join(log_dir, "checkpoints")
os.makedirs(ckpt_dir, exist_ok=True)

ckpt_path = f"{ckpt_dir}/{num_epochs_res}.pth"
# The module object itself is passed to torch.save (not a state_dict)
torch.save(model, ckpt_path)