In [None]:
%load_ext lab_black
%config IPCompleter.greedy=True

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
import torchvision

from torchsummary import summary

from ray import tune
from ray.tune.schedulers import ASHAScheduler

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

import time
from pathlib import Path

from carbontracker.tracker import CarbonTracker

In [None]:
%matplotlib inline

In [None]:
# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Four DataLoader workers per visible GPU (evaluates to 0 on a CPU-only machine).
n_workers = torch.cuda.device_count() * 4

# Define Q2Data Class for custom Dataset

The following class reads the data for Q2 and wraps it in a torch `Dataset` object, so that you can
use a `DataLoader` to feed batches to your model during training.

Make sure that the file "hw2_Q2_and_Q3_data.npz" is located properly (in this example, it should be in a `data` folder next to this notebook).

 



In [None]:
class Q2Data(Dataset):
    """Torch ``Dataset`` over one split of ``data/hw2_Q2_and_Q3_data.npz``.

    Args:
        mode: text containing ``"train"``, ``"val"`` or ``"test"``; the first
            of these (checked in that order) found in ``mode`` picks the split.
        ray_tune: when True the archive is looked up two folders above the
            current working directory instead of relative to it.

    Raises:
        ValueError: if ``mode`` names none of the known splits.
    """

    # split keyword -> (image array name, label array name) inside the archive
    _SPLIT_KEYS = {
        "train": ("arr_0", "arr_1"),
        "val": ("arr_2", "arr_3"),
        "test": ("arr_4", "arr_5"),
    }

    def __init__(self, mode="", ray_tune=False):
        split = next((name for name in self._SPLIT_KEYS if name in mode), None)
        if split is None:
            raise ValueError(
                f"mode must contain one of {list(self._SPLIT_KEYS)}, got {mode!r}"
            )

        # Ray Tune requires an absolute path
        # go back 2 folders since ray goes 2 deeper.
        # Only resolved when needed, so a shallow cwd cannot break the plain case.
        root = Path.cwd().parents[1] if ray_tune else Path(".")

        image_key, label_key = self._SPLIT_KEYS[split]
        # The context manager closes the archive once both arrays are read.
        with np.load(root / "data" / "hw2_Q2_and_Q3_data.npz") as data:
            # Transposed so that axis 0 is what __getitem__ indexes
            # (presumably stored as features x samples -- confirm against the file).
            images = data[image_key].T
            self.labels = data[label_key]

        self.images = np.float32(images) / 255.0

    def __len__(self):
        """Number of samples, taken from the label array."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for ``idx``; tensor indices are converted to lists."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = self.images[idx, :]
        labels = self.labels[idx]
        return sample, labels

# Example on how to load data

In [None]:
b_size = 100

# One loader per split, built in train/val/test order; only the training
# split is reshuffled between epochs.
train_loader, val_loader, test_loader = (
    DataLoader(
        Q2Data(split),
        batch_size=b_size,
        num_workers=n_workers,
        shuffle=reshuffle,
    )
    for split, reshuffle in (("train", True), ("val", False), ("test", False))
)

# Exploring our data

In [None]:
# Grab a single shuffled batch of 8 training images to look at.
tmp_loader = DataLoader(Q2Data("train"), batch_size=8, num_workers=4, shuffle=True)
image_batch, labels = next(iter(tmp_loader))

fig, ax_arr = plt.subplots(2, 4)
# ax_arr.flat walks the 2x4 grid row by row, i.e. in the order [i // 4, i % 4].
for i, axis in enumerate(ax_arr.flat):
    img = image_batch[i].numpy()
    # each sample is a flat vector that is viewed as a 28x28 grayscale image
    axis.imshow(img.reshape([28, 28]), cmap="gray")
    # axis.axis("off")
    axis.axes.get_yaxis().set_visible(False)
    # the class label is shown underneath its image
    axis.set_xlabel(labels[i].item())
fig.set_size_inches(20, 10)
# plt.subplots_adjust(wspace=0.01, hspace=0.01)
plt.show()
labels

In [None]:
# # This was mainly how to use torchvision's make_grid()
# # (8, 784) -> (8,1,28,28)
# img = image_batch.reshape(-1, 28, 28).unsqueeze(1)
# out = torchvision.utils.make_grid(img, nrow=4)
# fig, ax = plt.subplots(figsize=(20, 10))
# ax.imshow(out.permute(1, 2, 0), interpolation="nearest", aspect="auto")
# ax.axis("off")
# # plt.imshow(out.numpy().transpose((1, 2, 0)))

# Defining the main functions

In [None]:
def plot_log(log, model_config, select=False, save=False):
    """Plot the training curves and optionally mark the best saved checkpoints.

    Args:
        log: dict with per-epoch sequences under ``"train_loss"``,
            ``"val_loss"`` and ``"acc"`` (the dict returned by ``train_model``).
        model_config: dict providing ``"num_epochs"``, plus ``"save_interval"``
            when ``select`` is True and ``"lr"`` when ``save`` is True.
        select: when True, look only at epochs for which a checkpoint was
            saved and mark the minimum of every ``*loss*`` curve and the
            maximum of every ``*acc*`` curve.
        save: when True, write the figure to ``./LR_<lr>_<num_epochs>.jpg``.

    Returns:
        dict mapping each marked log key to the 0-based epoch index of its
        best checkpoint. Empty when ``select`` is False.
    """
    fig, ax = plt.subplots()

    # Defined up front so the final return also works when select is False.
    selected = {}
    if select:
        interval = model_config["save_interval"]
        for key in log:
            if "loss" in key:
                pick, label = np.argmin, "Min "
            elif "acc" in key:
                pick, label = np.argmax, "Max "
            else:
                # no selection criterion is defined for other series
                continue

            # Checkpoints exist for epochs interval, 2*interval, ... which are
            # the 0-based positions interval-1, 2*interval-1, ...
            candidates = log[key][interval - 1 :: interval]
            if len(candidates) == 0:
                # fewer epochs than one save interval: nothing was saved
                continue
            idx = pick(candidates)

            # map the position among the checkpoints back to an epoch index
            actual_idx = (idx + 1) * interval - 1
            selected[key] = actual_idx

            print(key, actual_idx)
            print(actual_idx, log.get(key)[actual_idx])

            ax.plot(
                # the curves below use 1-based epoch numbers on the x axis
                actual_idx + 1,
                log.get(key)[actual_idx],
                label=label + key,
                markersize=16,
                marker="X",
            )

    epochs = model_config.get("num_epochs")
    x_axis = np.linspace(1, epochs, epochs)

    ax.plot(x_axis, log.get("train_loss"), label="Train Loss")
    ax.plot(x_axis, log.get("val_loss"), label="Validation Loss")
    ax.plot(x_axis, log.get("acc"), label="Validation Accuracy")

    ax.set_ylabel("Loss")
    ax.set_xlabel("Epochs")

    fig.set_figheight(10)
    fig.set_figwidth(16)

    ax.legend(loc="best", prop={"size": 12})
    if save:
        plt.savefig(f"./LR_{model_config['lr']}_{model_config['num_epochs']}.jpg")
    plt.show()
    return selected

In [None]:
def test_model(net, data_generator, loss_fn):
    """Function to easily test model on specified dataset"""

    device = "cuda" if torch.cuda.is_available() else "cpu"

    with torch.no_grad():
        batch_loss, batch_steps = 0.0, 0
        correct_pred, total_pred = 0, 0

        for batch_id, (data, label) in enumerate(data_generator):
            data, label = data.to(device), label.to(device)

            output = net(data)
            batch_loss += loss_fn(output, label).item()
            batch_steps += 1

            # indices where probability is maximum
            _, val_pred = torch.max(output, 1)

            correct_pred += (val_pred == label).sum().item()
            total_pred += label.shape[0]

        # average loss/acc across ALL batches
        # i.e. ACROSS specified dataset
        avg_loss = batch_loss / batch_steps
        avg_acc = correct_pred / total_pred

    return avg_loss, avg_acc

In [None]:
def train_model(config):
    """Train ``config["model"]`` with SGD, validating after every epoch.

    The original training function has been modified in order to use Ray's Tune.

    Args:
        config: dict with the keys ``model`` (a module exposing ``_name``),
            ``lr``, ``momentum``, ``weight_decay``, ``batch_size``,
            ``num_epochs``, ``lr_variable`` (enable a ``MultiStepLR``
            schedule), ``log_training``, ``log_interval``, ``save_model``,
            ``save_interval`` and ``ray_tune_enabled``. Optional keys
            ``lr_milestones`` (default ``[80]``) and ``lr_gamma`` (default
            ``0.1``) tune the schedule.

    Returns:
        The dict of per-epoch ``train_loss`` / ``val_loss`` / ``acc`` arrays
        when ``config["log_training"]`` is truthy, otherwise ``None``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    n_workers = 4 * torch.cuda.device_count()
    num_epochs = config["num_epochs"]

    logger = {
        "train_loss": np.zeros(num_epochs),
        "val_loss": np.zeros(num_epochs),
        "acc": np.zeros(num_epochs),
    }

    #### LOAD DATA ####
    ray_tune = config["ray_tune_enabled"]
    b_size = config["batch_size"]

    train_data = Q2Data("train", ray_tune)
    train_dataloader = DataLoader(
        train_data,
        batch_size=b_size,
        num_workers=n_workers,
        shuffle=True,
        pin_memory=False,
    )

    val_data = Q2Data("val", ray_tune)
    val_dataloader = DataLoader(
        val_data,
        batch_size=b_size,
        num_workers=n_workers,
        # Fixed order: the validation loss is a mean of per-batch means, so
        # shuffling would make it vary with the batch composition.
        shuffle=False,
        pin_memory=False,
    )

    tracker = CarbonTracker(epochs=num_epochs)

    #### INSTANTIATE MODEL ####
    net = config["model"].to(device)

    loss_function = nn.CrossEntropyLoss()

    optimizer = optim.SGD(
        net.parameters(),
        lr=config["lr"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
    )

    scheduler = None
    if config["lr_variable"]:
        # what approximate epoch does convergence occur? 80 in this case
        scheduler = lr_scheduler.MultiStepLR(
            optimizer,
            milestones=config.get("lr_milestones", [80]),
            gamma=config.get("lr_gamma", 0.1),
        )

    # Create the checkpoint folder once, and only when checkpoints are written.
    checkpoint_dir = Path.cwd() / "models" / net._name
    if config["save_model"]:
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

    lr_str = "VarLR" if config["lr_variable"] else "FixedLR"
    # width used to zero-pad checkpoint numbers, i.e. epoch 10 named as 0010
    pad_width = len(str(num_epochs))
    # The log line separates its fields by 17 spaces; kept byte-for-byte.
    field_gap = " " * 17

    #### BEGIN TRAINING ####
    start_time = time.time()
    try:
        for j in range(num_epochs):
            tracker.epoch_start()
            ## START OF BATCH ##
            train_loss, train_steps = 0.0, 0
            for data, label in train_dataloader:
                data, label = data.to(device), label.to(device)

                output = net(data)

                loss = loss_function(output, label)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                train_steps += 1

            ## END OF BATCH ##

            # average training loss for 1 epoch
            train_loss /= train_steps

            # test model on validation dataset
            val_loss, val_acc = test_model(net, val_dataloader, loss_function)

            # send current training result back to Tune
            if config["ray_tune_enabled"]:
                tune.report(loss=(train_loss), accuracy=(val_acc))

            logger["train_loss"][j] = train_loss
            logger["val_loss"][j] = val_loss
            logger["acc"][j] = val_acc

            if config["log_training"] and (j + 1) % config["log_interval"] == 0:
                print(
                    f"Epoch:{j+1}/{config['num_epochs']}{field_gap}"
                    f"Train Loss: {logger['train_loss'][j]:.6f}{field_gap}"
                    f"Val Loss: {logger['val_loss'][j]:.6f}{field_gap}"
                    f"Acc: {logger['acc'][j]:.6f}"
                )

            if config["save_model"] and (j + 1) % config["save_interval"] == 0:
                checkpoint_num = str(j + 1).zfill(pad_width)
                model_path = (
                    checkpoint_dir / f"{net._name}_{lr_str}_{checkpoint_num}.pt"
                )
                torch.save(net.state_dict(), model_path)

            # this is used only to vary learning rate during training
            if scheduler is not None:
                scheduler.step()

            tracker.epoch_end()
    finally:
        # Stop the tracker even on early termination (error or interrupt) so
        # that the consumption measured so far is still reported.
        tracker.stop()

    print(f"{config['num_epochs']} epochs took {time.time() - start_time:.2f}s")

    if config["log_training"]:
        return logger
    return None

# ShallowMLP

In [None]:
# create folder for model storage; safe to re-run and also creates "models/" when it is missing
Path("models/ShallowMLP").mkdir(parents=True, exist_ok=True)

## Defining the network

In [None]:
class ShallowMLP(nn.Module):
    """Fully connected network 784 -> 32 -> 10 with one ReLU hidden layer."""

    def __init__(self):
        super().__init__()
        # The attribute names are also the state_dict keys of saved checkpoints.
        self.fc1 = nn.Linear(784, 32)
        self.fc2 = nn.Linear(32, 10)

        # class name, used to build checkpoint folder and file names
        self._name = type(self).__name__

    def forward(self, x):
        """Map a batch of 784-feature inputs to 10 raw (un-normalised) scores."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# SEARCHING FOR IDEAL PARAMETERS

## Ray Tune

In [None]:
# assert False  # remove to make cell work
# search_space = {
#     "lr": tune.loguniform(1e-4, 1e-2),
#     "lr_variable": False,
#     "momentum": 0.9,
#     "weight_decay": 1e-4,
#     "batch_size": tune.choice([64, 128, 256]),
#     "log_training": False,
#     "log_interval": 10,
#     "save_model": False,
#     "save_interval": 10,
#     "num_epochs": 500,
#     "ray_tune_enabled": True,
# }

# # enable early stopping
# asha_scheduler = ASHAScheduler(max_t=500, grace_period=50)
# # number of samples to run
# n_samples = 50
# # run training with Tune
# analysis = tune.run(
#     train_model,
#     num_samples=n_samples,
#     config=search_space,
#     resources_per_trial={"gpu": 1},
#     scheduler=asha_scheduler,
#     metric="accuracy",
#     mode="max",
#     local_dir="./",
# )

## linspaced LRs

In [None]:
# lr_list = np.linspace(1,50,50) * 1e-3
# summary = []
# for cur_lr in lr_list:
#     # create new instance for every lr change
#     net = ShallowMLP().to(device)
#     model_config = {
#         "model": net,
#         "lr": cur_lr,
#         "lr_variable": False,
#         "momentum": 0.9,
#         "weight_decay": 1e-4,
#         "batch_size": 128,
#         "log_training": True,
#         "log_interval": 10,
#         "save_model": False,
#         "save_interval": 10,
#         "num_epochs": 300,
#         "ray_tune_enabled": False,
#     }
#     log = train_model(model_config)
#     selected= plot_log(log, model_config, select=True, save=True)
#     summary.append(selected)

# ShallowMLP FixedLR

### Training

In [None]:
# Fresh ShallowMLP trained with a constant learning rate.
net = ShallowMLP().to(device)
model_config = dict(
    model=net,
    # optimiser settings
    lr=1e-3,
    lr_variable=False,
    momentum=0.9,
    weight_decay=0,
    batch_size=128,
    # console logging every `log_interval` epochs
    log_training=True,
    log_interval=10,
    # checkpoint every `save_interval` epochs
    save_model=True,
    save_interval=10,
    num_epochs=1500,
    ray_tune_enabled=False,
)

In [None]:
# log = train_model(model_config)
# plot_log(log, model_config, select=True)

### Eval

In [None]:
model = ShallowMLP().to(device)
model.eval()
# select 210, 240, 1490 models
model_path = f"models/{model._name}/{model._name}_FixedLR_1490.pt"
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine as well
model.load_state_dict(torch.load(model_path, map_location=device))

test_data = Q2Data("test", ray_tune=False)
test_dataloader = DataLoader(
    test_data,
    batch_size=model_config.get("batch_size"),
    num_workers=n_workers,
    # sample order does not influence the accuracy, so no shuffling is needed
    shuffle=False,
    pin_memory=False,
)

# arbitrary loss function: test_model requires one, but only the accuracy is used here
loss_func = nn.CrossEntropyLoss()
_, acc = test_model(model, test_dataloader, loss_func)
print(f"Accuracy on test dataset: {acc}")

## Variable (Multistep) Learning Rate

### Training

In [None]:
# Fresh ShallowMLP trained with the multi-step learning-rate schedule.
net = ShallowMLP().to(device)
model_config = dict(
    model=net,
    # optimiser settings; lr_variable switches the schedule on
    lr=1e-3,
    lr_variable=True,
    momentum=0.9,
    weight_decay=0,
    batch_size=128,
    # console logging every `log_interval` epochs
    log_training=True,
    log_interval=10,
    # checkpoint every `save_interval` epochs
    save_model=True,
    save_interval=10,
    num_epochs=1500,
    ray_tune_enabled=False,
)

In [None]:
# log = train_model(model_config)
# plot_log(log, model_config, select=True)

### Eval

In [None]:
model = ShallowMLP().to(device)
model.eval()

# load 1270, 1430 and 1500 models
model_path = f"models/{model._name}/{model._name}_VarLR_1500.pt"
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine as well
model.load_state_dict(torch.load(model_path, map_location=device))

test_data = Q2Data("test", ray_tune=False)
test_dataloader = DataLoader(
    test_data,
    batch_size=model_config.get("batch_size"),
    num_workers=n_workers,
    # sample order does not influence the accuracy, so no shuffling is needed
    shuffle=False,
    pin_memory=False,
)

# arbitrary loss function: test_model requires one, but only the accuracy is used here
loss_func = nn.CrossEntropyLoss()
_, acc = test_model(model, test_dataloader, loss_func)
# same report format as the other evaluation cells
print(f"Accuracy on test dataset: {acc}")

# DeepMLP

In [None]:
# create folder for model storage; safe to re-run and also creates "models/" when it is missing
Path("models/DeepMLP").mkdir(parents=True, exist_ok=True)

## Defining the network

In [None]:
class DeepMLP(nn.Module):
    """Fully connected network 784 -> 32 -> 64 -> 32 -> 10 with ReLU between layers."""

    def __init__(self):
        super().__init__()
        # The attribute names are also the state_dict keys of saved checkpoints.
        self.fc1 = nn.Linear(784, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 10)

        # class name, used to build checkpoint folder and file names
        self._name = type(self).__name__

    def forward(self, x):
        """Map a batch of 784-feature inputs to 10 raw (un-normalised) scores."""
        # every hidden layer is followed by a ReLU; the output layer is not
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden_layer(x))
        return self.fc4(x)

## Train

In [None]:
# Fresh DeepMLP trained with the multi-step learning-rate schedule.
net = DeepMLP().to(device)
model_config = dict(
    model=net,
    # optimiser settings; lr_variable switches the schedule on
    lr=1e-3,
    lr_variable=True,
    momentum=0.9,
    weight_decay=0,
    batch_size=128,
    # console logging every `log_interval` epochs
    log_training=True,
    log_interval=10,
    # checkpoint every `save_interval` epochs
    save_model=True,
    save_interval=10,
    num_epochs=1500,
    ray_tune_enabled=False,
)

In [None]:
# log = train_model(model_config)
# _ = plot_log(log, model_config, select=True)

In [None]:
# `log` only exists when train_model(model_config) was run in this session;
# the guard keeps "Restart Kernel -> Run All" from stopping on a NameError.
if "log" in globals():
    _ = plot_log(log, model_config, select=True)
else:
    print("No training log in this session - run train_model(model_config) first.")

## Eval

In [None]:
model = DeepMLP().to(device)
model.eval()

# load 1240 model
model_path = f"models/{model._name}/{model._name}_VarLR_1240.pt"
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine as well
model.load_state_dict(torch.load(model_path, map_location=device))

test_data = Q2Data("test", ray_tune=False)
test_dataloader = DataLoader(
    test_data,
    batch_size=model_config.get("batch_size"),
    num_workers=n_workers,
    # sample order does not influence the accuracy, so no shuffling is needed
    shuffle=False,
    pin_memory=False,
)

# arbitrary loss function: test_model requires one, but only the accuracy is used here
loss_func = nn.CrossEntropyLoss()
_, acc = test_model(model, test_dataloader, loss_func)
print(f"Accuracy on test dataset: {acc}")

# DeepWideMLP

In [None]:
class DeepWideMLP(nn.Module):
    """Fully connected network 784 -> 128 -> 64 -> 128 -> 10 with ReLU between layers."""

    def __init__(self):
        super().__init__()
        # The attribute names are also the state_dict keys of saved checkpoints.
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 128)
        self.fc4 = nn.Linear(128, 10)

        # class name, used to build checkpoint folder and file names
        self._name = type(self).__name__

    def forward(self, x):
        """Map a batch of 784-feature inputs to 10 raw (un-normalised) scores."""
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        h3 = F.relu(self.fc3(h2))
        # no activation on the output layer
        return self.fc4(h3)

## Train the model

In [None]:
# Fresh DeepWideMLP trained with the multi-step learning-rate schedule.
net = DeepWideMLP().to(device)
model_config = dict(
    model=net,
    # optimiser settings; lr_variable switches the schedule on
    lr=1e-3,
    lr_variable=True,
    momentum=0.9,
    weight_decay=0,
    batch_size=128,
    # console logging every `log_interval` epochs
    log_training=True,
    log_interval=10,
    # checkpoint every `save_interval` epochs
    save_model=True,
    save_interval=10,
    num_epochs=1500,
    ray_tune_enabled=False,
)

In [None]:
# log = train_model(model_config)
# _ = plot_log(log, model_config, select=True)

## Eval

In [None]:
model = DeepWideMLP().to(device)
model.eval()

# load 1010 model
model_path = f"models/{model._name}/{model._name}_VarLR_1010.pt"
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine as well
model.load_state_dict(torch.load(model_path, map_location=device))

test_data = Q2Data("test", ray_tune=False)
test_dataloader = DataLoader(
    test_data,
    batch_size=model_config.get("batch_size"),
    num_workers=n_workers,
    # sample order does not influence the accuracy, so no shuffling is needed
    shuffle=False,
    pin_memory=False,
)

# arbitrary loss function: test_model requires one, but only the accuracy is used here
loss_func = nn.CrossEntropyLoss()
_, acc = test_model(model, test_dataloader, loss_func)
print(f"Accuracy on test dataset: {acc}")

# Number of parameters

In [None]:
# Draw one batch from the exploration loader (`tmp_loader`, defined in the
# "Exploring our data" cell); only the images are kept, because the summaries
# below read the per-sample feature count from image_batch.shape[1].
image_batch, _ = next(iter(tmp_loader))

In [None]:
# Layer-by-layer parameter count of a freshly initialised DeepMLP.
# NOTE(review): torchsummary's input_size presumably excludes the batch
# dimension, in which case tmp_b_size acts as an extra leading dimension
# rather than a batch size -- confirm against the torchsummary docs.
tmp_b_size = 1
summary(DeepMLP().to(device), (tmp_b_size, image_batch.shape[1]))

In [None]:
# Layer-by-layer parameter count of a freshly initialised DeepWideMLP.
# NOTE(review): torchsummary's input_size presumably excludes the batch
# dimension, in which case tmp_b_size acts as an extra leading dimension
# rather than a batch size -- confirm against the torchsummary docs.
tmp_b_size = 1
summary(DeepWideMLP().to(device), (tmp_b_size, image_batch.shape[1]))