In [38]:
import torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor


class MLP:
    """Two-layer perceptron with hand-written forward and backward passes.

    The network computes ``y_hat = g(f(x @ W1.T + b1) @ W2.T + b2)`` where
    ``f`` and ``g`` are chosen by name from ``relu | sigmoid | identity``.

    Attributes:
        parameters: plain dict holding the tensors ``W1``, ``b1``, ``W2``, ``b2``.
        grads: plain dict holding ``dJdW1``, ``dJdb1``, ``dJdW2``, ``dJdb2``.
        cache: dict of intermediate values written by ``forward`` and read by
            ``backward`` (``x``, ``z1``, ``z2``, ``z3``, ``y_hat``).
    """

    def __init__(
        self,
        linear_1_in_features,
        linear_1_out_features,
        f_function,
        linear_2_in_features,
        linear_2_out_features,
        g_function
    ):
        """
        Args:
            linear_1_in_features: the in features of first linear layer
            linear_1_out_features: the out features of first linear layer
            linear_2_in_features: the in features of second linear layer
            linear_2_out_features: the out features of second linear layer
            f_function: string for the f function: relu | sigmoid | identity
            g_function: string for the g function: relu | sigmoid | identity

        An unrecognised activation name is not an error: a message is printed
        and the identity function is used instead.
        """
        self.f_function = f_function
        self.g_function = g_function

        self.activations = {'relu': nn.ReLU(), 'sigmoid': nn.Sigmoid(),
                            'identity': nn.Identity()}

        if self.f_function not in self.activations:
            print('The function f is not valid. Defaulting to identity.')
            self.f_function = 'identity'

        if self.g_function not in self.activations:
            print('The function g is not valid. Defaulting to identity.')
            self.g_function = 'identity'

        # Weights are stored as (out_features, in_features), so the forward
        # pass multiplies by the transpose.
        self.parameters = dict(
            W1=torch.randn(linear_1_out_features, linear_1_in_features),
            b1=torch.randn(linear_1_out_features),
            W2=torch.randn(linear_2_out_features, linear_2_in_features),
            b2=torch.randn(linear_2_out_features),
        )
        self.grads = dict(
            dJdW1=torch.zeros(linear_1_out_features, linear_1_in_features),
            dJdb1=torch.zeros(linear_1_out_features),
            dJdW2=torch.zeros(linear_2_out_features, linear_2_in_features),
            dJdb2=torch.zeros(linear_2_out_features),
        )

        # Intermediate values needed by backward() are stored here.
        self.cache = dict()

    def forward(self, x):
        """Run the network on a batch and cache every intermediate value.

        Args:
            x: tensor shape (batch_size, linear_1_in_features)

        Returns:
            y_hat: tensor shape (batch_size, linear_2_out_features)
        """
        self.cache['x'] = x
        z1 = torch.matmul(x, self.parameters['W1'].t()) + self.parameters['b1']
        self.cache['z1'] = z1

        z2 = self.activations[self.f_function](z1)
        self.cache['z2'] = z2

        z3 = torch.matmul(z2, self.parameters['W2'].t())+self.parameters['b2']
        self.cache['z3'] = z3

        y_hat = self.activations[self.g_function](z3)
        self.cache['y_hat'] = y_hat

        return self.cache['y_hat']

    def grad_backprop_helper(self, func, input_):
        """Element-wise derivative of the activation ``func`` evaluated at ``input_``.

        Only the requested derivative is computed, and the result uses the
        dtype and device of ``input_`` so that non-default dtypes or devices
        do not cause mismatches further down in ``backward``.

        Args:
            func: activation name: relu | sigmoid | identity
            input_: pre-activation tensor

        Returns:
            Tensor with the same shape as ``input_``.

        Raises:
            KeyError: if ``func`` is not one of the supported names.
        """
        if func == 'relu':
            # Derivative is 1 where the input is strictly positive, else 0.
            return (input_ > 0).to(input_.dtype)
        if func == 'sigmoid':
            s = torch.sigmoid(input_)
            return s * (1 - s)
        if func == 'identity':
            return torch.ones(input_.size(), dtype=input_.dtype,
                              device=input_.device)
        raise KeyError(func)

    def backward(self, dJdy_hat):
        """Back-propagate ``dJdy_hat`` and store the results in ``self.grads``.

        Must be called after ``forward``, which fills the cache this reads.

        Args:
            dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
        """
        # Output activation g, then second linear layer.
        dydz3 = self.grad_backprop_helper(self.g_function, self.cache['z3'])
        dJdz3 = dJdy_hat * dydz3
        # The bias gradient is the per-sample gradient summed over the batch.
        self.grads['dJdb2'] = dJdz3.sum(dim=0)
        self.grads['dJdW2'] = torch.matmul(dJdz3.t(), self.cache['z2'])

        # Through W2 and the hidden activation f.
        dJdz2 = torch.matmul(dJdz3, self.parameters['W2'])
        dz2dz1 = self.grad_backprop_helper(self.f_function, self.cache['z1'])
        dJdz1 = dz2dz1 * dJdz2

        # First linear layer.
        self.grads['dJdb1'] = dJdz1.sum(dim=0)
        self.grads['dJdW1'] = torch.matmul(dJdz1.t(), self.cache['x'])

    def clear_grad_and_cache(self):
        """Zero every stored gradient in place and discard the forward cache."""
        for grad in self.grads:
            self.grads[grad].zero_()
        self.cache = dict()


def mse_loss(y, y_hat):
    """Mean squared error and its gradient with respect to the prediction.

    Args:
        y: the label tensor (batch_size, linear_2_out_features)
        y_hat: the prediction tensor (batch_size, linear_2_out_features)
    Return:
        J: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)

    The tensors may have any number of dimensions; the gradient is scaled by
    the number of elements the mean is taken over, so it stays consistent with
    the loss for non-2D inputs as well.
    """
    diff = y_hat - y
    loss = torch.pow(diff, 2).mean()
    # d/dy_hat mean(diff^2) = 2 * diff / N, with N the number of averaged elements.
    dJdy_hat = 2 * diff / diff.numel()

    return loss, dJdy_hat


def bce_loss(y, y_hat):
    """Binary cross-entropy and its gradient with respect to the prediction.

    Args:
        y_hat: the prediction tensor
        y: the label tensor

    Return:
        loss: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)

    The log terms are clamped at -100 so the loss stays finite when a
    prediction is exactly 0 or 1. The gradient is written in the algebraically
    equivalent form ``(y_hat - y) / (y_hat * (1 - y_hat))`` with the
    denominator clamped away from zero, so saturated predictions do not yield
    NaN or inf.
    """
    log_p = torch.clamp(torch.log(y_hat), min=-100)
    log_not_p = torch.clamp(torch.log(1 - y_hat), min=-100)
    loss = -(y * log_p + (1 - y) * log_not_p).mean()

    diff = y_hat - y
    # -y/p + (1-y)/(1-p) == (p - y) / (p * (1 - p)); guard the denominator.
    denom = torch.clamp(y_hat * (1 - y_hat), min=1e-12)
    dJdy_hat = diff / denom / diff.numel()

    return loss, dJdy_hat


In [39]:
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
#from mlp import MLP, mse_loss, bce_loss

# Hand-written network under test: 2 -> 41 (ReLU) -> 11 (identity).
net = MLP(
    linear_1_in_features=2,
    linear_1_out_features=41,
    f_function='relu',
    linear_2_in_features=41,
    linear_2_out_features=11,
    g_function='identity'
)
x = torch.randn(10, 2)
y = torch.randn(10, 11)

# Manual pass: forward, loss, then hand-derived gradients into net.grads.
net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = mse_loss(y, y_hat)
net.backward(dJdy_hat)

# ------------------------------------------------
# compare the result with autograd
# The reference layers are declared with the same sizes as `net` (41 hidden
# units, 11 outputs) so the declared shapes match the weights assigned below.
net_autograd = nn.Sequential(
    OrderedDict([
        ('linear1', nn.Linear(2, 41)),
        ('relu', nn.ReLU()),
        ('linear2', nn.Linear(41, 11)),
    ])
)
# Share the hand-written network's parameters with the reference model.
net_autograd.linear1.weight.data = net.parameters['W1']
net_autograd.linear1.bias.data = net.parameters['b1']
net_autograd.linear2.weight.data = net.parameters['W2']
net_autograd.linear2.bias.data = net.parameters['b2']

y_hat_autograd = net_autograd(x)

J_autograd = F.mse_loss(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()

# Each line prints tensor(True) when the manual gradient matches autograd.
print((net_autograd.linear1.weight.grad.data -
      net.grads['dJdW1']).norm() < 1e-3)
print((net_autograd.linear1.bias.grad.data - net.grads['dJdb1']).norm() < 1e-3)
print((net_autograd.linear2.weight.grad.data -
      net.grads['dJdW2']).norm() < 1e-3)
print((net_autograd.linear2.bias.grad.data - net.grads['dJdb2']).norm() < 1e-3)
# ------------------------------------------------


tensor(True)
tensor(True)
tensor(True)
tensor(True)


In [40]:
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
#from mlp import MLP, mse_loss, bce_loss

# Manual-backprop network under test: 2 -> 13 -> 22, ReLU after both layers.
net = MLP(2, 13, 'relu', 13, 22, 'relu')
x = torch.randn(10, 2)
y = torch.randn(10, 22)

# Hand-written pass: reset state, forward, loss, backward.
net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = mse_loss(y, y_hat)
net.backward(dJdy_hat)

# ------------------------------------------------
# Autograd reference with the same architecture.
net_autograd = nn.Sequential(OrderedDict(
    linear1=nn.Linear(2, 13),
    relu1=nn.ReLU(),
    linear2=nn.Linear(13, 22),
    relu2=nn.ReLU(),
))
# Positions 0 and 2 of the Sequential are linear1 and linear2; point their
# parameters at the tensors owned by the hand-written network.
net_autograd[0].weight.data, net_autograd[0].bias.data = (
    net.parameters['W1'], net.parameters['b1'])
net_autograd[2].weight.data, net_autograd[2].bias.data = (
    net.parameters['W2'], net.parameters['b2'])

y_hat_autograd = net_autograd(x)
J_autograd = F.mse_loss(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()

# Each comparison prints tensor(True) when manual and autograd gradients agree.
print(net_autograd[0].weight.grad.data.sub(net.grads['dJdW1']).norm() < 1e-3)
print(net_autograd[0].bias.grad.data.sub(net.grads['dJdb1']).norm() < 1e-3)
print(net_autograd[2].weight.grad.data.sub(net.grads['dJdW2']).norm() < 1e-3)
print(net_autograd[2].bias.grad.data.sub(net.grads['dJdb2']).norm() < 1e-3)
# ------------------------------------------------


tensor(True)
tensor(True)
tensor(True)
tensor(True)


In [41]:
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
#from mlp import MLP, mse_loss, bce_loss

# Manual-backprop network under test: 2 -> 30 -> 5, sigmoid after both layers.
net = MLP(2, 30, 'sigmoid', 30, 5, 'sigmoid')
x = torch.randn(10, 2)
# Binary targets: 1.0 where the random draw is below 0.5, else 0.0.
y = 1.0 * (torch.randn(10, 5) < 0.5)

# Hand-written pass: reset state, forward, loss, backward.
net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = bce_loss(y, y_hat)
net.backward(dJdy_hat)

# ------------------------------------------------
# Autograd reference with the same architecture.
net_autograd = nn.Sequential(OrderedDict(
    linear1=nn.Linear(2, 30),
    sigmoid1=nn.Sigmoid(),
    linear2=nn.Linear(30, 5),
    sigmoid2=nn.Sigmoid(),
))
# Positions 0 and 2 of the Sequential are linear1 and linear2; point their
# parameters at the tensors owned by the hand-written network.
net_autograd[0].weight.data, net_autograd[0].bias.data = (
    net.parameters['W1'], net.parameters['b1'])
net_autograd[2].weight.data, net_autograd[2].bias.data = (
    net.parameters['W2'], net.parameters['b2'])

y_hat_autograd = net_autograd(x)
bce_criterion = torch.nn.BCELoss()
J_autograd = bce_criterion(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()

# Each comparison prints tensor(True) when manual and autograd gradients agree.
print(net_autograd[0].weight.grad.data.sub(net.grads['dJdW1']).norm() < 1e-3)
print(net_autograd[0].bias.grad.data.sub(net.grads['dJdb1']).norm() < 1e-3)
print(net_autograd[2].weight.grad.data.sub(net.grads['dJdW2']).norm() < 1e-3)
print(net_autograd[2].bias.grad.data.sub(net.grads['dJdb2']).norm() < 1e-3)
# ------------------------------------------------


tensor(True)
tensor(True)
tensor(True)
tensor(True)


In [42]:
import cv2 as cv
import numpy as np
import torch
from torchvision import transforms
from torchvision.models import VGG13_BN_Weights, vgg13_bn
from tqdm import tqdm

DEVICE = "cuda"
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


def save_img(image, path):
    """Write the first image of a batch tensor to ``path`` with OpenCV.

    Args:
        image: batch tensor; only ``image[0]`` is used and it is treated as
            channel-first (the existing comment states (1, 3, H, W)).
        path: destination file path handed to ``cv.imwrite``.

    Values are clamped to [0, 1] before being scaled by 255 and cast to uint8,
    so anything outside that range saturates.
    """
    # Push to CPU, convert from (1, 3, H, W) into (H, W, 3)
    image = image[0].permute(1, 2, 0)
    image = image.clamp(min=0, max=1)
    image = (image * 255).cpu().detach().numpy().astype(np.uint8)
    # opencv expects BGR (and not RGB) format
    # Reversing the last axis swaps the channel order before writing.
    cv.imwrite(path, image[:, :, ::-1])


def main():
    """Optimise one input image per target label and report the resulting score.

    For each label in [0, 15, 732] this builds a noisy grey image, passes it to
    ``gradient_descent`` with an objective that selects ``tensor[0, label]`` of
    the model output, saves the returned image as ``./img_{label}.jpg`` and
    prints the softmax value of that label for the returned image.
    """
    # NOTE(review): no model.eval() call is visible here, so BatchNorm/Dropout
    # layers presumably run in training mode — confirm this is intended.
    model = vgg13_bn(VGG13_BN_Weights.IMAGENET1K_V1).to(DEVICE)
    print(model)
    for label in [0, 15, 732]:
        # Start from Gaussian noise, created channel-last and permuted below.
        image = torch.randn(1, 224, 224, 3).to(DEVICE)
        image = (image * 8 + 128) / 255  # background color = 128,128,128
        # Reorder to channel-first: (1, 224, 224, 3) -> (1, 3, 224, 224).
        image = image.permute(0, 3, 1, 2)
        image.requires_grad_()
        # The objective is the model output at index [0, label].
        image = gradient_descent(image, model, lambda tensor: tensor[0, label].mean(),)
        save_img(image, f"./img_{label}.jpg")
        out = model(image)
        print(f"ANSWER_FOR_LABEL_{label}: {out.softmax(1)[0, label].item()}")


# DO NOT CHANGE ANY OTHER FUNCTIONS ABOVE THIS LINE FOR THE FINAL SUBMISSION


def normalize_and_jitter(img, step=32):
    """Randomly shift ``img`` along its last two axes, then normalise it.

    Serves as data augmentation plus normalisation, since convnets expect
    values with mean 0 and std 1.

    Args:
        img: image tensor; the last axis is rolled by the first offset and the
            second-to-last axis by the second offset.
        step: bound for the random offsets, drawn with
            ``np.random.randint(-step, step - 1, 2)`` (upper bound exclusive).

    Returns:
        The rolled tensor normalised with IMAGENET_MEAN and IMAGENET_STD.
    """
    shift_last, shift_second_last = np.random.randint(-step, step - 1, 2)
    jittered = img.roll(shift_last, -1)
    jittered = jittered.roll(shift_second_last, -2)
    normalizer = transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
    return normalizer(jittered)


def gradient_descent(input, model, loss, iterations=256, lr=0.01):
    """Optimise an input image by repeatedly stepping along the objective's gradient.

    The input is jittered and normalised once via ``normalize_and_jitter``,
    wrapped in a ``torch.nn.Parameter`` and then updated ``iterations`` times.
    Each step ADDS ``lr`` times the gradient, so the value returned by
    ``loss`` is increased (gradient ascent on the objective).

    Args:
        input: image tensor passed to ``normalize_and_jitter``.
        model: callable mapping the image to the tensor consumed by ``loss``.
        loss: callable taking the model output and returning a scalar tensor.
        iterations: number of update steps.
        lr: step size applied to the gradient (default 0.01, the value that
            was previously hard-coded).

    Returns:
        The optimised ``torch.nn.Parameter``. It remains in the normalised
        space produced by ``normalize_and_jitter``; it is not converted back.
    """
    input = normalize_and_jitter(input)
    input = torch.nn.Parameter(input)
    for _ in range(iterations):
        logits = model(input)
        objective = loss(logits)
        # NOTE(review): backward() also fills .grad on any model parameter
        # that requires grad; those are never cleared here — confirm that is
        # acceptable for the caller.
        objective.backward()
        # Update outside autograd so the step itself is not recorded.
        with torch.no_grad():
            input += lr * input.grad
        input.grad.zero_()

    return input


def forward_and_return_activation(model, input, module):
    """
    This function is for the extra credit. You may safely ignore it.
    Given a module in the middle of the model (like `model.features[20]`),
    it will return the intermediate activations.
    Try setting the module to `model.features[20]` and the loss to `tensor[0, ind].mean()`
    to see what intermediate activations activate on.

    Args:
        model: callable network that contains ``module``.
        input: tensor fed to ``model``.
        module: sub-module whose forward output should be captured.

    Returns:
        The output of ``module`` from its first invocation during ``model(input)``.

    Raises:
        IndexError: if ``module`` was never invoked during the forward pass.
    """
    features = []

    def hook(_module, _inputs, output):
        # Record the module's output once per invocation.
        features.append(output)

    handle = module.register_forward_hook(hook)
    try:
        model(input)
    finally:
        # Always detach the hook, even if the forward pass raises, so it does
        # not stay registered on the module.
        handle.remove()

    if not features:
        raise IndexError("module was not invoked during the forward pass")
    return features[0]


# Entry point: call main() only when this code runs as the top-level script.
if __name__ == "__main__":
    main()


# NOTE(review): orphaned fragment. No enclosing `def` is visible, the
# indentation is inconsistent, and it references `self`, `ImageFilter`,
# `Image`, `ImageChops`, `BLEND_ALPHA`, `layer`, `iterations` and `lr`, none of
# which are defined or imported in the visible cells. Presumably pasted from a
# `deepDreamRecursive` method elsewhere — confirm, then restore the full
# method or remove this block.
if num_downscales > 0:
            # scale down the image
            image_small = image.filter(ImageFilter.GaussianBlur(2))
            small_size = (int(image.size[0]/2), int(image.size[1]/2))            
            if (small_size[0] == 0 or small_size[1] == 0):
                small_size = image.size
            image_small = image_small.resize(small_size, Image.ANTIALIAS)
            
            # run deepDreamRecursive on the scaled down image
            image_small = self.deepDreamRecursive(image_small, layer, iterations, lr, num_downscales-1)
            
            # Scale up the result image to the original size
            image_large = image_small.resize(image.size, Image.ANTIALIAS)
            
            # Blend the two images
            image = ImageChops.blend(image, image_large, BLEND_ALPHA)
        img_result = self.deepDream(image, layer, iterations, lr)
        img_result = img_result.resize(image.size)
        return img_result

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256