# Problem1

## (a)

$
\nu_\beta(x) = \log [(\sum _{i=1}^{n} \exp(\beta x_i)) ^ \frac{1}{\beta}] \\
\lim_{\beta \to \infty} \nu_\beta(x) = \lim_{\beta \to \infty} \frac {\log[(\sum _{i=1}^{n} \exp(\beta x_i))]} {\beta} \\
$
By L'Hopital's rule, we have \
$
\lim_{\beta \to \infty} \nu_\beta(x) = \lim_{\beta \to \infty} \frac {\sum _{i=1}^{n} x_i \exp(\beta x_i)} {\sum _{i=1}^{n} \exp(\beta x_i)} \\
= \lim_{\beta \to \infty} \frac {\sum _{i=1}^{n} x_i \exp(\beta (x_i - \max_j x_j))} {\sum _{i=1}^{n} \exp(\beta (x_i - \max_j x_j))} \\
= \frac {\sum _{i=1}^{n} x_i 1_{x_i = \max_j x_j}} {\sum _{i=1}^{n} 1_{x_i = \max_j x_j}} \\
= \max_i x_i
$

## (b)
$
\frac \partial {\partial x_j} \nu_1(x) = \frac \partial {\partial x_j} \log(\sum _{i=1}^{n} \exp(x_i)) \\
= \frac {\exp(x_j)} {\sum _{i=1}^{n} \exp(x_i)} \\
= \mu(x)_j
$

$
\nabla \nu_1(x) 
= \mu(x) \\
$

## (c)
$
(\nabla \nu_\beta(x))_j = \frac {\exp(\beta x_j)} {\sum _{i=1}^{n} \exp(\beta x_i)} \\
$

$
\lim_{\beta \to \infty} (\nabla \nu_\beta(x))_j = \lim_{\beta \to \infty} \frac {\exp(\beta x_j)} {\sum _{i=1}^{n} \exp(\beta x_i)} \\
=\lim_{\beta \to \infty} \frac {\exp(\beta (x_j - x_{i_{max}}))} {\sum _{i=1}^{n} \exp(\beta (x_i - x_{i_{max}}))} \\
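= \delta_{j, i_{max}} \\
= (e_{i_{max}})_j \\
$

Here $i_{max} = \arg\max_i x_i$ is assumed to be unique, so $\lim_{\beta \to \infty} \nabla \nu_\beta(x) = e_{i_{max}}$.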

# Problem2

In [14]:
import torch
import torch.nn as nn


class AlexNet(nn.Module):
    """AlexNet-style CNN: a five-layer convolutional trunk and a three-layer MLP head.

    Args:
        num_classes: Number of output features of the last linear layer.
    """

    def __init__(self, num_classes: int = 1000):
        super().__init__()

        # Convolutional trunk. Every conv is followed by a ReLU; a 3x3/stride-2
        # max-pool follows the 1st, 2nd and 5th conv.
        trunk_layers = [
            nn.Conv2d(3, 64, kernel_size=11, stride=4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*trunk_layers)

        # Pool whatever spatial size the trunk produced down to a fixed 6x6 grid
        # so the first linear layer always sees 256 * 6 * 6 inputs.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))

        head_layers = [
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x: torch.Tensor):
        """Run trunk -> adaptive pool -> flatten (keeping dim 0) -> classifier head."""
        pooled = self.avgpool(self.features(x))
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)



# Instantiate AlexNet and compare how many trainable parameters live in the
# convolutional trunk versus the fully connected classifier.
model = AlexNet()


def _count_trainable(module):
    """Return the total number of scalar entries in `module`'s trainable parameters."""
    return sum(p.numel() for p in module.parameters() if p.requires_grad)


conv_params = _count_trainable(model.features)
print('conv_params: ', conv_params)
linear_params = _count_trainable(model.classifier)
print('linear_params: ', linear_params)
# Sanity check: the two parts should account for every parameter of the model
# (the adaptive average pool has none).
print('conv_params+linear_params: ', conv_params + linear_params)
total_params = _count_trainable(model)
print('total_params: ', total_params)
print('conv_params: {}%'.format((conv_params/total_params)*100))
# The label used to end in a stray ": " after the percent sign.
print('linear_params: {}%'.format((linear_params/total_params)*100))

# Count Multiplications and Additions
# Convolutions
def _window_output_size(size, kernel, stride, padding=0):
    """Side length produced by sliding a `kernel`-wide window over `size` pixels."""
    return (size - kernel + padding * 2) // stride + 1


features = model.features
input_size = 227  # side length of the square input image fed to the first conv
multiplications_conv = 0
for feature in features:
    if isinstance(feature, nn.Conv2d):
        kernel = feature.kernel_size[0]
        output_size = _window_output_size(input_size, kernel, feature.stride[0], feature.padding[0])
        # Every output pixel of every output channel needs
        # in_channels * kernel^2 multiplications.
        multiplications_conv += feature.in_channels * feature.out_channels * (kernel ** 2) * (output_size ** 2)
        input_size = output_size
    elif isinstance(feature, nn.MaxPool2d):
        # Pooling costs no multiplications but shrinks the map seen by later convs.
        output_size = _window_output_size(input_size, feature.kernel_size, feature.stride)
        input_size = output_size
additions_conv = multiplications_conv # Each multiplication has a corresponding addition
print('Conv Multiplications: ', multiplications_conv)
print('Conv Additions: ', additions_conv)

# Linear Layers
classifier = model.classifier
linear_layers = [layer for layer in classifier if isinstance(layer, nn.Linear)]
# A dense layer performs one multiplication (and one matching addition) per weight.
multiplications_lin = sum(layer.in_features * layer.out_features for layer in linear_layers)
additions_lin = sum(layer.out_features * layer.in_features for layer in linear_layers)
print('Linear Multiplications: ', multiplications_lin)
print('Linear Additions: ', additions_lin)

# Share of the network's arithmetic that happens in the linear layers.
lin_mult_share = multiplications_lin/(multiplications_conv + multiplications_lin)
lin_add_share = additions_lin/(additions_conv + additions_lin)
print('Linear Multiplications Percentage: {}%'.format((lin_mult_share)*100))
print('Linear Additions Percentage: {}%'.format((lin_add_share)*100))

conv_params:  2469696
linear_params:  58631144
conv_params+linear_params:  61100840
total_params:  61100840
conv_params: 4.042000077249347%
linear_params: 95.95799992275064%: 
Conv Multiplications:  655566528
Conv Additions:  655566528
Linear Multiplications:  58621952
Linear Additions:  58621952
Linear Multiplications Percentage: 8.208190644576066%
Linear Additions Percentage: 8.208190644576066%


# Problem3

In [39]:
import torch.nn as nn
from torch.utils.data import DataLoader
import torch
import torchvision
import torchvision.transforms as transforms


# Instantiate model with BN and load trained parameters
class smallNetTrain(nn.Module) :
    """Small CIFAR-10 network with BatchNorm after both convs and the first linear layer.

    Layout: conv(3->16) -> BN -> ReLU -> conv(16->16) -> BN -> ReLU
            -> flatten -> linear(16*32*32 -> 32*32) -> BN -> ReLU -> linear(32*32 -> 10) -> ReLU.
    """

    # CIFAR-10 data is 32*32 images with 3 RGB channels
    def __init__(self, input_dim=3*32*32) :
        # `input_dim` is accepted but not used by the layer definitions below.
        super().__init__()

        # Both convs use 3x3 kernels with padding 1.
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
        )
        self.fc1 = nn.Sequential(
            nn.Linear(16*32*32, 32*32),
            nn.BatchNorm1d(32*32),
            nn.ReLU(),
        )
        self.fc2 = nn.Sequential(
            nn.Linear(32*32, 10),
            nn.ReLU(),
        )

    def forward(self, x) :
        """Apply conv1, conv2, flatten to rows of 16*32*32 values, then fc1 and fc2."""
        conv_out = self.conv2(self.conv1(x))
        flat = conv_out.float().view(-1, 16*32*32)
        return self.fc2(self.fc1(flat))
    
# Build the BatchNorm network and restore its trained parameters, mapping all
# stored tensors onto the CPU.
model = smallNetTrain()
trained_state = torch.load("./smallNetSaved",map_location=torch.device('cpu'))
model.load_state_dict(trained_state)


# Instantiate model without BN
class smallNetTest(nn.Module) :
    """Small CIFAR-10 network with the same conv/linear layers as smallNetTrain but no BatchNorm.

    Layout: conv(3->16) -> ReLU -> conv(16->16) -> ReLU
            -> flatten -> linear(16*32*32 -> 32*32) -> ReLU -> linear(32*32 -> 10) -> ReLU.
    """

    # CIFAR-10 data is 32*32 images with 3 RGB channels
    def __init__(self, input_dim=3*32*32) :
        # `input_dim` is accepted but not used by the layer definitions below.
        super().__init__()

        # Both convs use 3x3 kernels with padding 1.
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
        )
        self.fc1 = nn.Sequential(
            nn.Linear(16*32*32, 32*32),
            nn.ReLU(),
        )
        self.fc2 = nn.Sequential(
            nn.Linear(32*32, 10),
            nn.ReLU(),
        )

    def forward(self, x) :
        """Apply conv1, conv2, flatten to rows of 16*32*32 values, then fc1 and fc2."""
        conv_out = self.conv2(self.conv1(x))
        flat = conv_out.float().view(-1, 16*32*32)
        return self.fc2(self.fc1(flat))
    
model_test = smallNetTest()


# Initialize weights of model without BN

def _fold_batchnorm(layer, bn):
    """Fold an eval-mode BatchNorm into the conv/linear layer that feeds it.

    In eval mode BatchNorm maps a pre-activation y to
        gamma * (y - running_mean) / sqrt(running_var + eps) + beta
    per output channel/feature. With y = W x + b this is again an affine map
    of x, so it can be expressed as a single layer with rescaled weights and a
    shifted bias.

    Args:
        layer: The nn.Conv2d or nn.Linear module whose output is normalised.
            Its weight's first dimension must index output channels/features,
            and it must have a bias.
        bn: The affine BatchNorm module applied to `layer`'s output; its
            running statistics and its own `eps` are used.

    Returns:
        A `(weight, bias)` pair of tensors shaped like `layer.weight` and
        `layer.bias`.
    """
    std = torch.sqrt(bn.running_var + bn.eps)
    # One scale per output channel: reshape to (C, 1, ..., 1) so it broadcasts
    # over the remaining weight dimensions (3 for conv, 1 for linear).
    per_channel = [-1] + [1] * (layer.weight.dim() - 1)
    weight = layer.weight * bn.weight.view(*per_channel) / std.view(*per_channel)
    bias = (layer.bias - bn.running_mean) / std * bn.weight + bn.bias
    return weight, bias


# Initialize the following parameters
# Everything below only moves numbers between the two models, so autograd is
# switched off, and values are copied in place so that model_test never shares
# storage with model.
with torch.no_grad():
    for block_name in ("conv1", "conv2", "fc1"):
        trained_block = getattr(model, block_name)      # [layer, BatchNorm, ReLU]
        plain_block = getattr(model_test, block_name)   # [layer, ReLU]
        folded_weight, folded_bias = _fold_batchnorm(trained_block[0], trained_block[1])
        plain_block[0].weight.copy_(folded_weight)
        plain_block[0].bias.copy_(folded_bias)

    # fc2 is not followed by BatchNorm, so its parameters carry over unchanged.
    model_test.fc2[0].weight.copy_(model.fc2[0].weight)
    model_test.fc2[0].bias.copy_(model.fc2[0].bias)




# Verify difference between model and model_test

# Put the BatchNorm layers into inference mode so they use their running statistics.
model.eval()
# model_test.eval()  # not necessary since model_test has no BN or dropout


test_dataset = torchvision.datasets.CIFAR10(
    root='./cifar_10data/',
    train=False,
    transform=transforms.ToTensor(),
    download = True,
)
test_loader = DataLoader(dataset=test_dataset, batch_size=100, shuffle=False)

# Squared L2 distance between the two models' outputs, one entry per batch.
with torch.no_grad():
    diff = [
        torch.norm(model(images) - model_test(images))**2
        for images, _ in test_loader
    ]

print(max(diff)) # If less than 1e-08, you got the right answer.



'''
For debugging purposes, you may want to match the output of conv1 first before
moving on working on conv2. To do so, you can replace the forward-evaluation
functions of the two models with 
def forward(self, x) :
    x = self.conv1(x)
    return x
'''

Files already downloaded and verified
tensor(1.1230e-07)


'\nFor debugging purposes, you may want to match the output of conv1 first before\nmoving on working on conv2. To do so, you can replace the forward-evaluation\nfunctions of the two models with \ndef forward(self, x) :\n    x = self.conv1(x)\n    return x\n'

# Problem4

![](https://media.discordapp.net/attachments/947918193924636695/1234409611411329024/Screenshot_20240429_164302.jpg?ex=6630a109&is=662f4f89&hm=490fe4d7775787b604b52f8a767e644b2176bb236f5ed6f54b7a8ee28b62864a&=&format=webp&width=850&height=1118)

# Problem5

In [49]:
from turtle import down
import torch.nn as nn
import torch
import torchvision


class Net1(nn.Module):
    """Five unpadded conv layers followed by a fully connected classifier.

    The first linear layer takes 256 * 18 * 18 inputs, which is what the conv
    stack (one 7x7 and four 3x3 unpadded, stride-1 convs) yields for a 32x32 image.

    Args:
        num_classes: Number of output features of the last linear layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        conv_layers = [
            nn.Conv2d(3, 64, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.Conv2d(64, 192, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(192, 384, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(384, 256, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1),
        ]
        self.features = nn.Sequential(*conv_layers)

        dense_layers = [
            nn.Linear(256 * 18 * 18, 4096),
            nn.ReLU(),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*dense_layers)

    def forward(self, x):
        """Run the conv stack, flatten everything after dim 0, then classify."""
        flat = torch.flatten(self.features(x), 1)
        return self.classifier(flat)


class Net2(nn.Module):
    def __init__(self, num_classes=10):
        super(Net2, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.Conv2d(64, 192, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(192, 384, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(384, 256, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1),
        )

    ###########################################################  
    ### TODO: Complete initialization of self.classifier    ###
    ###        by filling in the ...                        ###
    ###########################################################
        self.classifier = nn.Sequential(
            nn.Conv2d (256, 4096, kernel_size=18, stride=1),
            nn.ReLU(),
            nn.Conv2d (4096, 4096, kernel_size=1, stride=1),
            nn.ReLU(),
            nn.Conv2d (4096, num_classes, kernel_size=1, stride=1)
        )

    def copy_weights_from(self, net1):
        with torch.no_grad():
            for i in range(0, len(self.features), 2):
                self.features[i].weight.copy_(net1.features[i].weight)
                self.features[i].bias.copy_(net1.features[i].bias)

            for i in range(len(self.classifier)):
                ####################################################
                ### TO DO: Correctly transfer weight of Net1     ###
                ####################################################
                if i % 2 == 1: continue
                self.classifier[i].weight.copy_(net1.classifier[i].weight.view(self.classifier[i].weight.size()))
                self.classifier[i].bias.copy_(net1.classifier[i].bias.view(self.classifier[i].bias.size()))

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x



model1 = Net1() # model1 randomly initialized
model2 = Net2()
model2.copy_weights_from(model1)

# Unmodified CIFAR-10 test images: on these 32x32 inputs Net2 yields a single
# 1x1 spatial position per class.
test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    transform=torchvision.transforms.ToTensor(),
    download=True
)

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=10
)

imgs, _ = next(iter(test_loader))
# Pure inference: disable autograd so no graph is recorded for these two
# very large models.
with torch.no_grad():
    logits_fc = model1(imgs)
    # Drop the trailing 1x1 spatial dims while always keeping the batch dim.
    logits_conv = model2(imgs).flatten(1)
diff = torch.mean((logits_fc - logits_conv) ** 2)
print(f"Average Pixel Difference: {diff.item()}") # should be small


# Side length of the square window that model1 (Net1) is built for.
PATCH_SIZE = 32

# Resize the test images to 36x38 so that they are larger than one patch and
# Net2 produces a whole grid of predictions.
test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    transform=torchvision.transforms.Compose([
        torchvision.transforms.Resize((36, 38)),
        torchvision.transforms.ToTensor()
        ]),
    download=True
)

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=10,
    shuffle=False
)

images, _ = next(iter(test_loader))
b, w, h = images.shape[0], images.shape[-1], images.shape[-2]
# Number of distinct top-left corners at which a full patch fits.
n_rows, n_cols = h - PATCH_SIZE + 1, w - PATCH_SIZE + 1
out1 = torch.empty((b, 10, n_rows, n_cols))
# Pure inference: without no_grad every one of the n_rows * n_cols forward
# passes would record an autograd graph that stays alive through out1.
with torch.no_grad():
    for i in range(n_rows):
        for j in range(n_cols):
            # Net1 applied to the patch whose top-left corner is (i, j) should
            # match Net2's output at spatial position (i, j).
            out1[:, :, i, j] = model1(images[:, :, i:i + PATCH_SIZE, j:j + PATCH_SIZE])
    out2 = model2(images)
diff = torch.mean((out1 - out2) ** 2)

print(f"Average Pixel Diff: {diff.item()}")


Files already downloaded and verified
Average Pixel Difference: 9.156371896770803e-17
Files already downloaded and verified
Average Pixel Diff: 1.6456181245547169e-16
