In [4]:
from torch import nn
import torch

##### Attention:
For this lab, there are no local answers for the questions in learntools. However, there are some assertions for the first 4 questions. You should pass all of the assertions before submitting to the autograding.

Some of the questions use mathematical symbols and equations to depict the scenario. These are just some basic "formal" definitions and you should understand them. In case you haven't understood them, read again and give yourself sometime to ponder. If there is any error or details in the questions that heavily confuse you, please post your concerns on Google Classroom. 

## Questions

### Q1: Given a function $f(x) = x^2 + 3x + 8$, return the gradient of $x$ when $x=2.0$ using pytorch autograd. The gradient tensor should have type torch.float32 and dim=0.

In [10]:
def exercise_1() -> torch.Tensor:
    x = torch.tensor(2.0, requires_grad=True, dtype=torch.float32)
    y = x**2 + 3*x + 8
    # Compute the gradient of y with respect to x
    y.backward()

    # Access the gradient of x
    gradient = x.grad

    return gradient

ex1_sol = exercise_1()
assert torch.equal(ex1_sol, torch.tensor(7.0))

### Q2: A typical layer in a Neural Network would look like $f(x) = w_1 x_1 + w_2 x_2 + w_3 x_3 + ... + b$, where $w_j$ is the j-th weight of the function and $b$ is its bias. This function outputs a result for a multi-dimension input, where $x_j$ is the j-th feature of the input. Implement this function, return the gradients of each weight for a given input `x` and the output using `tuple` with the formar `(output, weight_grads_tensor)`. The weights should be randomized using `torch.randn()` and the bias must be set to $5.0$. The input used by the autograder will be the same size as the one given below.

In [9]:
x = torch.tensor([1, 2, 3, 4, 5])

def exercise_2(x) -> tuple:
    # all the output and gradients are tensors

    # Ensure input x is a float tensor and requires grad
    x = x.float().requires_grad_(True)
    n_features = x.size(0)
    print(x.size(0))

    torch.manual_seed(0) # must set seed=0 before random your weight
    weights = torch.randn(n_features, dtype=torch.float32, requires_grad=True)
    bias = torch.tensor(5.0, dtype=torch.float32, requires_grad=True)

    # Calculate the output
    output = torch.sum(weights * x) + bias

    for i in range(n_features):
        print(weights[i].detach() )
        print("x +")
    print(bias.detach())
    print("output: ", "end=", output.detach())

    # Compute the gradients of the output with respect to the weights
    output.backward()

    # Access the gradients of the weights
    weight_grads_tensor = weights.grad


    return (output.detach(), weight_grads_tensor.detach())

    # return (output, weight_grads_tensor)

ex2_sol = exercise_2(x)

assert (ex2_sol[0] - torch.tensor(-3.7311) < 1e-4).item()
assert torch.equal(ex2_sol[1], torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32))

5
tensor(1.5410)
x +
tensor(-0.2934)
x +
tensor(-2.1788)
x +
tensor(0.5684)
x +
tensor(-1.0845)
x +
tensor(5.)
output:  end= tensor(-3.7311)


### Q3: Suppose each feature $x_j$ needs a separate function $f_j(x_j)$, where $f_j(x_j) = w_j x_j + b_j$. In other words, each feature requires a function, which has a separate set of 1 weight and 1 bias. Therefore, the output $y$ of your function $G(x)=[f_1(x_1), f_2(x_2), ..., f_n(x_n)]$ should be $y = [(w_1 x_1 + b_1), (w_2 x_2 + b_2), ..., (w_n x_n + b_n)]$ (**note:** these are matrices written lazily with markdown). Implement the function $G(x)$, return the output and the gradient sets for each function for a given `x`. The weights should be randomized using `torch.randn()`, and the bias is the same as $j$ of $f_j()$ (for example: $b_1 = 1.0$, $b_2 = 2.0$). The input used by the autograder will be the same size as the one given below.

In [8]:
x = torch.tensor([1, 2, 3, 4, 5])

def exercise_3(x) -> tuple:
    # all the output and gradients are tensors

    # Ensure input x is a float tensor and requires grad
    x = x.float().requires_grad_(True)
    n_features = x.size(0)

    torch.manual_seed(0) # must set seed=0 before random your weight
    weights = torch.randn(n_features, dtype=torch.float32, requires_grad=True)
    biases = torch.arange(1, n_features + 1, dtype = torch.float32, requires_grad = True)

    # Calculate the output
    output = weights * x + biases

    gradients = []
    for i in range(n_features):
        output[i].backward(retain_graph=True)
        gradients.append(weights.grad[i].item())
        print("i= ", i, end='\n')
        # for u in range(5):
        #     print(weights.grad[i].item(), end="\n")
        weights.grad.zero_()
    weight_grads_tensor = torch.tensor(gradients, dtype=torch.float32)
        

    # for i in range(n_features):
    #     print(weights[i].detach() )
    #     print("x +")
    # print(bias.detach())

    # print("output: ", output.detach())
    # print("output: ", gradients)

    return (output.detach(), weight_grads_tensor.detach())

ex3_sol = exercise_3(x)
assert torch.all((ex3_sol[0] - torch.tensor([ 2.5410,  1.4131, -3.5364,  6.2737, -0.4226 ], dtype=torch.float32)) < 5e-4).item()
assert torch.equal(ex3_sol[1], torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32))
# q3.check()

i=  0
i=  1
i=  2
i=  3
i=  4


### Q4: Using the function $f(x)$ similar to the one in question 2, implement the function with random weights and bias (must use `torch.manual_seed(0)` before each initialization). Calculate the Mean Squared Error between the `target` and your function prediction on the `data` as a loss metric, then return a tuple containing the gradients of your function weights and bias based on the calculated loss.

In [7]:
import torch

data = torch.tensor([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0], dtype=torch.float32)
target = torch.tensor([2.0], dtype=torch.float32).unsqueeze(0) # Added unsqueeze for proper broadcasting

def exercise_4(data, target) -> tuple:
    torch.manual_seed(0)
    n_features = data.size(0) # Get the size of the 1D data tensor
    weights = torch.randn(n_features, dtype=torch.float32, requires_grad=True)
    torch.manual_seed(0) # Reset seed for consistent bias initialization
    bias = torch.randn(1, dtype=torch.float32, requires_grad=True)

    # Calculate the prediction
    prediction = torch.sum(data * weights) + bias # Removed dim argument as data is 1D
    print(prediction.shape)
    print(target.shape)
    prediction = prediction.unsqueeze(0) # Ensure prediction has the same number of dimensions as target

    # Calculate the Mean Squared Error loss
    loss = torch.mean((prediction - target)**2)

    # Compute the gradients of the loss with respect to weights and bias
    loss.backward()

    return (weights.grad.detach(), bias.grad.detach())

ex4_sol = exercise_4(data, target)
assert ex4_sol[0].dim() == 1 and ex4_sol[1].dim() == 1, print("All tensors must have dim == 1")
assert torch.allclose(ex4_sol[0], torch.tensor([-85.1518, -127.7277, -170.3036, -212.8795, -255.4553, -298.0312, -340.6071, -383.1830, -425.7589, -468.3348], dtype=torch.float32), atol=5e-4)
assert torch.allclose(ex4_sol[1], torch.tensor([-42.5759], dtype=torch.float32), atol=5e-4)

print("Check passed!")

torch.Size([1])
torch.Size([1, 1])
Check passed!


### Q5: This question worth 40% the points of this lab. Your solution for this question will be graded manually by the TAs, hence you will be informed later for the total points for this lab after the deadline has been met.

**Note**: This question will not focus on getting the right results as your implementation is more important. Therefore there will be no local answer or autograding.

#### Create a small neural network consisting 2 Linear layers, each followed by a Sigmoid activation function. The template code has been provided below including the return format (`x` should be replaced by the final `tensor` after the forward pass). Your code must use layers and functions provided by `pytorch`. Initialize all necessary components that can be used during the training phase including a Binary Cross Entropy Loss function and a Gradient Descent optimizer. Train your model for 2 epochs with a proper train loop, then make a prediction on a given `val_x` and return your prediction.

**Note**: the first Linear Layer can have any `in_features` and `out_features`, but the last Linear Layer must have `out_features=2`. Your model will consume the whole input for each epoch, so you don't need to separate the input into batches.

*Context*: This part is not necessary for your implementation, but it could help you to understand the work that your network is simulating. The `train_x` consists grades for 5 subjects of 20 students, with the minimum grade is 1 and the maximum is 10. The problem is a classification task, when your model predicts if a student is good or bad based on their 5 grades. That is the reason why the final layer must have 2 output features, which correspond to the 2 classes "good" and "bad".

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

train_x = torch.randint(1, 10, (20, 5), dtype=torch.float32) # input
train_y = (train_x.sum(1) > 30).long() # target should be long for CrossEntropyLoss

val_x = torch.randint(1, 10, (10, 5), dtype=torch.float32)
val_y = (val_x.sum(1) > 30 ).long() # For accuracy check



def exercise_5(train_x, train_y, val_x) -> torch.Tensor:

    class MyNetwork(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear1 = nn.Linear(5, 10)
            self.sigmoid1 = nn.Sigmoid()
            self.linear2 = nn.Linear(10, 2) # Output size 2 for two classes
            self.sigmoid2 = nn.Sigmoid() # Sigmoid after the second linear layer

        def forward(self, x):
            x = self.linear1(x)
            x = self.sigmoid1(x)
            x = self.linear2(x)
            x = self.sigmoid2(x)
            
            return x # Return the output after the final sigmoid
            # return x[:,0] ==> will cause error for the loss function
            # because the output argument does not have the expected shape for a 2-class classification task

    loss_fn = nn.CrossEntropyLoss()
    model = MyNetwork() # model is an instant of class that inherited from nn.Module - the base class for all neural network modules in PyTorch
    optimizer = optim.SGD(model.parameters(), lr=0.01) # initializes the Stochastic Gradient Descent (SGD) optimizer.

    # train loop
    for epoch in range(2):

        model.train() # Set the model's mode for training
        optimizer.zero_grad() 
        output = model(train_x)
        loss = loss_fn(output, train_y)
        loss.backward()
        optimizer.step() #  adjusts the model's parameters. 
        # For each parameter, it updates its value based on its gradient and the learning rate
        # parameter = parameter - learning_rate * gradient
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    # make a prediction on val_x
    model.eval() # Set the model's mode for evaluation
    with torch.no_grad():
        logits = model(val_x) # Output is now after the sigmoid
        print('logits: ', logits)
        loss = loss_fn(logits, val_y)
        print("Final loss: ", loss, end="\n")
        predictions = torch.argmax(logits, dim=1) # Get the class with the highest "probability"

    return predictions

ex5_sol = exercise_5(train_x=train_x, train_y=train_y, val_x=val_x)

# print("val_x", val_x)
print("\nPredictions on val_x:")
for i in ex5_sol:
    if i == 0 :
        print('Bad', end='-')
    else:
        print('Good', end='-') 
    print(end='\n') 
# print(ex5_sol)



Epoch 1, Loss: 0.6834
Epoch 2, Loss: 0.6829
logits:  tensor([[0.5393, 0.5751],
        [0.5893, 0.5371],
        [0.5751, 0.5606],
        [0.6016, 0.5306],
        [0.5648, 0.5543],
        [0.5904, 0.5327],
        [0.6000, 0.5560],
        [0.5551, 0.5815],
        [0.5337, 0.5473],
        [0.5587, 0.5721]])
Final loss:  tensor(0.6814)

Predictions on val_x:
Good-
Bad-
Bad-
Bad-
Bad-
Bad-
Bad-
Good-
Good-
Good-
