In [None]:
from torch import nn
import torch

##### Attention:
For this lab, there are no local answers for the questions in learntools. However, there are some assertions for the first 4 questions. You should pass all of the assertions before submitting to the autograder.

Some of the questions use mathematical symbols and equations to describe the scenario. These are just basic "formal" definitions and you should understand them. If you do not understand them yet, read them again and give yourself some time to ponder. If there is any error or detail in the questions that heavily confuses you, please post your concerns on Google Classroom. 

## Questions

### Q1: Given a function $f(x) = x^2 + 3x + 8$, return the gradient of $f$ with respect to $x$ at $x=2.0$ using pytorch autograd. The gradient tensor should have dtype `torch.float32` and `dim=0`.

In [None]:
def exercise_1() -> torch.Tensor:
    """Return df/dx at x = 2.0 for f(x) = x^2 + 3x + 8 using autograd.

    Returns:
        A 0-dim ``torch.float32`` tensor holding the gradient accumulated
        on the input after back-propagating through ``f``.
    """
    # Leaf tensor that autograd tracks; 0-dim so its gradient is 0-dim too.
    point = torch.tensor(2.0, dtype=torch.float32, requires_grad=True)

    # Evaluate the polynomial and back-propagate from the scalar result.
    value = point.pow(2) + 3 * point + 8
    value.backward()

    return point.grad

# Run the Q1 solution once and check it locally before autograding.
ex1_sol = exercise_1()
print(ex1_sol)  # Output will be tensor(7.)
# f'(x) = 2x + 3, so the gradient at x = 2.0 must be exactly 7.0.
assert torch.equal(ex1_sol, torch.tensor(7.0))

### Q2: A typical layer in a Neural Network would look like $f(x) = w_1 x_1 + w_2 x_2 + w_3 x_3 + ... + b$, where $w_j$ is the j-th weight of the function and $b$ is its bias. This function outputs a result for a multi-dimensional input, where $x_j$ is the j-th feature of the input. Implement this function and return the output together with the gradients of each weight for a given input `x`, as a `tuple` with the format `(output, weight_grads_tensor)`. The weights should be randomized using `torch.randn()` and the bias must be set to $5.0$. The input used by the autograder will be the same size as the one given below.

In [None]:
x = torch.tensor([1, 2, 3, 4, 5])

def exercise_2(x) -> tuple:
    """Evaluate a single linear unit on ``x`` and return its weight gradients.

    Computes ``w . x + b`` with weights drawn from ``torch.randn`` (after
    seeding the global RNG with 0) and a bias fixed at 5.0.

    Args:
        x: 1-D tensor of input features; it is cast to float32.

    Returns:
        ``(output, weight_grads)`` where ``output`` is the detached result
        and ``weight_grads`` is a copy of d(output)/d(w).
    """
    # Seeding the global RNG makes the sampled weights reproducible.
    torch.manual_seed(0)

    features = x.float()

    # One weight per input feature, plus a scalar bias of 5.0.
    w = torch.randn(features.size(0)).requires_grad_()
    b = torch.tensor(5.0, requires_grad=True)

    y = w.dot(features) + b
    y.backward()

    # Detach / clone so the caller gets plain values, not graph nodes.
    return (y.detach(), w.grad.clone())

# Test with the given input
x = torch.tensor([1, 2, 3, 4, 5])
ex2_sol = exercise_2(x)

print("Output:", ex2_sol[0])
print("Weight gradients:", ex2_sol[1])

# Verify the assertions
# The expected output is only valid for the weights drawn after
# torch.manual_seed(0) inside exercise_2, plus the fixed bias of 5.0.
assert (ex2_sol[0] - torch.tensor(-3.7311)).abs().item() < 1e-4
# d(output)/d(w_j) = x_j, so the weight gradients equal the input as float32.
assert torch.equal(ex2_sol[1], torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32))

### Q3: Suppose each feature $x_j$ needs a separate function $f_j(x_j)$, where $f_j(x_j) = w_j x_j + b_j$. In other words, each feature requires its own function, which has its own set of 1 weight and 1 bias. Therefore, the output $y$ of your function $G(x)=[f_1(x_1), f_2(x_2), ..., f_n(x_n)]$ should be $y = [(w_1 x_1 + b_1), (w_2 x_2 + b_2), ..., (w_n x_n + b_n)]$ (**note:** these are matrices written lazily with markdown). Implement the function $G(x)$ and return the output and the weight gradients of each function for a given `x`. The weights should be randomized using `torch.randn()`, and each bias $b_j$ equals the index $j$ of $f_j$ (for example: $b_1 = 1.0$, $b_2 = 2.0$). The input used by the autograder will be the same size as the one given below.

In [None]:
x = torch.tensor([1, 2, 3, 4, 5])

def exercise_3(x) -> tuple:
    """Apply an independent affine map ``w_j * x_j + b_j`` to every feature.

    Weights come from ``torch.randn`` (global RNG seeded with 0) and the
    biases are ``1.0, 2.0, ..., n`` for an input of length ``n``.

    Args:
        x: 1-D tensor of input features; it is cast to float32.

    Returns:
        ``(output, weight_grads)`` where ``output`` is the detached vector
        of per-feature results and ``weight_grads`` is a copy of the
        gradient of every ``f_j`` with respect to its own weight.
    """
    # Seeding the global RNG makes the sampled weights reproducible.
    torch.manual_seed(0)

    features = x.float()
    n_features = features.size(0)

    w = torch.randn(n_features, requires_grad=True)
    b = torch.arange(1, n_features + 1, dtype=torch.float32, requires_grad=True)

    # Element-wise: every feature only interacts with its own weight/bias.
    y = w * features + b

    # backward() needs a scalar, so back-propagate a vector of ones; this is
    # the same as calling backward() on the sum of the outputs.
    y.backward(gradient=torch.ones_like(y))

    return (y.detach(), w.grad.clone())

# Test with the given input
x = torch.tensor([1, 2, 3, 4, 5])
ex3_sol = exercise_3(x)

print("Output:", ex3_sol[0])
print("Weight gradients:", ex3_sol[1])

# Verify the assertions
# Expected outputs are w_j * x_j + b_j with the weights that torch.randn(5)
# yields after torch.manual_seed(0) (1.5410, -0.2934, -2.1788, 0.5684,
# -1.0845) and biases 1..5, e.g. 1.5410 * 1 + 1 = 2.5410.
assert torch.all((ex3_sol[0] - torch.tensor([2.5410, 1.4131, -3.5364, 6.2737, -0.4226], dtype=torch.float32)).abs() < 1e-4).item()
# d(f_j)/d(w_j) = x_j, so the weight gradients equal the input as float32.
assert torch.equal(ex3_sol[1], torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32))
# q3.check()

### Q4: Using the function $f(x)$ similar to the one in question 2, implement the function with random weights and bias (must use `torch.manual_seed(0)` before each initialization). Calculate the Mean Squared Error between the `target` and your function prediction on the `data` as a loss metric, then return a tuple containing the gradients of your function weights and bias based on the calculated loss.

In [None]:
data = torch.tensor([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0])
target = torch.tensor([2.0])

def exercise_4(data, target) -> tuple:
    """Return the MSE-loss gradients of a randomly initialised linear unit.

    The unit computes ``w . data + b``. Both the weights and the bias are
    drawn from ``torch.randn``, and the global RNG is re-seeded with 0
    before *each* of the two draws, as the task statement requires.

    Args:
        data: 1-D tensor of input features.
        target: 1-element tensor holding the regression target.

    Returns:
        ``(weight_grads, bias_grad)``: copies of d(loss)/d(w), shaped like
        ``data``, and d(loss)/d(b), shaped ``(1,)``.
    """
    # Cast once so integer inputs work with the float32 parameters.
    data = data.float()
    target = target.float()

    # Re-seed before every initialization: without the second call the bias
    # would be the 11th sample of the stream instead of the first one, and
    # the gradients would not match the reference values.
    torch.manual_seed(0)
    weights = torch.randn(data.size(0), requires_grad=True)
    torch.manual_seed(0)
    bias = torch.randn(1, requires_grad=True)

    # Linear prediction; adding the (1,)-shaped bias makes the result (1,).
    predictions = torch.dot(weights, data) + bias

    # Mean Squared Error between prediction and target.
    loss = torch.nn.functional.mse_loss(predictions, target)
    loss.backward()

    # Clone so the caller receives plain tensors detached from the parameters.
    return (weights.grad.clone(), bias.grad.clone())

# Test data
data = torch.tensor([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0])
target = torch.tensor([2.0])

# Run the function
ex4_sol = exercise_4(data, target)

print("Weight gradients:", ex4_sol[0])
print("Bias gradient:", ex4_sol[1])

# Verify assertions
assert ex4_sol[0].dim() == 1 and ex4_sol[1].dim() == 1, "All tensors must have dim == 1"
assert torch.all((ex4_sol[0] - torch.tensor([-85.1518, -127.7277, -170.3036, -212.8795, -255.4553, 
                                            -298.0312, -340.6071, -383.1830, -425.7589, -468.3348],
                                           dtype=torch.float32)).abs() < 1e-4).item()
# For MSE on a linear unit, d(loss)/d(b) = d(loss)/d(w_j) / x_j, so the bias
# gradient must be -85.1518 / 2 = -42.5759 to agree with the weights above.
assert torch.all((ex4_sol[1] - torch.tensor([-42.5759], dtype=torch.float32)).abs() < 1e-4).item()

### Q5: This question is worth 40% of the points of this lab. Your solution for this question will be graded manually by the TAs, hence you will be informed of your total points for this lab after the deadline has passed.

**Note**: This question will not focus on getting the right results as your implementation is more important. Therefore there will be no local answer or autograding.

#### Create a small neural network consisting of 2 Linear layers, each followed by a Sigmoid activation function. The template code has been provided below including the return format (`x` should be replaced by the final `tensor` after the forward pass). Your code must use layers and functions provided by `pytorch`. Initialize all necessary components that can be used during the training phase, including a Binary Cross Entropy Loss function and a Gradient Descent optimizer. Train your model for 2 epochs with a proper train loop, then make a prediction on a given `val_x` and return your prediction.

**Note**: the first Linear Layer can have any `in_features` and `out_features`, but the last Linear Layer must have `out_features=2`. Your model will consume the whole input for each epoch, so you don't need to separate the input into batches.

*Context*: This part is not necessary for your implementation, but it could help you to understand the task that your network is simulating. The `train_x` consists of the grades for 5 subjects of 20 students, where the minimum grade is 1 and the maximum is 10. The problem is a classification task, in which your model predicts whether a student is good or bad based on their 5 grades. That is the reason why the final layer must have 2 output features, which correspond to the 2 classes "good" and "bad".

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class StudentClassifier(nn.Module):
    """Two Linear layers, each followed by a Sigmoid, with 2 output units.

    Args:
        in_features: Number of input features (grades) per sample.
            Defaults to 3.
        hidden_features: Width of the hidden layer. Defaults to 5.
    """

    def __init__(self, in_features=3, hidden_features=5):
        super().__init__()
        # First linear layer: input features -> hidden units.
        self.linear1 = nn.Linear(in_features, hidden_features)
        # Second linear layer: hidden units -> 2 output units (good/bad).
        self.linear2 = nn.Linear(hidden_features, 2)

    def forward(self, x):
        """Return per-class sigmoid activations for the samples in ``x``."""
        x = torch.sigmoid(self.linear1(x))
        x = torch.sigmoid(self.linear2(x))
        return x


def correct_to_s(train_x, train_y, val_x) -> torch.Tensor:
    """Train a ``StudentClassifier`` for 2 epochs and predict on ``val_x``.

    Args:
        train_x: Training samples, one row per student; cast to float32.
        train_y: Either a 1-D tensor of class indices (0 or 1), which is
            one-hot encoded, or a tensor already shaped like the model
            output.
        val_x: Validation samples with the same feature count as
            ``train_x``; cast to float32.

    Returns:
        The model output for ``val_x``, computed without gradient tracking.
    """
    # nn.Linear parameters are float32, so integer grades must be cast first.
    train_x = train_x.float()
    val_x = val_x.float()

    # Size the first layer from the data so any number of grades per
    # student works, not only the constructor's default.
    model = StudentClassifier(in_features=train_x.size(-1))

    # Binary cross entropy on the sigmoid outputs, plain gradient descent.
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    # BCELoss needs float targets shaped like the output: one-hot encode
    # class indices so each of the 2 output units gets its own 0/1 target.
    if train_y.dim() == 1:
        train_y = torch.nn.functional.one_hot(train_y.long(), num_classes=2)
    train_y = train_y.float()

    # Training loop for 2 epochs; each epoch consumes the whole input.
    for _ in range(2):
        optimizer.zero_grad()
        outputs = model(train_x)
        loss = criterion(outputs, train_y)
        loss.backward()
        optimizer.step()

    # Inference only: no graph is needed for the prediction.
    with torch.no_grad():
        prediction = model(val_x)

    return prediction

# Example usage (with placeholder data)
# Note: Actual data loading should be implemented as per your specific requirements
# NOTE(review): the lab context describes 5 grades per student in the range
# 1-10; this placeholder uses 3 standard-normal features instead — confirm
# which shape the real data has.
train_x = torch.randn(20, 3)  # 20 students, 3 grades each
train_y = torch.randint(0, 2, (20,))  # Binary labels (0=bad, 1=good)
val_x = torch.randn(1, 3)  # Single validation sample

# Train for 2 epochs on the placeholder data and print the model output for val_x.
prediction = correct_to_s(train_x, train_y, val_x)
print("Prediction:", prediction)