# ──────────────────────────────
# Day 4 — Chain Rule & Gradient Descent
# ──────────────────────────────

"""
# Day 4 — Chain Rule & Gradient Descent

**Author:** Dhairya Patel  

This notebook covers:
1. Chain Rule examples
2. Gradients on nested functions
3. Gradient Descent on Linear Regression cost function
4. Visualization of cost minimization
"""


In [None]:
import sympy as sp
import numpy as np
import matplotlib.pyplot as plt


In [None]:
"""
## 1) Chain Rule Examples
If y = f(g(x)), then dy/dx = f'(g(x)) * g'(x).
"""


In [None]:
# Chain-rule demo: differentiate the composite (3x^2 + 2x)^5 symbolically.
x = sp.symbols('x')

# Inner function g(x) and outer function f(g) = g^5
inner = 3*x**2 + 2*x
outer = inner**5

# SymPy applies the chain rule internally when differentiating the composite
derivative = outer.diff(x)

# Final bare expression so the notebook displays both the function and its derivative
(outer, derivative)


In [None]:
"""
## 2) Gradient of a Nested Function
Let's compute the gradient of z = (x^2 + y^2)^3, i.e. the partial derivatives ∂z/∂x and ∂z/∂y.
"""


In [None]:
# Symbolic gradient of f(x, y) = (x^2 + y^2)^3.
x, y = sp.symbols('x y')
f = (x**2 + y**2)**3

# One partial derivative per variable, in the order (x, y)
df_dx, df_dy = [f.diff(variable) for variable in (x, y)]

# Final bare expression so the notebook displays the gradient components
[df_dx, df_dy]


In [None]:
"""
## 3) Gradient Descent on Linear Regression Cost

We minimize the mean squared error (MSE): J(m, b) = (1/n) * Σ (y_i - (m*x_i + b))^2
"""


In [None]:
# Training data: five points that lie exactly on the line y = 2x
X = np.array([1, 2, 3, 4, 5])
Y = np.array([2, 4, 6, 8, 10])

# Hyperparameters
lr = 0.01
epochs = 1000
n = len(X)

# Both parameters start at zero
m, b = 0.0, 0.0

# Constant factors of the MSE and of its gradient, computed once
mse_scale = 1/n
grad_scale = 2/n

# Cost history: one entry per epoch, recorded before that epoch's update
costs = []

for epoch in range(epochs):
    # Residuals of the current line m*X + b against the targets
    y_pred = m*X + b
    error = y_pred - Y

    # Mean squared error for the current parameters
    cost = mse_scale * (error**2).sum()
    costs.append(cost)

    # Partial derivatives of the MSE with respect to m and b
    dm = grad_scale * (error * X).sum()
    db = grad_scale * error.sum()

    # Step both parameters against their gradients, scaled by the learning rate
    m, b = m - lr * dm, b - lr * db

# Final bare expression so the notebook displays the fitted parameters
m, b


In [None]:
# Plot the values stored in `costs` as a line; the x-axis is the list index.
plt.plot(costs)

# Set the title and axis labels on the axes that plt.plot just drew on
plt.gca().set(
    title="Cost Function Decreasing with Gradient Descent",
    xlabel="Iterations",
    ylabel="MSE Cost",
)

plt.show()


In [None]:
"""
---

### Notes
- The Chain Rule is the backbone of backpropagation in neural networks.
- Gradient Descent allows us to iteratively minimize a cost function.
- Learning rate tuning is key to convergence.

**End of Day 4.**
"""
