# Derivatives and Differentiation

[![Python](https://img.shields.io/badge/Python-3.8+-blue.svg)](https://www.python.org/)
[![NumPy](https://img.shields.io/badge/NumPy-1.21+-green.svg)](https://numpy.org/)
[![Matplotlib](https://img.shields.io/badge/Matplotlib-3.5+-orange.svg)](https://matplotlib.org/)
[![SymPy](https://img.shields.io/badge/SymPy-1.10+-purple.svg)](https://www.sympy.org/)

## Introduction

Derivatives are the cornerstone of calculus and are essential for understanding how functions change. In machine learning and data science, derivatives are used extensively for optimization, gradient descent, and understanding model behavior.

### Why Derivatives Matter in AI/ML

1. **Gradient Descent**: The most fundamental optimization algorithm in machine learning
2. **Backpropagation**: Computing gradients through neural networks
3. **Model Training**: Understanding how parameters affect the loss function
4. **Feature Importance**: Understanding how input changes affect output
5. **Optimization**: Finding minima and maxima of functions

### Mathematical Definition

The derivative of a function f(x) at a point x = a is defined as the following limit, provided the limit exists:

$$f'(a) = \lim_{h \to 0} \frac{f(a + h) - f(a)}{h}$$

This represents the instantaneous rate of change of the function at that point.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sympy as sp
from sympy import symbols, diff, limit

# scipy.misc.derivative was deprecated in SciPy 1.10 and removed in 1.12.
# Nothing in this notebook calls it, so a missing symbol must not abort setup.
try:
    from scipy.misc import derivative
except ImportError:
    derivative = None

# Set up plotting style.  The seaborn styles were renamed to 'seaborn-v0_8'
# in Matplotlib 3.6; fall back to the old name on older releases.
_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_style)
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Use Matplotlib's built-in text rendering.  usetex=True requires an external
# LaTeX toolchain, and the plot labels in this notebook contain raw Unicode
# characters (e.g. '²', '∂') that a default LaTeX setup may refuse to typeset.
plt.rcParams['text.usetex'] = False

## 2.1 Understanding the Derivative

The derivative represents the slope of the tangent line to a function at a given point. Let's visualize this concept.

In [None]:
# Visualizing the derivative as the slope of the tangent line
def f(x):
    """Return the square of ``x`` (the example parabola f(x) = x²)."""
    squared = x ** 2
    return squared

def f_prime(x):
    """Return 2x, the analytical derivative of the parabola x²."""
    slope = 2 * x
    return slope

def tangent_line(x, x0, y0, slope):
    """Evaluate at ``x`` the straight line through (x0, y0) with the given slope."""
    offset = x - x0
    return slope * offset + y0

# Create visualization: sample f on a dense grid over [-3, 3]
x_vals = np.linspace(-3, 3, 1000)
y_vals = f(x_vals)

# Points where we'll draw tangent lines
tangent_points = [-2, -1, 0, 1, 2]

plt.figure(figsize=(15, 10))

# Main plot with function and tangent lines (top-left panel of a 2x2 grid)
plt.subplot(2, 2, 1)
plt.plot(x_vals, y_vals, 'b-', linewidth=3, label='f(x) = x²')

# One colour per entry of tangent_points
colors = ['red', 'orange', 'green', 'purple', 'brown']
for i, x0 in enumerate(tangent_points):
    y0 = f(x0)
    slope = f_prime(x0)
    
    # Draw tangent line as a short segment, one unit either side of x0
    x_tangent = np.linspace(x0 - 1, x0 + 1, 100)
    y_tangent = tangent_line(x_tangent, x0, y0, slope)
    
    plt.plot(x_tangent, y_tangent, color=colors[i], linestyle='--', linewidth=2, 
             label=f'Tangent at x={x0}, slope={slope}')
    # Mark the point of tangency above the curves (zorder=5)
    plt.scatter(x0, y0, color=colors[i], s=100, zorder=5)

plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('Derivative as Slope of Tangent Line')
plt.legend()
plt.grid(True, alpha=0.3)

# Derivative function (top-right panel)
plt.subplot(2, 2, 2)
y_prime_vals = f_prime(x_vals)
plt.plot(x_vals, y_prime_vals, 'r-', linewidth=3, label="f'(x) = 2x")
plt.xlabel('x')
plt.ylabel("f'(x)")
plt.title('Derivative Function')
plt.legend()
plt.grid(True, alpha=0.3)

# Secant lines approaching tangent (bottom-left panel), all anchored at x0 = 1
plt.subplot(2, 2, 3)
x0 = 1
y0 = f(x0)
plt.plot(x_vals, y_vals, 'b-', linewidth=3, label='f(x) = x²')
plt.scatter(x0, y0, color='red', s=100, zorder=5, label=f'Point ({x0}, {y0})')

# Draw secant lines with different h values (decreasing step sizes)
h_values = [0.5, 0.2, 0.1, 0.05]
colors_secant = ['orange', 'green', 'purple', 'brown']

for i, h in enumerate(h_values):
    # Second point of the secant and the forward-difference slope between the two
    x1 = x0 + h
    y1 = f(x1)
    slope = (y1 - y0) / h
    
    # tangent_line is reused here: it evaluates any line through (x0, y0)
    # with the given slope, which is exactly the secant through both points
    x_secant = np.linspace(x0 - 0.5, x0 + 0.5, 100)
    y_secant = tangent_line(x_secant, x0, y0, slope)
    
    plt.plot(x_secant, y_secant, color=colors_secant[i], linestyle='--', linewidth=2,
             label=f'Secant h={h}, slope={slope:.2f}')
    plt.scatter(x1, y1, color=colors_secant[i], s=50, zorder=5)

plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('Secant Lines Approaching Tangent')
plt.legend()
plt.grid(True, alpha=0.3)
# Zoom in around the anchor point so the individual secants can be told apart
plt.xlim(0.5, 1.5)
plt.ylim(0.5, 2.5)

# Numerical approximation of derivative (bottom-right panel)
plt.subplot(2, 2, 4)
# 100 step sizes spaced logarithmically between 1e-5 and 1
h_values = np.logspace(-5, 0, 100)
x0 = 1
exact_derivative = f_prime(x0)

# Absolute error of the forward difference (f(x0 + h) - f(x0)) / h,
# evaluated for every step size at once
forward_differences = (f(x0 + h_values) - f(x0)) / h_values
numerical_derivatives = np.abs(forward_differences - exact_derivative)

plt.loglog(h_values, numerical_derivatives, 'b-', linewidth=2, label='Error')
# Reference line at the float64 machine epsilon (about 2.2e-16), which is
# what the 'Machine precision' label actually refers to
plt.axhline(y=np.finfo(float).eps, color='r', linestyle='--', label='Machine precision')
plt.xlabel('h (step size)')
plt.ylabel('Absolute Error')
plt.title('Numerical Derivative Error vs Step Size')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2.2 Basic Differentiation Rules

Understanding the fundamental rules of differentiation is crucial for computing derivatives efficiently.

In [None]:
# Demonstrate basic differentiation rules using SymPy
x = sp.Symbol('x')

# Power rule
print("=== Power Rule ===")
# sp.sqrt keeps the exponent an exact rational, so the derivative is printed
# symbolically instead of with floating-point powers and coefficients
functions = [x**2, x**3, sp.sqrt(x), 1/x, 1/x**2]
for func in functions:
    print(f"d/dx({func}) = {sp.diff(func, x)}")

print("\n=== Constant Multiple Rule ===")
# sp.Rational(1, 2) is the exact one-half; a float 0.5 would leak into the output
constants = [2, 3, -1, sp.Rational(1, 2)]
for c in constants:
    print(f"d/dx({c} * x²) = {sp.diff(c * x**2, x)}")

print("\n=== Sum Rule ===")
sum_func = x**2 + 3*x + 1
print(f"d/dx({sum_func}) = {sp.diff(sum_func, x)}")

print("\n=== Product Rule ===")
product_func = x**2 * sp.sin(x)
print(f"d/dx({product_func}) = {sp.diff(product_func, x)}")

print("\n=== Quotient Rule ===")
quotient_func = x**2 / (x + 1)
print(f"d/dx({quotient_func}) = {sp.diff(quotient_func, x)}")

print("\n=== Chain Rule ===")
chain_func = sp.sin(x**2)
print(f"d/dx({chain_func}) = {sp.diff(chain_func, x)}")

## 2.3 Chain Rule and Its Importance in ML

The chain rule is fundamental to backpropagation in neural networks. It allows us to compute derivatives of composite functions.

In [None]:
# Chain rule visualization and application
def simple_chain_function(x):
    """Composite example f(x) = sin(x²): square first, then take the sine."""
    inner = x**2
    return np.sin(inner)

def chain_derivative(x):
    """Derivative of sin(x²) via the chain rule: f'(x) = 2x * cos(x²)."""
    inner_slope = 2 * x           # d/dx of the inner function x²
    outer_slope = np.cos(x**2)    # d/du of sin(u), evaluated at u = x²
    return inner_slope * outer_slope

# Visualize the chain rule: sample the function and its derivative on one grid
x_vals = np.linspace(-3, 3, 1000)
y_vals = simple_chain_function(x_vals)
y_prime_vals = chain_derivative(x_vals)

# Two side-by-side panels: function on the left, derivative on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Original function
ax1.plot(x_vals, y_vals, 'b-', linewidth=3, label='f(x) = sin(x²)')
ax1.set_xlabel('x')
ax1.set_ylabel('f(x)')
ax1.set_title('Composite Function')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Derivative
ax2.plot(x_vals, y_prime_vals, 'r-', linewidth=3, label="f'(x) = 2x * cos(x²)")
ax2.set_xlabel('x')
ax2.set_ylabel("f'(x)")
ax2.set_title('Derivative by Chain Rule')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Demonstrate chain rule step by step
print("Chain Rule Step-by-Step:")
print("f(x) = sin(x²)")
print("Let u = x², then f(x) = sin(u)")
# The outer function is differentiated with respect to u, not x
print("By chain rule: f'(x) = d/du[sin(u)] * d/dx[x²]")
print("f'(x) = cos(u) * 2x = cos(x²) * 2x = 2x * cos(x²)")

# Verify with SymPy
x = sp.Symbol('x')
chain_func = sp.sin(x**2)
symbolic_derivative = sp.diff(chain_func, x)
print(f"\nSymPy verification: d/dx(sin(x²)) = {symbolic_derivative}")

## 2.4 Partial Derivatives and Gradients

In machine learning, we often work with functions of multiple variables. Partial derivatives and gradients are essential for multivariate optimization.

In [None]:
# Partial derivatives and gradients
from mpl_toolkits.mplot3d import Axes3D

def multivariate_function(x, y):
    """Two-variable paraboloid f(x, y) = x² + y²."""
    x_term = x**2
    y_term = y**2
    return x_term + y_term

def gradient_x(x, y):
    """Partial derivative of x² + y² with respect to x: ∂f/∂x = 2x."""
    # y does not enter the result; it is accepted so both partials share a signature
    partial = 2 * x
    return partial

def gradient_y(x, y):
    """Partial derivative of x² + y² with respect to y: ∂f/∂y = 2y."""
    # x does not enter the result; it is accepted so both partials share a signature
    partial = 2 * y
    return partial

# Create 3D visualization on a 50 x 50 grid over [-3, 3] x [-3, 3]
# NOTE: x and y are bound to NumPy arrays here, replacing any earlier binding
x = np.linspace(-3, 3, 50)
y = np.linspace(-3, 3, 50)
X, Y = np.meshgrid(x, y)
Z = multivariate_function(X, Y)

fig = plt.figure(figsize=(15, 10))

# 3D surface plot
ax1 = fig.add_subplot(2, 2, 1, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('Multivariate Function: f(x,y) = x² + y²')

# Contour plot with gradient vectors
ax2 = fig.add_subplot(2, 2, 2)
contour = ax2.contour(X, Y, Z, levels=20, cmap='viridis')
ax2.clabel(contour, inline=True, fontsize=8)

# Add gradient vectors at selected points
sample_points = [(-2, -2), (-1, -1), (0, 0), (1, 1), (2, 2)]
for x0, y0 in sample_points:
    grad_x = gradient_x(x0, y0)
    grad_y = gradient_y(x0, y0)
    # Arrow components are the gradient scaled by 0.1 to keep arrows short;
    # at (0, 0) the gradient is zero, so that arrow has no length
    ax2.arrow(x0, y0, grad_x * 0.1, grad_y * 0.1, 
              head_width=0.1, head_length=0.1, fc='red', ec='red', alpha=0.7)
    ax2.scatter(x0, y0, color='red', s=50, zorder=5)

ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Contour Plot with Gradient Vectors')
ax2.grid(True, alpha=0.3)

# Partial derivative with respect to x, evaluated over the whole grid
ax3 = fig.add_subplot(2, 2, 3)
Z_dx = gradient_x(X, Y)
contour_dx = ax3.contour(X, Y, Z_dx, levels=20, cmap='Reds')
ax3.clabel(contour_dx, inline=True, fontsize=8)
ax3.set_xlabel('x')
ax3.set_ylabel('y')
ax3.set_title('Partial Derivative ∂f/∂x = 2x')
ax3.grid(True, alpha=0.3)

# Partial derivative with respect to y, evaluated over the whole grid
ax4 = fig.add_subplot(2, 2, 4)
Z_dy = gradient_y(X, Y)
contour_dy = ax4.contour(X, Y, Z_dy, levels=20, cmap='Blues')
ax4.clabel(contour_dy, inline=True, fontsize=8)
ax4.set_xlabel('x')
ax4.set_ylabel('y')
ax4.set_title('Partial Derivative ∂f/∂y = 2y')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Verify with SymPy
x, y = sp.symbols('x y')
# Bound to f_expr rather than f so the Python function f(x) defined earlier
# in the notebook stays callable after this cell runs
f_expr = x**2 + y**2
# Differentiate once per variable and reuse the results in the printout
df_dx = sp.diff(f_expr, x)
df_dy = sp.diff(f_expr, y)
print("Partial derivatives:")
print(f"∂f/∂x = {df_dx}")
print(f"∂f/∂y = {df_dy}")
print(f"Gradient ∇f = [{df_dx}, {df_dy}]")

## 2.5 Gradient Descent: The Foundation of ML Optimization

Gradient descent is the most fundamental optimization algorithm in machine learning. It repeatedly steps in the direction of the negative derivative (gradient) to approach a local minimum of a function.

In [None]:
# Gradient descent implementation
def simple_function(x):
    """Quadratic objective f(x) = x² + 2x + 1."""
    quadratic = x**2
    linear = 2*x
    return quadratic + linear + 1

def derivative_function(x):
    """Derivative of x² + 2x + 1, i.e. f'(x) = 2x + 2."""
    slope = 2*x + 2
    return slope

def gradient_descent(start_x, learning_rate, num_iterations, func=None, grad_func=None):
    """Minimise a 1D function with fixed-step gradient descent.

    Args:
        start_x: Initial value of x.
        learning_rate: Step size that multiplies the gradient in every update.
        num_iterations: Number of update steps to perform.
        func: Objective used to record f(x) in the history. Defaults to
            ``simple_function``.
        grad_func: Derivative of the objective. Defaults to
            ``derivative_function``.

    Returns:
        A tuple ``(x, history)``. ``x`` is the value after the last update.
        ``history`` is a list of ``(x, f(x), f'(x))`` tuples recorded *before*
        each update, so it has ``num_iterations`` entries and does not
        contain the returned final ``x``.
    """
    # Resolve defaults at call time so the original three-argument call
    # keeps optimising simple_function
    if func is None:
        func = simple_function
    if grad_func is None:
        grad_func = derivative_function

    x = start_x
    history = []

    for _ in range(num_iterations):
        grad = grad_func(x)
        # Record the state the gradient was evaluated at, then step downhill
        history.append((x, func(x), grad))
        x = x - learning_rate * grad

    return x, history

# Run gradient descent from x = 5 with a fixed step size for 20 updates
start_x, learning_rate, num_iterations = 5.0, 0.1, 20

optimal_x, history = gradient_descent(start_x, learning_rate, num_iterations)

# Report the result next to the known analytical minimum of x² + 2x + 1
print(f"Starting point: x = {start_x}")
print(f"Optimal point: x = {optimal_x:.6f}")
print(f"Optimal value: f(x) = {simple_function(optimal_x):.6f}")
print("Analytical minimum: x = -1, f(-1) = 0")

In [None]:
# Visualize gradient descent: objective sampled over [-2, 6]
x_vals = np.linspace(-2, 6, 1000)
y_vals = simple_function(x_vals)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Function and optimization path
ax1.plot(x_vals, y_vals, 'b-', linewidth=3, label='f(x) = x² + 2x + 1')
ax1.axvline(x=-1, color='g', linestyle='--', linewidth=2, label='Analytical minimum')

# Plot optimization path; each history entry is (x, f(x), gradient)
x_history = [h[0] for h in history]
y_history = [h[1] for h in history]
ax1.plot(x_history, y_history, 'ro-', linewidth=2, markersize=8, label='Gradient descent path')
# Highlight the first and last recorded iterates
ax1.scatter(x_history[0], y_history[0], color='red', s=200, zorder=5, label='Start')
ax1.scatter(x_history[-1], y_history[-1], color='green', s=200, zorder=5, label='End')

ax1.set_xlabel('x')
ax1.set_ylabel('f(x)')
ax1.set_title('Gradient Descent Optimization')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Convergence plot
iterations = range(len(history))
function_values = [h[1] for h in history]
# NOTE(review): gradients is computed but never plotted or printed below
gradients = [abs(h[2]) for h in history]

ax2.plot(iterations, function_values, 'b-', linewidth=2, label='Function value')
ax2.set_xlabel('Iteration')
ax2.set_ylabel('f(x)')
ax2.set_title('Convergence of Function Value')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print optimization history
print("\nOptimization History:")
print("Iteration | x | f(x) | f'(x)")
print("-" * 35)
# NOTE(review): this loop rebinds the module-level name x on every iteration
for i, (x, fx, grad) in enumerate(history):
    print(f"{i:9d} | {x:6.4f} | {fx:6.4f} | {grad:6.4f}")

## 2.6 Higher-Order Derivatives

Higher-order derivatives provide information about the curvature and behavior of functions. They are important for optimization algorithms like Newton's method.

In [None]:
# Higher-order derivatives
def polynomial_function(x):
    """Cubic example f(x) = x³ - 3x² + 2x."""
    cubic = x**3
    quadratic = 3*x**2
    linear = 2*x
    return cubic - quadratic + linear

def first_derivative(x):
    """First derivative of the cubic: f'(x) = 3x² - 6x + 2."""
    quadratic = 3*x**2
    linear = 6*x
    return quadratic - linear + 2

def second_derivative(x):
    """Second derivative of the cubic: f''(x) = 6x - 6."""
    curvature = 6*x - 6
    return curvature

def third_derivative(x):
    """f'''(x) = 6

    The value is constant, but for array input the result is an array with
    the same shape as ``x`` so that it can be plotted against ``x``.
    Scalar input still returns the plain constant 6.
    """
    if np.ndim(x) == 0:
        return 6
    # Broadcast the constant to the input's shape
    return np.full(np.shape(x), 6.0)

# Visualize higher-order derivatives: evaluate f and its first three
# derivatives on a common grid over [-1, 3]
x_vals = np.linspace(-1, 3, 1000)
y_vals = polynomial_function(x_vals)
y_prime_vals = first_derivative(x_vals)
y_double_prime_vals = second_derivative(x_vals)
y_triple_prime_vals = third_derivative(x_vals)

# 2x2 grid: one panel per derivative order
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Original function
ax1.plot(x_vals, y_vals, 'b-', linewidth=3, label='f(x) = x³ - 3x² + 2x')
ax1.set_xlabel('x')
ax1.set_ylabel('f(x)')
ax1.set_title('Original Function')
ax1.legend()
ax1.grid(True, alpha=0.3)

# First derivative; the dashed zero line shows where f'(x) crosses 0
ax2.plot(x_vals, y_prime_vals, 'r-', linewidth=3, label="f'(x) = 3x² - 6x + 2")
ax2.axhline(y=0, color='k', linestyle='--', alpha=0.5)
ax2.set_xlabel('x')
ax2.set_ylabel("f'(x)")
ax2.set_title('First Derivative (Critical Points)')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Second derivative; the dashed zero line shows where the sign of f''(x) changes
ax3.plot(x_vals, y_double_prime_vals, 'g-', linewidth=3, label="f''(x) = 6x - 6")
ax3.axhline(y=0, color='k', linestyle='--', alpha=0.5)
ax3.set_xlabel('x')
ax3.set_ylabel("f''(x)")
ax3.set_title('Second Derivative (Concavity)')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Third derivative
# NOTE(review): plot() needs y_triple_prime_vals to have the same length as
# x_vals — confirm third_derivative returns an array for array input
ax4.plot(x_vals, y_triple_prime_vals, 'm-', linewidth=3, label="f'''(x) = 6")
ax4.set_xlabel('x')
ax4.set_ylabel("f'''(x)")
ax4.set_title('Third Derivative (Constant)')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Find critical points and analyze them
from scipy.optimize import fsolve

def find_critical_points(derivative_func=None, starts=(0, 1, 2), tol=1e-10):
    """Find where f'(x) = 0 by running a root finder from several starts.

    Args:
        derivative_func: Callable whose roots are sought. Defaults to
            ``first_derivative``.
        starts: Initial guesses handed to ``fsolve``, one solve per guess.
        tol: A candidate is kept only if ``|derivative_func(candidate)|``
            is below this value.

    Returns:
        Sorted list of distinct roots, each rounded to 6 decimals so that
        different starts converging to the same root are merged.
    """
    if derivative_func is None:
        derivative_func = first_derivative

    roots = set()
    for start in starts:
        try:
            candidate = float(fsolve(derivative_func, start)[0])
        except Exception:
            # Best effort: a start the solver cannot handle is skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed
            continue
        # fsolve returns its last iterate even without convergence, so
        # verify the residual before accepting the candidate
        if abs(derivative_func(candidate)) < tol:
            roots.add(round(candidate, 6))

    # Sorted so the output order is deterministic
    return sorted(roots)

critical_points = find_critical_points()
print("Critical points (where f'(x) = 0):")
for point in critical_points:
    f_val = polynomial_function(point)
    f_double_prime = second_derivative(point)
    # Second-derivative test: the sign of f'' classifies the point. When
    # f'' is zero the test is inconclusive; "saddle point" is not a
    # meaningful label for a function of one variable.
    if f_double_prime > 0:
        point_type = "Local minimum"
    elif f_double_prime < 0:
        point_type = "Local maximum"
    else:
        point_type = "Inconclusive (f''(x) = 0)"
    print(f"x = {point:.4f}: f(x) = {f_val:.4f}, f''(x) = {f_double_prime:.4f} ({point_type})")

## 2.7 Applications in Machine Learning

Derivatives are fundamental to many machine learning algorithms and concepts.

In [None]:
# Machine learning applications of derivatives
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd

# Generate synthetic data for linear regression
# Fixed seed so the random draws below are reproducible
np.random.seed(42)
# 100 samples with a single feature: standard-normal draws scaled by 2
X = np.random.randn(100, 1) * 2
# Linear ground truth (slope 3, intercept 2) plus Gaussian noise scaled by 0.5
y = 3 * X + 2 + np.random.randn(100, 1) * 0.5

# Manual gradient descent for linear regression
def linear_regression_gradient_descent(X, y, learning_rate=0.01, epochs=1000):
    """Fit y ≈ X @ w + b by batch gradient descent on the mean squared error.

    Args:
        X: Feature matrix of shape (n_samples, n_features).
        y: Targets of shape (n_samples,) or (n_samples, 1).
        learning_rate: Step size applied to both gradients.
        epochs: Number of full-batch update steps.

    Returns:
        A tuple ``(w, b, history)``. ``w`` has shape (n_features, 1) and
        ``b`` shape (1, 1). ``history`` holds one
        ``(epoch, w[0, 0], b[0, 0], mse)`` tuple every 100 epochs, where
        ``mse`` is the loss of exactly the recorded parameters.
    """
    X = np.asarray(X, dtype=float)
    # Force a column vector: a 1-D y would broadcast (n, 1) - (n,) into an
    # (n, n) matrix and silently corrupt the gradients
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    n_samples, n_features = X.shape

    # Initialize parameters (one weight per feature)
    w = np.random.randn(n_features, 1)
    b = np.random.randn(1, 1)

    # Store history
    history = []

    for epoch in range(epochs):
        # Forward pass: prediction error for the current parameters
        residual = X @ w + b - y

        # Store history every 100 epochs, before the update, so the recorded
        # parameters and the recorded loss belong together
        if epoch % 100 == 0:
            mse = float(np.mean(residual ** 2))
            history.append((epoch, w[0, 0], b[0, 0], mse))

        # Gradients of mean((X @ w + b - y)²) with respect to w and b
        dw = (2 / n_samples) * X.T @ residual
        db = (2 / n_samples) * np.sum(residual)

        # Update parameters in place
        w -= learning_rate * dw
        b -= learning_rate * db

    return w, b, history

# Run gradient descent with the function's default learning rate and epochs
w_gd, b_gd, history = linear_regression_gradient_descent(X, y)

# Compare with sklearn's LinearRegression fitted on the same data
lr = LinearRegression()
lr.fit(X, y)
# The indexing below assumes coef_ is 2-D and intercept_ is 1-D, as
# happens when y is passed as a 2-D column
w_sklearn = lr.coef_[0, 0]
b_sklearn = lr.intercept_[0]

print("Linear Regression Results:")
print(f"Gradient Descent: w = {w_gd[0,0]:.4f}, b = {b_gd[0,0]:.4f}")
print(f"Sklearn: w = {w_sklearn:.4f}, b = {b_sklearn:.4f}")
# Absolute gap between the two fits, per parameter
print(f"Difference: w = {abs(w_gd[0,0] - w_sklearn):.6f}, b = {abs(b_gd[0,0] - b_sklearn):.6f}")

In [None]:
# Visualize the optimization process
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Data and fitted lines
ax1.scatter(X, y, alpha=0.6, label='Data')

# Plot fitted lines over the observed range of X (as a column vector)
X_line = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_gd = X_line * w_gd[0,0] + b_gd[0,0]
y_sklearn = X_line * w_sklearn + b_sklearn

ax1.plot(X_line, y_gd, 'r-', linewidth=2, label=f'Gradient Descent: y = {w_gd[0,0]:.3f}x + {b_gd[0,0]:.3f}')
ax1.plot(X_line, y_sklearn, 'g--', linewidth=2, label=f'Sklearn: y = {w_sklearn:.3f}x + {b_sklearn:.3f}')

ax1.set_xlabel('X')
ax1.set_ylabel('y')
ax1.set_title('Linear Regression Comparison')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Convergence plot; history rows are (epoch, w, b, mse), transposed here
# into one tuple per column
epochs, ws, bs, mses = zip(*history)
ax2.plot(epochs, mses, 'b-', linewidth=2, label='MSE Loss')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Mean Squared Error')
ax2.set_title('Loss Convergence')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print optimization history
print("\nOptimization History:")
print("Epoch | Weight | Bias | MSE")
print("-" * 35)
# NOTE(review): this loop rebinds the module-level names epoch, w, b and mse
for epoch, w, b, mse in history:
    print(f"{epoch:5d} | {w:6.4f} | {b:6.4f} | {mse:.6f}")