# Mathematics for Machine Learning

## Learning Objectives
- Understand calculus concepts: derivatives, gradients, partial derivatives
- Master linear algebra: vectors, matrices, operations
- Learn probability and statistics fundamentals
- Explore optimization techniques: gradient descent, cost functions

## Calculus for ML

### Derivatives
- **Definition**: Instantaneous rate of change of a function with respect to its input
- **ML Application**: Finding optimal parameters by minimizing cost functions
- **Chain Rule**: Essential for backpropagation in neural networks

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import sympy as sp

# Worked example: a quadratic and its analytic derivative sampled on [-3, 3].
x = np.linspace(-3, 3, 100)
y = x**2 + 2*x + 1  # f(x) = x² + 2x + 1
dy_dx = 2*x + 2     # f'(x) = 2x + 2

# Left panel shows the function, right panel shows its derivative.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.plot(x, y, 'b-', label='f(x) = x² + 2x + 1')
ax2.plot(x, dy_dx, 'r-', label="f'(x) = 2x + 2")

# Faint dashed reference lines through the origin on both panels.
origin_style = dict(color='k', linestyle='--', alpha=0.3)
for ax in (ax1, ax2):
    ax.axhline(y=0, **origin_style)
    ax.axvline(x=0, **origin_style)

# Mark x = -1, where the derivative crosses zero.
ax2.axvline(x=-1, color='g', linestyle=':', label='Minimum at x=-1')

# Titles, axis labels, legend and grid for each panel.
panel_text = (
    (ax1, 'Original Function', 'f(x)'),
    (ax2, 'Derivative (Slope)', "f'(x)"),
)
for ax, title, y_label in panel_text:
    ax.set_title(title)
    ax.set_xlabel('x')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Key insight: Derivative = 0 at minimum (x = -1)")

### Partial Derivatives
- **Definition**: Derivative with respect to one variable, holding others constant
- **ML Application**: Gradients in multivariable optimization

In [None]:
# Partial derivatives example on the surface f(x,y) = x² + y² + 2xy,
# sampled on a 50x50 grid over [-2, 2] x [-2, 2].
x = np.linspace(-2, 2, 50)
y = np.linspace(-2, 2, 50)
X, Y = np.meshgrid(x, y)
Z = X**2 + Y**2 + 2*X*Y

# Analytic partial derivatives evaluated on the grid.
dx = 2*X + 2*Y  # ∂f/∂x = 2x + 2y
dy = 2*Y + 2*X  # ∂f/∂y = 2y + 2x

fig = plt.figure(figsize=(15, 4))

# Panel 1: the surface itself in 3D.
ax1 = fig.add_subplot(131, projection='3d')
ax1.plot_surface(X, Y, Z, alpha=0.7, cmap='viridis')
ax1.set_title('f(x,y) = x² + y² + 2xy')
ax1.set_zlabel('f(x,y)')

# Panel 2: labelled level curves.
ax2 = fig.add_subplot(132)
contour = ax2.contour(X, Y, Z, levels=20)
ax2.clabel(contour, inline=True, fontsize=8)
ax2.set_title('Contour Plot')

# Panel 3: gradient arrows at every 5th grid point, over faint contours.
ax3 = fig.add_subplot(133)
coarse = (slice(None, None, 5), slice(None, None, 5))
ax3.quiver(X[coarse], Y[coarse], dx[coarse], dy[coarse], alpha=0.7)
ax3.contour(X, Y, Z, levels=10, alpha=0.3)
ax3.set_title('Gradient Field')

# All panels share the same x/y axis labels; only the 2D panels get a grid.
for ax in (ax1, ax2, ax3):
    ax.set_xlabel('x')
    ax.set_ylabel('y')
for ax in (ax2, ax3):
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Gradient points in direction of steepest ascent")
print("Negative gradient points toward minimum")

## Linear Algebra for ML

### Vectors and Matrices
- **Vectors**: Represent data points, features, parameters
- **Matrices**: Represent datasets, transformations, weights
- **Operations**: Dot products, matrix multiplication, eigenvalues

In [None]:
# Vector operations on two 3-component vectors.
v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])

print("Vector Operations:")
print(f"v1 = {v1}")
print(f"v2 = {v2}")
print(f"v1 + v2 = {v1 + v2}")
print(f"Dot product: v1 · v2 = {np.dot(v1, v2)}")
print(f"Magnitude of v1: ||v1|| = {np.linalg.norm(v1):.3f}")

# Matrix operations on two 2x2 matrices.
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# The determinant is computed in floating point, so for this matrix it comes
# back as -2.0000000000000004 instead of exactly -2. Round it for display,
# matching the 3-decimal formatting used for the norm above.
det_A = np.linalg.det(A)

print("\nMatrix Operations:")
print(f"Matrix A:\n{A}")
print(f"Matrix B:\n{B}")
print(f"A + B:\n{A + B}")
print(f"A @ B (matrix multiplication):\n{A @ B}")
print(f"A^T (transpose):\n{A.T}")
print(f"det(A) = {det_A:.3f}")
print(f"A^(-1) (inverse):\n{np.linalg.inv(A)}")

In [None]:
# ML application: Linear regression using matrix operations
# Model:    y = Xβ + ε
# Solution: β = (X^T X)^(-1) X^T y   (the normal equations)

# Generate sample data: y = 2x + 1 plus Gaussian noise scaled by 0.1.
np.random.seed(42)
n_samples = 100
X_data = np.random.randn(n_samples, 1)
y_data = 2 * X_data.ravel() + 1 + 0.1 * np.random.randn(n_samples)

# Add bias term (intercept) as a leading column of ones.
X_matrix = np.column_stack([np.ones(n_samples), X_data])

# Analytical solution using linear algebra. Solve (X^T X) β = X^T y as a
# linear system instead of forming the explicit inverse: it gives the same β
# mathematically but is cheaper and less sensitive to round-off.
beta = np.linalg.solve(X_matrix.T @ X_matrix, X_matrix.T @ y_data)

print("True parameters: intercept=1, slope=2")
print(f"Estimated parameters: intercept={beta[0]:.3f}, slope={beta[1]:.3f}")

# Visualization: scatter of the samples with the fitted line drawn on top.
# The line is evaluated at 100 points spanning the observed range of X_data.
x_line = np.linspace(X_data.min(), X_data.max(), 100)
y_line = beta[0] + beta[1] * x_line
fit_label = f'Fitted line: y = {beta[0]:.2f} + {beta[1]:.2f}x'

plt.figure(figsize=(10, 6))
plt.scatter(X_data, y_data, alpha=0.6, label='Data')
plt.plot(x_line, y_line, 'r-', label=fit_label)
plt.title('Linear Regression using Matrix Operations')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Probability and Statistics

### Key Concepts
- **Probability Distributions**: Normal, Bernoulli, Poisson
- **Expectation and Variance**: Central tendency and spread of a distribution
- **Bayes' Theorem**: Foundation for probabilistic ML

In [None]:
from scipy import stats

# Common probability distributions, one per panel of a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
ax_norm, ax_bern = axes[0, 0], axes[0, 1]
ax_pois, ax_clt = axes[1, 0], axes[1, 1]

# Normal distribution: standard normal density on [-4, 4].
x_norm = np.linspace(-4, 4, 100)
y_norm = stats.norm.pdf(x_norm, 0, 1)
ax_norm.plot(x_norm, y_norm, 'b-', label='μ=0, σ=1')
ax_norm.fill_between(x_norm, y_norm, alpha=0.3)
ax_norm.set_title('Normal Distribution')
ax_norm.set_xlabel('x')
ax_norm.set_ylabel('Probability Density')
ax_norm.legend()

# Bernoulli distribution: P(1) = p, P(0) = 1 - p.
x_bern = [0, 1]
p = 0.3
y_bern = [1-p, p]
ax_bern.bar(x_bern, y_bern, alpha=0.7, color='orange')
ax_bern.set_title(f'Bernoulli Distribution (p={p})')
ax_bern.set_xlabel('Outcome')
ax_bern.set_ylabel('Probability')
ax_bern.set_xticks([0, 1])

# Poisson distribution: pmf for counts 0..14.
x_pois = np.arange(0, 15)
lambda_param = 3
y_pois = stats.poisson.pmf(x_pois, lambda_param)
ax_pois.bar(x_pois, y_pois, alpha=0.7, color='green')
ax_pois.set_title(f'Poisson Distribution (λ={lambda_param})')
ax_pois.set_xlabel('Number of Events')
ax_pois.set_ylabel('Probability')

# Central Limit Theorem demonstration: means of repeated samples drawn from
# a non-normal (exponential) distribution. The constants are named so the
# "Theoretical" line printed below always matches what was simulated.
exp_scale = 2      # scale of the exponential; equals both its mean and std
sample_size = 30   # observations averaged into each sample mean
n_trials = 1000    # number of sample means collected
sample_means = [
    np.mean(np.random.exponential(exp_scale, sample_size))
    for _ in range(n_trials)
]

ax_clt.hist(sample_means, bins=30, alpha=0.7, density=True, color='purple')
ax_clt.set_title('Central Limit Theorem\n(Sample means are normal)')
ax_clt.set_xlabel('Sample Mean')
ax_clt.set_ylabel('Density')

# Every panel gets the same faint grid.
for ax in axes.flat:
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Empirical mean/std of the sample means vs. the CLT prediction:
# mean = exp_scale, std = exp_scale / sqrt(sample_size).
print(f"Sample means: μ = {np.mean(sample_means):.3f}, σ = {np.std(sample_means):.3f}")
print(f"Theoretical: μ = {exp_scale:.1f}, σ = {exp_scale/np.sqrt(sample_size):.3f}")

### Bayes' Theorem
**P(A|B) = P(B|A) × P(A) / P(B)**

- **P(A|B)**: Posterior probability
- **P(B|A)**: Likelihood
- **P(A)**: Prior probability
- **P(B)**: Evidence

In [None]:
# Bayes' Theorem example: Medical diagnosis
# Disease prevalence: 1%
# Test accuracy: 95% (both sensitivity and specificity)

P_disease = 0.01  # Prior: 1% of population has disease
P_no_disease = 1 - P_disease  # Complement of the prior

P_positive_given_disease = 0.95  # Sensitivity
P_negative_given_no_disease = 0.95  # Specificity
# False positive rate is the complement of specificity; deriving it keeps the
# two consistent if the specificity above is changed.
P_positive_given_no_disease = 1 - P_negative_given_no_disease

# Total probability of positive test (true positives + false positives)
P_positive = (P_positive_given_disease * P_disease +
              P_positive_given_no_disease * P_no_disease)

# Bayes' theorem: P(disease|positive test)
P_disease_given_positive = (P_positive_given_disease * P_disease) / P_positive

print("Bayes' Theorem Example: Medical Diagnosis")
print(f"Prior probability of disease: {P_disease:.1%}")
print(f"Test accuracy: {P_positive_given_disease:.1%}")
print(f"Probability of positive test: {P_positive:.3f}")
print(f"Probability of disease given positive test: {P_disease_given_positive:.1%}")
# The percentages are taken from the computed values so the message cannot
# drift from the numbers above.
print(f"\nKey insight: Even with {P_positive_given_disease:.0%} accurate test, "
      f"only {P_disease_given_positive:.0%} chance of having disease!")

# Visualization: prior vs. posterior probability of disease as two bars.
categories = ['Prior\n(Disease)', 'Posterior\n(Disease|Positive)']
probabilities = [P_disease, P_disease_given_positive]
bar_colors = ['lightblue', 'lightcoral']

plt.figure(figsize=(8, 5))
bars = plt.bar(categories, probabilities, color=bar_colors, alpha=0.7)
plt.title("Bayes' Theorem: Updating Beliefs with Evidence")
plt.ylabel('Probability')
plt.ylim(0, 0.2)

# Write each bar's value as a percentage, centred slightly above the bar.
label_gap = 0.005
for bar, prob in zip(bars, probabilities):
    x_center = bar.get_x() + bar.get_width()/2
    y_text = bar.get_height() + label_gap
    plt.text(x_center, y_text, f'{prob:.1%}',
             ha='center', va='bottom', fontweight='bold')

plt.grid(True, alpha=0.3)
plt.show()

## Optimization: Gradient Descent

### Cost Functions
- **Purpose**: Measure how well model fits data
- **Examples**: Mean Squared Error, Cross-entropy
- **Goal**: Minimize cost function to find optimal parameters

In [None]:
# Gradient descent implementation
def gradient_descent_1d(f, df, x_start, learning_rate=0.1, max_iter=100, tol=1e-6):
    """Minimize a one-variable function by gradient descent.

    Repeatedly applies ``x <- x - learning_rate * df(x)`` starting from
    ``x_start``.

    Args:
        f: The objective function. It is not called here; only its
            derivative ``df`` is evaluated.
        df: Callable returning the derivative of the objective at ``x``.
        x_start: Initial value of ``x``.
        learning_rate: Multiplier applied to the gradient at each step.
        max_iter: Maximum number of update steps to take.
        tol: Stop once the absolute value of the gradient used for a step
            falls below this threshold.

    Returns:
        A tuple ``(x, history)``: the final ``x`` and the list of every
        visited value, beginning with ``x_start``.
    """
    x = x_start
    history = [x]

    for _ in range(max_iter):
        gradient = df(x)
        x = x - learning_rate * gradient
        history.append(x)

        # The check uses the gradient from *before* the update, so the step
        # taken with an already-small gradient is still recorded in history.
        if abs(gradient) < tol:
            break

    return x, history

# Example objective: f(x) = x² - 4x + 4 = (x-2)²
def f(x):
    """Evaluate the quadratic x² - 4x + 4 at ``x``."""
    squared_term = x**2
    linear_term = 4*x
    return squared_term - linear_term + 4

def df(x):
    """Evaluate 2x - 4, the derivative of x² - 4x + 4, at ``x``."""
    doubled = 2*x
    return doubled - 4

# Run gradient descent on f from x = 0 with a learning rate of 0.3.
x_optimal, history = gradient_descent_1d(f, df, x_start=0, learning_rate=0.3)
n_steps = len(history) - 1  # history also holds the starting point

print(f"Optimal x: {x_optimal:.6f}")
print(f"Minimum value: {f(x_optimal):.6f}")
print(f"Iterations: {n_steps}")

# Visualization
x_plot = np.linspace(-1, 5, 100)
y_plot = f(x_plot)
path_values = list(map(f, history))  # objective value at each visited x

plt.figure(figsize=(12, 5))

# Left panel: the objective with the sequence of iterates overlaid.
ax_path = plt.subplot(1, 2, 1)
ax_path.plot(x_plot, y_plot, 'b-', label='f(x) = (x-2)²')
ax_path.plot(history, path_values, 'ro-', alpha=0.7, label='Gradient descent path')
ax_path.plot(2, 0, 'g*', markersize=15, label='True minimum')
ax_path.set_xlabel('x')
ax_path.set_ylabel('f(x)')
ax_path.set_title('Gradient Descent Optimization')
ax_path.legend()
ax_path.grid(True, alpha=0.3)

# Right panel: x per iteration against the known minimizer x = 2.
ax_conv = plt.subplot(1, 2, 2)
ax_conv.plot(range(len(history)), history, 'o-')
ax_conv.axhline(y=2, color='g', linestyle='--', label='True minimum')
ax_conv.set_xlabel('Iteration')
ax_conv.set_ylabel('x value')
ax_conv.set_title('Convergence to Minimum')
ax_conv.legend()
ax_conv.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Batch gradient descent for linear regression (any number of features)
def gradient_descent_linear_regression(X, y, learning_rate=0.01, max_iter=1000, tol=1e-8):
    """Fit linear-regression parameters by batch gradient descent.

    Minimizes ``J(theta) = (1 / (2*m)) * sum((X @ theta - y)**2)``, i.e. half
    the mean squared error, starting from ``theta = 0``.

    Args:
        X: 2-D design matrix with one row per sample and one column per
            parameter (include a column of ones to fit an intercept).
        y: Target values, one per row of ``X``.
        learning_rate: Multiplier applied to the gradient at each step.
        max_iter: Maximum number of update steps to take.
        tol: Stop once the absolute change in cost between two consecutive
            iterations falls below this threshold.

    Returns:
        A tuple ``(theta, cost_history)``. ``cost_history[i]`` is the cost
        evaluated *before* update ``i``; the returned ``theta`` already
        includes the final update, so it is one step past the last recorded
        cost.
    """
    m, n = X.shape
    theta = np.zeros(n)  # Initialize parameters
    cost_history = []

    for _ in range(max_iter):
        # Residuals of the current predictions
        residuals = X @ theta - y

        # Cost (half mean squared error)
        cost = (1/(2*m)) * np.sum(residuals**2)
        cost_history.append(cost)

        # Gradient of the cost with respect to theta
        gradients = (1/m) * X.T @ residuals

        # Update parameters
        theta = theta - learning_rate * gradients

        # Check convergence (needs at least two recorded costs to compare)
        if len(cost_history) > 1 and abs(cost_history[-2] - cost) < tol:
            break

    return theta, cost_history

# Generate data: y = 4 + 3x plus standard-normal noise.
np.random.seed(42)
m = 100
X_raw = np.random.randn(m, 1)
y = 4 + 3 * X_raw.ravel() + np.random.randn(m)
X = np.column_stack([np.ones(m), X_raw])  # Add bias term

# Run gradient descent
theta_gd, cost_history = gradient_descent_linear_regression(X, y, learning_rate=0.01)

print("True parameters: intercept=4, slope=3")
print(f"Gradient descent: intercept={theta_gd[0]:.3f}, slope={theta_gd[1]:.3f}")

# Compare with analytical solution. The normal equations (X^T X) θ = X^T y
# are solved as a linear system rather than by forming the explicit inverse,
# which is cheaper and less sensitive to round-off.
theta_analytical = np.linalg.solve(X.T @ X, X.T @ y)
print(f"Analytical solution: intercept={theta_analytical[0]:.3f}, slope={theta_analytical[1]:.3f}")

# Visualization: both fitted lines over the data (left) and the recorded
# cost per iteration (right).
x_line = np.linspace(X_raw.min(), X_raw.max(), 100)
y_line_gd = theta_gd[0] + theta_gd[1] * x_line
y_line_analytical = theta_analytical[0] + theta_analytical[1] * x_line

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Data and fitted lines
ax1.scatter(X_raw, y, alpha=0.6, label='Data')
fitted_lines = (
    (y_line_gd, 'r-', 'Gradient Descent'),
    (y_line_analytical, 'g--', 'Analytical Solution'),
)
for line_values, line_format, line_label in fitted_lines:
    ax1.plot(x_line, line_values, line_format, label=line_label)
ax1.set_title('Linear Regression: Gradient Descent vs Analytical')
ax1.set_xlabel('X')
ax1.set_ylabel('y')
ax1.legend()

# Cost function convergence
ax2.plot(cost_history)
ax2.set_title('Cost Function Convergence')
ax2.set_xlabel('Iteration')
ax2.set_ylabel('Cost (MSE)')

for ax in (ax1, ax2):
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Last recorded cost and how many costs were recorded.
final_cost = cost_history[-1]
n_recorded = len(cost_history)
print(f"Final cost: {final_cost:.6f}")
print(f"Iterations to convergence: {n_recorded}")

## Summary

### Calculus
- **Derivatives**: Find candidate optimal points by setting the derivative to zero
- **Partial derivatives**: Handle multivariable functions
- **Chain rule**: Essential for neural network backpropagation

### Linear Algebra
- **Vectors and matrices**: Represent data and transformations
- **Matrix operations**: Efficient computation for ML algorithms
- **Eigenvalues/eigenvectors**: PCA, dimensionality reduction

### Probability and Statistics
- **Distributions**: Model uncertainty and variability
- **Bayes' theorem**: Update beliefs with new evidence
- **Central limit theorem**: Foundation for statistical inference

### Optimization
- **Gradient descent**: Iterative optimization algorithm
- **Cost functions**: Objective functions to minimize
- **Learning rate**: Controls convergence speed and stability

## Next Steps
- Apply these concepts to deep learning frameworks
- Explore advanced optimization techniques (Adam, RMSprop)
- Learn about regularization and constrained optimization