In [1]:
import numpy as np

In [2]:
class Loss:
    """Loss functions and their gradients with respect to the predictions.

    Every method is static and accepts Python scalars, sequences, or NumPy
    arrays. Inputs are converted to float64 arrays and combined with NumPy
    broadcasting. Each ``*_derivative`` method returns the gradient of the
    matching mean-reduced loss with respect to ``y_pred``.
    """

    @staticmethod
    def _clip_probabilities(y_pred, eps):
        """Return ``y_pred`` as float64, clipped to ``[eps, 1 - eps]``.

        Raises:
            ValueError: if ``eps`` is not strictly between 0 and 0.5.
        """
        if not 0 < eps < 0.5:
            raise ValueError(f"eps must be in (0, 0.5), got {eps!r}")
        # float64 is required here: in float32, 1 - 1e-15 rounds to exactly
        # 1.0, so the clip would no longer keep log(1 - y_pred) finite.
        return np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)

    # --- 1. Mean Squared Error (for Regression) ---
    @staticmethod
    def mse(y_true, y_pred):
        """Return the mean squared difference between targets and predictions.

        Args:
            y_true: target value(s); scalar, sequence, or array.
            y_pred: predicted value(s); must broadcast against ``y_true``.

        Returns:
            The mean of ``(y_true - y_pred) ** 2`` over all elements.
        """
        diff = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
        return np.mean(np.power(diff, 2))

    @staticmethod
    def mse_derivative(y_true, y_pred):
        """Return the gradient of :meth:`mse` with respect to ``y_pred``.

        Args:
            y_true: target value(s); scalar, sequence, or array.
            y_pred: predicted value(s); must broadcast against ``y_true``.

        Returns:
            ``2 * (y_pred - y_true) / N`` where ``N`` is the number of
            elements that :meth:`mse` averages over.
        """
        # The 2 * ... is often ignored in deep learning as it's absorbed by the learning rate
        diff = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
        # Use the size of the broadcast difference so the divisor matches the
        # np.mean in mse(); this also works for plain Python scalars, which
        # have no .size attribute of their own.
        return 2 * diff / diff.size

    # --- 2. Binary Cross Entropy (for Binary Classification) ---
    @staticmethod
    def binary_cross_entropy(y_true, y_pred, eps=1e-15):
        """Return the mean binary cross-entropy.

        Args:
            y_true: target label(s), expected in [0, 1].
            y_pred: predicted probability(ies); must broadcast against
                ``y_true``.
            eps: predictions are clipped to ``[eps, 1 - eps]`` so that
                ``log(0)``, which is undefined, is never evaluated.

        Returns:
            ``-mean(y_true * log(p) + (1 - y_true) * log(1 - p))`` where
            ``p`` is the clipped prediction.

        Raises:
            ValueError: if ``eps`` is not strictly between 0 and 0.5.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = Loss._clip_probabilities(y_pred, eps)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    @staticmethod
    def bce_derivative(y_true, y_pred, eps=1e-15):
        """Return the gradient of :meth:`binary_cross_entropy` w.r.t. ``y_pred``.

        Args:
            y_true: target label(s), expected in [0, 1].
            y_pred: predicted probability(ies); must broadcast against
                ``y_true``.
            eps: predictions are clipped to ``[eps, 1 - eps]`` so the
                denominator ``p * (1 - p)`` is never zero.

        Returns:
            ``(p - y_true) / (p * (1 - p)) / N`` where ``p`` is the clipped
            prediction and ``N`` is the number of elements the loss averages
            over.

        Raises:
            ValueError: if ``eps`` is not strictly between 0 and 0.5.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = Loss._clip_probabilities(y_pred, eps)
        grad = (y_pred - y_true) / (y_pred * (1 - y_pred))
        # The loss is a mean over N elements, so each element's gradient
        # carries a 1/N factor (same convention as mse_derivative).
        return grad / grad.size

In [3]:
# Ground truth for this demo: the positive class ("Yes").
y_true = 1.0

# Predicted probabilities of "Yes" for two scenarios:
#   A -> 0.99: the model is confident and right.
#   B -> 0.01: the model is confident and wrong.
y_pred_A, y_pred_B = 0.99, 0.01

# Score both predictions against the same label.
loss_A, loss_B = (
    Loss.binary_cross_entropy(y_true, prob) for prob in (y_pred_A, y_pred_B)
)

# Report each loss, then how many times larger the wrong one is.
print(
    f"Loss when correct: {loss_A:.4f}",
    f"Loss when wrong: {loss_B:.4f}",
    f"The 'Wrong' prediction is {loss_B/loss_A:.0f} times more painful!",
    sep="\n",
)

Loss when correct: 0.0101
Loss when wrong: 4.6052
The 'Wrong' prediction is 458 times more painful!


### Why not use MSE for classification?
Answer: MSE is convex (bowl-shaped) when the model is linear, but once the output is passed through a Sigmoid activation (which we use for classification), the loss surface becomes non-convex in the weights: it can have local bumps and wide flat plateaus where gradient descent stalls.
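Cross-Entropy "cancels out" the flatness of the Sigmoid curve: by the chain rule, the Sigmoid's derivative $\hat{y}(1-\hat{y})$ cancels the $\hat{y}(1-\hat{y})$ in the denominator of the cross-entropy derivative, leaving a gradient proportional to $\hat{y} - y$ with respect to the pre-activation. With MSE that factor is not cancelled, so the gradient shrinks towards zero exactly when the model is confidently wrong. With Cross-Entropy the gradient stays large in that case, making it much easier for the optimizer to find the bottom of the hill.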