In [2]:
from imports import *

Using Python, find the centrality and dispersion measures (mean, median, mode, variance, std, MAD, IQR).

In [3]:
# Centrality and dispersion measures for a single Binomial(n=100, p=0.2).
bi_n1 = 100
bi_p1 = 0.2
binom_dist_p1 = binom(bi_n1, bi_p1)

# --- Centrality ---
mean = binom_dist_p1.mean()
print("Mean:", mean)
median = binom_dist_p1.median()
print("Median:", median)

# floor((n + 1) * p) is the mode of Binomial(n, p) whenever (n + 1) * p is not
# an integer (here 101 * 0.2 = 20.2 -> 20). When (n + 1) * p is an integer and
# 0 < p < 1, both (n + 1) * p and (n + 1) * p - 1 are modes; when p == 1 the
# mode is n, so the formula would need capping in that case.
mode = floor((bi_n1 + 1) * bi_p1)
print("Mode:", mode)

# --- Dispersion ---
variance = binom_dist_p1.var()
print("Variance:", variance)

std_dev = binom_dist_p1.std()
print("Standard Deviation:", std_dev)

# Every possible outcome 0..n and its probability.
k_vals = np.arange(bi_n1 + 1)
pmf_vals = binom_dist_p1.pmf(k_vals)

# MAD here is the *mean* absolute deviation about the mean:
# E|X - mean| = sum over k of |k - mean| * P(X = k).
mad = np.sum(np.abs(k_vals - mean) * pmf_vals)
print("Mean Absolute Deviation (MAD):", mad)

# IQR = 75th percentile - 25th percentile.
q25 = binom_dist_p1.ppf(0.25)
q75 = binom_dist_p1.ppf(0.75)
iqr = q75 - q25
print("Interquartile Range (IQR):", iqr)

Mean: 20.0
Median: 20.0
Mode: 20
Standard Deviation: 4.0
Mean Absolute Deviation (MAD): 3.1776068738823895
Interquartile Range (IQR): 6.0


In [17]:

def binomial_stats(n, p):
    """Return centrality and dispersion measures for a Binomial(n, p) distribution.

    Parameters
    ----------
    n : int
        Number of trials.
    p : float
        Success probability of a single trial.

    Returns
    -------
    dict
        Keys ``"dist_type"``, ``"mean"``, ``"median"``, ``"mode"``,
        ``"variance"``, ``"std"``, ``"mad"`` (mean absolute deviation about
        the mean) and ``"iqr"``.
    """
    dist = binom(n, p)

    # Mean, variance, std and median come straight from the frozen SciPy distribution.
    mean_val = dist.mean()
    var_val = dist.var()
    std_val = dist.std()
    median_val = dist.median()

    # Mode: floor((n + 1) * p), capped at n. Without the cap, p == 1 would
    # yield n + 1, which lies outside the support 0..n.
    # If (n + 1) * p is an integer and 0 < p < 1, (n + 1) * p - 1 is a mode as
    # well; only the larger of the two is reported here.
    mode_val = int(min(floor((n + 1) * p), n))

    # MAD: sum of |k - mean| * PMF(k) over k = 0..n.
    k_vals = np.arange(n + 1)
    pmf_vals = dist.pmf(k_vals)
    mad_val = np.sum(np.abs(k_vals - mean_val) * pmf_vals)

    # IQR: ppf(0.75) - ppf(0.25).
    q25 = dist.ppf(0.25)
    q75 = dist.ppf(0.75)
    iqr_val = q75 - q25

    return {
        "dist_type": f"Binomial(n={n}, p={p})",
        "mean": mean_val,
        "median": median_val,
        "mode": mode_val,
        "variance": var_val,
        "std": std_val,
        "mad": mad_val,
        "iqr": iqr_val
    }

def exponential_stats(lmbda=1.0):
    """Return centrality and dispersion measures for Exponential(rate=lmbda).

    Parameters
    ----------
    lmbda : float, default 1.0
        Rate parameter; SciPy's ``expon`` is parameterised by
        ``scale = 1 / lmbda``.

    Returns
    -------
    dict
        Keys ``"dist_type"``, ``"mean"``, ``"median"``, ``"mode"``,
        ``"variance"``, ``"std"``, ``"mad"`` (mean absolute deviation about
        the mean) and ``"iqr"``.
    """
    # Closed forms for Exponential(lmbda):
    #   Mean     = 1 / lmbda
    #   Median   = ln(2) / lmbda
    #   Mode     = 0
    #   Variance = 1 / lmbda^2
    #   Std      = 1 / lmbda
    #   IQR      = ppf(0.75) - ppf(0.25)
    #   MAD      = E|X - mean| = 2 / (e * lmbda)   (≈ 0.73576 for lmbda = 1)

    dist = expon(scale=1.0/lmbda)
    mean_val = dist.mean()
    median_val = dist.median()
    mode_val = 0.0
    var_val = dist.var()
    std_val = dist.std()

    q25 = dist.ppf(0.25)
    q75 = dist.ppf(0.75)
    iqr_val = q75 - q25

    # E|X - m| with m = 1/lmbda equals 2 * integral_0^m F(x) dx
    #   = 2 * (m - (1 - e^-1) / lmbda) = 2 * e^-1 / lmbda,
    # so the lmbda = 1 value (2/e) simply scales by 1 / lmbda.
    mad_val = 2 * np.exp(-1) / lmbda

    return {
        # ":g" keeps the label "Exponential(1)" for the default rate 1.0.
        "dist_type": f"Exponential({lmbda:g})",
        "mean": mean_val,
        "median": median_val,
        "mode": mode_val,
        "variance": var_val,
        "std": std_val,
        "mad": mad_val,
        "iqr": iqr_val
    }

def normal_stats(mu=0.0, sigma=1.0):
    """Summarise a Normal(mu, sigma) distribution.

    Parameters
    ----------
    mu : float, default 0.0
        Location of the distribution; reported as mean, median and mode.
    sigma : float, default 1.0
        Scale (standard deviation) of the distribution.

    Returns
    -------
    dict
        Keys ``"dist_type"``, ``"mean"``, ``"median"``, ``"mode"``,
        ``"variance"``, ``"std"``, ``"mad"`` and ``"iqr"``.
    """
    frozen = norm(loc=mu, scale=sigma)

    # Only the quartiles are read from SciPy; everything else is closed-form.
    lower_quartile = frozen.ppf(0.25)
    upper_quartile = frozen.ppf(0.75)

    # Mean absolute deviation about the mean: sqrt(2/pi) for the standard
    # normal, multiplied by sigma for a general scale.
    mean_abs_dev = sigma * np.sqrt(2 / np.pi)

    return {
        "dist_type": f"Normal({mu}, {sigma})",
        # Mean, median and mode are all reported as mu.
        "mean": mu,
        "median": mu,
        "mode": mu,
        "variance": sigma**2,
        "std": sigma,
        "mad": mean_abs_dev,
        "iqr": upper_quartile - lower_quartile,
    }

# One summary dict per distribution of interest.
stats_list = [
    binomial_stats(n=100, p=0.05),
    binomial_stats(n=100, p=0.2),
    exponential_stats(lmbda=1.0),       # Exponential(1)
    normal_stats(mu=0.0, sigma=1.0),    # Normal(0,1)
]

# Tabulate the summaries, one row per distribution, with a fixed column order.
df = pd.DataFrame(
    stats_list,
    columns=[
        "dist_type",
        "mean",
        "median",
        "mode",
        "variance",
        "std",
        "mad",
        "iqr",
    ],
)
print(df)


                 dist_type  mean     median  mode  variance       std  \
0  Binomial(n=100, p=0.05)   5.0   5.000000   5.0      4.75  2.179449   
1   Binomial(n=100, p=0.2)  20.0  20.000000  20.0     16.00  4.000000   
2           Exponential(1)   1.0   0.693147   0.0      1.00  1.000000   
3         Normal(0.0, 1.0)   0.0   0.000000   0.0      1.00  1.000000   

        mad       iqr  
0  1.710169  3.000000  
1  3.177607  6.000000  
2  0.735759  1.098612  
3  0.797885  1.348980  


# One thing I learnt

The median of a binomial distribution doesn’t always have a closed-form expression.
A "closed-form expression" means a mathematical formula that can be written using a finite number of standard operations (like addition, multiplication, exponents, etc.).


#### Note to self on learning AKA META-Learning
I should just finish this in one go.
I have a bad habit of partially starting things, then telling myself I will come back to them.
This takes longer, and I do not think I gain anything from a learning standpoint.

Also, I relied heavily on ChatGPT for this.
What other learning could I extract from this?
Personally, I think it helps to look at the results, check whether they make sense intuitively, ask the deeper questions that come to mind, and then answer those.

### Questions I wonder about

1. When is it advantageous to use Mean Absolute Deviation (MAD) vs. Standard Deviation (SD)?
Use MAD when:

    You need a more intuitive/interpretable measure: MAD is easier to explain as the "average distance from the mean"

    
    Dealing with outliers

    Less sensitive to outliers than SD

    Good for skewed distributions

    Better for robust statistics

    Teaching basic statistical concepts

    Simpler to understand

    Easier to calculate manually

Use SD when:

    Need mathematical properties
      
    Working with normal distributions

    Directly related to normal distribution properties

    Better for confidence intervals

    Required for z-scores


2. How is MAD solved for in the exponential distribution?
3. Why is the MAD of the standard normal $\sqrt{\frac{2}{\pi}}$?

1. **Standard Normal Distribution $N(0,1)$**

   The mean absolute deviation is:
   $$
   \mathrm{MAD}_{\mathrm{std}} = E|Z| = 2\int_0^\infty z \,\frac{1}{\sqrt{2\pi}} e^{-z^2/2}\, dz = \frac{2}{\sqrt{2\pi}} = \sqrt{\frac{2}{\pi}}.
   $$

2. **General Normal Distribution $N(\mu, \sigma)$**

   When scaled by $\sigma$, the MAD becomes:
   $$
   \mathrm{MAD} = \sigma \times \sqrt{\frac{2}{\pi}}.
   $$

<!-- 3. Why, for the binomial, do we approximate the median with a numerical method?

    - The comment refers to the fact that in older versions of SciPy, the median() method might not be available directly for discrete distributions like the binomial distribution. In such cases, you can find the median by searching for the value where the cumulative distribution function (CDF) first reaches or exceeds 0.5. -->

