# 4.1 Inferential Statistics Tutorial

This notebook covers key concepts in inferential statistics including:
- Population vs Sample
- Parameters vs Statistics
- Sampling Distributions
- Confidence Intervals
- P-values

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Fix the seed of NumPy's global random generator so that every
# Restart & Run All produces the same population, samples and figures.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Population vs Sample

Let's demonstrate sampling from a population and visualize the relationship between population parameters and sample statistics.

In [None]:
# Simulated population: 10,000 draws from a normal distribution
# with mean 100 and standard deviation 15.
population = np.random.normal(loc=100, scale=15, size=10000)

# One random sample of 100 observations, drawn without replacement.
sample = np.random.choice(population, size=100, replace=False)


def _hist_with_mean(ax, values, bins, name):
    """Draw a histogram of ``values`` on ``ax`` and mark its mean.

    The mean is shown as a dashed red vertical line whose legend entry
    reports the value to two decimals; ``name`` prefixes both the legend
    label and the panel title.
    """
    center = np.mean(values)
    sns.histplot(values, ax=ax, bins=bins)
    ax.axvline(center, color='red', linestyle='--',
               label=f'{name} Mean = {center:.2f}')
    ax.set_title(f'{name} Distribution')
    ax.legend()


# Side-by-side panels: population on the left, sample on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
_hist_with_mean(ax1, population, bins=50, name='Population')
_hist_with_mean(ax2, sample, bins=20, name='Sample')

plt.tight_layout()
plt.show()

## 2. Sampling Distribution

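Let's illustrate the Central Limit Theorem by repeatedly sampling from the population and building the sampling distribution of the sample mean. (Our population is already normal, so the sample means are normally distributed at any sample size; the theorem is more striking with a skewed population, as in Practice Exercise 1.)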

In [None]:
# Size of each resample and number of resamples. SAMPLE_SIZE is used both
# for drawing and for the theoretical standard error below, so the two
# printed numbers always refer to the same n.
SAMPLE_SIZE = 100
N_RESAMPLES = 1000

# Draw N_RESAMPLES samples and record each sample's mean.
# np.random.choice samples with replacement by default, which matches the
# independent-draws assumption behind the σ/√n formula.
sample_means = [
    np.mean(np.random.choice(population, size=SAMPLE_SIZE))
    for _ in range(N_RESAMPLES)
]

# Plot the sampling distribution of means
sampling_fig, sampling_ax = plt.subplots(figsize=(10, 6))
sns.histplot(sample_means, bins=30, ax=sampling_ax)
sampling_ax.axvline(np.mean(population), color='red', linestyle='--', label='Population Mean')
sampling_ax.set_xlabel('Sample Mean')
sampling_ax.set_title('Sampling Distribution of Means')
sampling_ax.legend()
plt.show()

# Empirical spread of the sample means vs. the value predicted by theory.
print(f"Standard Error of Mean: {np.std(sample_means):.2f}")
print(f"Theoretical SE (σ/√n): {np.std(population)/np.sqrt(SAMPLE_SIZE):.2f}")

## 3. Confidence Intervals

Let's calculate and visualize confidence intervals for the mean.

In [None]:
# Function to calculate confidence interval
def calculate_ci(data, confidence=0.95):
    """Return a two-sided t-based confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : sequence of numbers
        One-dimensional observations (list, NumPy array, pandas Series, ...).
        At least two observations are required, because the standard error
        is undefined for fewer.
    confidence : float, default 0.95
        Confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds of the interval.

    Raises
    ------
    ValueError
        If ``data`` has fewer than two observations or ``confidence`` is not
        strictly between 0 and 1.
    """
    n = len(data)
    if n < 2:
        raise ValueError(
            f"calculate_ci needs at least 2 observations, got {n}"
        )
    if not 0 < confidence < 1:
        raise ValueError(
            f"confidence must be strictly between 0 and 1, got {confidence!r}"
        )

    mean = np.mean(data)
    # stats.sem uses ddof=1 by default, i.e. the sample standard deviation / sqrt(n).
    se = stats.sem(data)
    # Student's t with n-1 degrees of freedom, centred on the sample mean.
    return stats.t.interval(confidence, df=n - 1, loc=mean, scale=se)

# 95% interval (the default level) for the sample drawn earlier
ci = calculate_ci(sample)
ci_lower, ci_upper = ci

# Histogram of the sample with its mean and both interval bounds marked
ci_fig, ci_ax = plt.subplots(figsize=(10, 6))
sns.histplot(sample, bins=20, ax=ci_ax)
ci_ax.axvline(np.mean(sample), color='red', linestyle='--', label='Sample Mean')
# Only the lower bound carries a label, so the legend shows a single CI entry.
ci_ax.axvline(ci_lower, color='green', linestyle=':', label='95% CI')
ci_ax.axvline(ci_upper, color='green', linestyle=':')
ci_ax.set_title('Sample Distribution with 95% Confidence Interval')
ci_ax.legend()
plt.show()

print(f"95% Confidence Interval: ({ci_lower:.2f}, {ci_upper:.2f})")

## 4. P-values

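Let's demonstrate p-value calculation using a one-sample t-test. The null hypothesis is that the population mean equals a hypothesized value; the p-value is the probability, assuming the null hypothesis is true, of observing a t-statistic at least as extreme as the one computed from our sample.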

In [None]:
# Null hypothesis: the population mean equals this value.
hypothesized_mean = 105

# Two-sided one-sample t-test of the sample against the hypothesized mean
t_stat, p_value = stats.ttest_1samp(sample, hypothesized_mean)

# Density histogram of the sample with both means marked for comparison
test_fig, test_ax = plt.subplots(figsize=(10, 6))
sns.histplot(sample, bins=20, stat='density', ax=test_ax)
test_ax.axvline(hypothesized_mean, color='red', linestyle='--', label='Hypothesized Mean')
test_ax.axvline(np.mean(sample), color='blue', linestyle='--', label='Sample Mean')
test_ax.set_title('Sample Distribution with Hypothesized Mean')
test_ax.legend()
plt.show()

print(f"t-statistic: {t_stat:.2f}")
print(f"p-value: {p_value:.4f}")

## Practice Exercises

1. Create your own population with a different distribution (e.g., uniform or exponential) and explore sampling properties.

2. Investigate how sample size affects:
   - The width of confidence intervals
   - The shape of the sampling distribution
   - The power of hypothesis tests

3. Calculate and compare confidence intervals at different confidence levels (90%, 95%, 99%).

4. Perform two-sample hypothesis tests comparing means of different groups.