# Introduction to Statistics - Tutorial

Statistics is the foundation of data science. This notebook covers essential statistical concepts through hands-on examples.

## Learning Objectives
- Calculate and interpret measures of central tendency
- Understand measures of spread and variability
- Work with probability distributions
- Visualize statistical concepts

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot styling: use the whitegrid style sheet for every figure in the notebook
plt.style.use('seaborn-v0_8-whitegrid')

# Seed NumPy's global random generator so randomly generated samples are
# identical on every Restart & Run All
SEED = 42
np.random.seed(SEED)

print("Libraries loaded!")

## 1. Measures of Central Tendency

Central tendency tells us about the "typical" value in our data.

In [None]:
# Sample data: exam scores
scores = [72, 85, 90, 65, 78, 92, 88, 76, 81, 95, 70, 84, 79, 91, 73]

# Calculate measures
mean = np.mean(scores)
median = np.median(scores)

# stats.mode returns the most frequent value together with how often it occurs.
# When several values tie it returns the smallest one, so if every value is
# unique it simply returns the minimum - which is NOT a meaningful mode.
# We keep the count so we can tell these cases apart.
mode_result = stats.mode(scores, keepdims=True)
mode = mode_result.mode[0]
mode_count = mode_result.count[0]

print(f"Scores: {sorted(scores)}")
print(f"\nMean (Average): {mean:.2f}")
print(f"Median (Middle): {median:.2f}")
if mode_count > 1:
    print(f"Mode (Most Common): {mode}")
else:
    # Every score occurs once: reporting the minimum as "most common" would mislead
    print("Mode (Most Common): none (every score appears exactly once)")

In [None]:
# Visualize central tendency: histogram of the scores with mean and median overlaid
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(scores, bins=10, edgecolor='white', alpha=0.7, color='steelblue')

# One vertical marker per statistic: (value, colour, line style, legend name)
markers = [
    (mean, 'red', '--', 'Mean'),
    (median, 'green', ':', 'Median'),
]
for value, colour, dash, name in markers:
    ax.axvline(value, color=colour, linestyle=dash, linewidth=2,
               label=f'{name}: {value:.1f}')

ax.set(xlabel='Score', ylabel='Frequency', title='Distribution of Exam Scores')
ax.legend()
plt.show()

### When to Use Each Measure

| Measure | Best When | Watch Out For |
|---------|-----------|---------------|
| **Mean** | Symmetric distributions | Outliers pull it away |
| **Median** | Skewed data or outliers | Ignores how extreme the values in the tails are |
| **Mode** | Categorical data | May not exist, or may not be unique |

In [None]:
# Effect of outliers: one extreme value among otherwise similar salaries
salaries = [50000, 52000, 55000, 48000, 51000, 53000, 49000, 500000]  # CEO salary!

# Compute both statistics up front so the report below only formats them
salary_mean = np.mean(salaries)
salary_median = np.median(salaries)

print("Salaries with outlier:")
print(f"  Mean: ${salary_mean:,.0f}")
print(f"  Median: ${salary_median:,.0f}")
print("\nThe median better represents 'typical' salary here!")

## 2. Measures of Spread

Spread tells us how much our data varies.

In [None]:
# Calculate measures of spread for the exam scores
data = np.array(scores)

range_val = data.max() - data.min()
variance = data.var(ddof=1)  # ddof=1 -> sample variance
std_dev = data.std(ddof=1)   # ddof=1 -> sample standard deviation
q1, q3 = np.percentile(data, [25, 75])
iqr = q3 - q1                # width of the middle 50% of the data

# Build the report line by line, then emit it in one go
report = [
    "Measures of Spread:",
    f"  Range: {range_val}",
    f"  Variance: {variance:.2f}",
    f"  Standard Deviation: {std_dev:.2f}",
    f"  Q1 (25th percentile): {q1}",
    f"  Q3 (75th percentile): {q3}",
    f"  IQR (Q3 - Q1): {iqr}",
]
print("\n".join(report))

In [None]:
# Visualize spread: box plot (left) and histogram with ±1 standard deviation (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Box plot. Boxes are drawn vertically by default, so no orientation argument
# is passed; the explicit `vert=True` was removed because matplotlib is
# deprecating `vert` in favour of `orientation`.
axes[0].boxplot(scores)
axes[0].set_ylabel('Score')
axes[0].set_title('Box Plot of Scores')

# Histogram with std dev
axes[1].hist(scores, bins=8, edgecolor='white', alpha=0.7)
axes[1].axvline(mean, color='red', linestyle='--', label='Mean')
# Only the first of the two ±1 std-dev lines gets a label so the legend shows one entry
axes[1].axvline(mean - std_dev, color='orange', linestyle=':', label='±1 Std Dev')
axes[1].axvline(mean + std_dev, color='orange', linestyle=':')
axes[1].set_xlabel('Score')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution with Standard Deviation')
axes[1].legend()

plt.tight_layout()
plt.show()

## 3. The Normal Distribution

The normal (Gaussian) distribution is fundamental to statistics.

In [None]:
# Draw a large sample from a normal distribution
mu, sigma = 100, 15  # Mean = 100, Std = 15 (like IQ scores)
normal_data = np.random.normal(loc=mu, scale=sigma, size=10000)

fig, ax = plt.subplots(figsize=(10, 6))

# Empirical distribution: density=True scales the histogram to unit area so it
# is comparable with the probability density curve drawn on top
ax.hist(normal_data, bins=50, density=True, alpha=0.7, color='steelblue')

# Theoretical density over mu ± 4 sigma
x = np.linspace(mu - 4*sigma, mu + 4*sigma, 100)
ax.plot(x, stats.norm.pdf(x, loc=mu, scale=sigma), 'r-', linewidth=2, label='Normal Curve')

# Dashed guides at ±1, ±2 and ±3 standard deviations from the mean
for k in (1, 2, 3):
    for edge in (mu - k*sigma, mu + k*sigma):
        ax.axvline(edge, color='gray', linestyle='--', alpha=0.5)

ax.set(
    xlabel='Value',
    ylabel='Density',
    title=f'Normal Distribution (μ={mu}, σ={sigma})',
)
ax.legend()
plt.show()

In [None]:
# The 68-95-99.7 Rule
def _percent_within(values, center, spread, k):
    """Return the percentage of `values` within `k * spread` of `center` (bounds inclusive)."""
    inside = (values >= center - k*spread) & (values <= center + k*spread)
    return np.sum(inside) / len(values) * 100


# Share of the sample that falls within 1, 2 and 3 standard deviations of the mean
within_1_std, within_2_std, within_3_std = (
    _percent_within(normal_data, mu, sigma, k) for k in (1, 2, 3)
)

print("The 68-95-99.7 Rule:")
print(f"  Within 1 std dev: {within_1_std:.1f}% (expected: ~68%)")
print(f"  Within 2 std dev: {within_2_std:.1f}% (expected: ~95%)")
print(f"  Within 3 std dev: {within_3_std:.1f}% (expected: ~99.7%)")

## 4. Z-Scores (Standardization)

Z-scores tell us how many standard deviations a value is from the mean: $z = (x - \mu) / \sigma$.

In [None]:
# Calculate z-scores
def calculate_z_score(value, mean, std):
    """Return how many standard deviations `value` lies from `mean`.

    Args:
        value: Observation to standardize.
        mean: Mean of the distribution the observation is compared against.
        std: Standard deviation of that distribution.

    Returns:
        The z-score ``(value - mean) / std``; negative when `value` is below
        `mean`, zero when they are equal.
    """
    deviation = value - mean
    return deviation / std

# Example: If IQ mean=100, std=15
IQ_MEAN, IQ_STD = 100, 15
iq_scores = [85, 100, 115, 130, 145]

print("IQ Score → Z-Score:")
for iq in iq_scores:
    z = calculate_z_score(iq, IQ_MEAN, IQ_STD)
    # Standard normal CDF at z, scaled from a fraction to a percentage
    percentile = 100 * stats.norm.cdf(z)
    print(f"  IQ {iq} → z = {z:.2f} → {percentile:.1f}th percentile")

## 5. Correlation

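Correlation measures the strength and direction of the linear relationship between two variables. The Pearson coefficient $r$ ranges from $-1$ (perfect negative) through $0$ (no linear relationship) to $+1$ (perfect positive).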

In [None]:
# Generate correlated data: scores rise by about 5 points per study hour, plus noise
n = 100
study_hours = np.random.uniform(1, 10, n)
exam_scores = 50 + 5 * study_hours + np.random.normal(0, 5, n)

# Calculate correlation (np.corrcoef returns a 2x2 matrix; [0, 1] is r between the two inputs)
correlation = np.corrcoef(study_hours, exam_scores)[0, 1]

# Visualize
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(study_hours, exam_scores, alpha=0.6)

# Add trend line: least-squares straight line through the points.
# NOTE: `z` here holds the fitted [slope, intercept] coefficients, not a z-score.
z = np.polyfit(study_hours, exam_scores, 1)
p = np.poly1d(z)
# Draw the line between the two x endpoints only. Plotting it through all the
# unsorted study_hours makes the path double back on itself, and the
# overlapping dashes render as an uneven, nearly solid line.
trend_x = np.array([study_hours.min(), study_hours.max()])
ax.plot(trend_x, p(trend_x), "r--", label=f'Trend (r = {correlation:.2f})')

ax.set_xlabel('Study Hours')
ax.set_ylabel('Exam Score')
ax.set_title('Correlation: Study Hours vs Exam Score')
ax.legend()
plt.show()

print(f"Correlation coefficient: {correlation:.3f}")

In [None]:
# Different types of correlations, shown side by side
def _scatter_panel(ax, x, y, label):
    """Scatter `y` against `x` on `ax`, titled with `label` and the correlation coefficient."""
    ax.scatter(x, y)
    ax.set_title(f'{label}\nr = {np.corrcoef(x, y)[0,1]:.2f}')


fig, axes = plt.subplots(1, 3, figsize=(14, 4))

# Strong positive: y rises with x, with a little noise
x1 = np.random.uniform(0, 10, 50)
y1 = 2*x1 + np.random.normal(0, 1, 50)
_scatter_panel(axes[0], x1, y1, 'Strong Positive')

# No correlation: x and y are drawn independently
x2 = np.random.uniform(0, 10, 50)
y2 = np.random.uniform(0, 10, 50)
_scatter_panel(axes[1], x2, y2, 'No Correlation')

# Strong negative: y falls as x rises, with a little noise
x3 = np.random.uniform(0, 10, 50)
y3 = -2*x3 + 20 + np.random.normal(0, 1, 50)
_scatter_panel(axes[2], x3, y3, 'Strong Negative')

plt.tight_layout()
plt.show()

## 6. Practice Exercises

### Exercise 1: Calculate statistics for this dataset

In [None]:
# Exercise data: 12 height measurements (presumably centimetres; units are not stated)
heights = [165, 170, 175, 168, 182, 159, 177, 171, 180, 163, 188, 172]

# Calculate: mean, median, std, and range
# Hint: np.std(..., ddof=1) gives the sample standard deviation
# Your code here


### Exercise 2: What percentile is a score of 85 if scores are normally distributed with mean=75 and std=10?

In [None]:
# Calculate the z-score and then the percentile
# Your code here


---

<details>
<summary>Click to see solutions</summary>

```python
# Exercise 1 Solution
heights = [165, 170, 175, 168, 182, 159, 177, 171, 180, 163, 188, 172]
print(f"Mean: {np.mean(heights):.1f}")
print(f"Median: {np.median(heights):.1f}")
print(f"Std: {np.std(heights, ddof=1):.1f}")
print(f"Range: {max(heights) - min(heights)}")

# Exercise 2 Solution
z = (85 - 75) / 10  # z = 1.0
percentile = stats.norm.cdf(z) * 100
print(f"Z-score: {z}")
print(f"Percentile: {percentile:.1f}th")
```
</details>

## Summary

You've learned:
- **Central Tendency**: Mean, median, mode and when to use each
- **Spread**: Variance, standard deviation, range, IQR
- **Normal Distribution**: The 68-95-99.7 rule
- **Z-Scores**: Standardizing values for comparison
- **Correlation**: Measuring relationships between variables