<span style="color:#333333; font-size:24px; font-weight:bold"> Compiled by <a href=https://github.com/cyterat style="color:#00b2b7;">cyterat</a></span>

# Explanation:

- __confidence_level__ is typically 0.95 for 95% confidence

- __margin_error__ is the acceptable error margin (e.g., 0.05 for ±5%)

- __std_dev__ is the estimated standard deviation of the population

- __p__ is the estimated proportion (0.5 is used when unknown, as it gives the largest sample size)

- __power__ is typically set to 0.8, meaning an 80% chance of detecting an effect if one exists

- __alpha__ is the significance level, typically 0.05

- __r__ is the expected correlation coefficient

- __deff__ is the design effect for cluster sampling, typically between 1 and 4

# 1. Sample Size for Estimating a Proportion

__Use case__: When you're trying to estimate the prevalence of a certain characteristic in a population (e.g., the proportion of successful missions).

In [None]:
import math

def sample_size_proportion(confidence_level, margin_error, p=0.5):
    """Return the sample size needed to estimate a population proportion.

    Applies the normal-approximation formula ``n = z^2 * p * (1 - p) / e^2``
    and rounds the result up to a whole number of observations.

    Args:
        confidence_level: Two-sided confidence level, strictly between 0
            and 1 (e.g. 0.95 for 95% confidence).
        margin_error: Acceptable margin of error as a proportion
            (e.g. 0.05 for +/-5%). Must be positive.
        p: Estimated proportion, between 0 and 1 inclusive. The default of
            0.5 maximises ``p * (1 - p)`` and therefore the sample size.

    Returns:
        The required sample size as an int.

    Raises:
        ValueError: If an argument is outside its valid range.
    """
    # Imported locally so the function does not rely on imports made elsewhere.
    from statistics import NormalDist

    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")
    if margin_error <= 0:
        raise ValueError("margin_error must be positive")
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1")

    # The conventional rounded z-scores are kept for the common levels so
    # results for those levels stay unchanged. Any other level is derived
    # from the standard normal distribution instead of silently being
    # treated as 95%.
    common_z_scores = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
    z_score = common_z_scores.get(confidence_level)
    if z_score is None:
        z_score = NormalDist().inv_cdf((1 + confidence_level) / 2)

    sample_size = (z_score**2 * p * (1 - p)) / (margin_error**2)
    return math.ceil(sample_size)

# Example usage: 95% confidence, a 5% margin of error and the default p=0.5.
n = sample_size_proportion(confidence_level=0.95, margin_error=0.05)
print(f"Required sample size: {n}")  # prints "Required sample size: 385"

# 2. Sample Size for Comparing Two Proportions

__Use case__: When you're comparing the effectiveness of two different strategies or systems.

In [None]:
import math
from scipy import stats

def sample_size_two_proportions(p1, p2, power=0.8, alpha=0.05):
    """Return the per-group sample size for comparing two proportions.

    Uses the two-sided normal-approximation formula with a pooled variance
    term under the null hypothesis and an unpooled term under the
    alternative::

        n = (z_alpha * sqrt(2 * p_bar * q_bar)
             + z_beta * sqrt(p1 * q1 + p2 * q2))^2 / (p1 - p2)^2

    Args:
        p1: Expected proportion in the first group, between 0 and 1.
        p2: Expected proportion in the second group, between 0 and 1.
            Must differ from ``p1``.
        power: Desired statistical power, strictly between 0 and 1.
        alpha: Two-sided significance level, strictly between 0 and 1.

    Returns:
        The required sample size for each group as an int.

    Raises:
        ValueError: If an argument is out of range or ``p1 == p2``.
    """
    for name, value in (("p1", p1), ("p2", p2)):
        if not 0 <= value <= 1:
            raise ValueError(f"{name} must be between 0 and 1")
    if p1 == p2:
        # A zero difference would need an infinite sample; without this
        # check the division below fails with a bare ZeroDivisionError.
        raise ValueError("p1 and p2 must differ to compute a sample size")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")

    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)

    p_pooled = (p1 + p2) / 2
    q_pooled = 1 - p_pooled

    null_term = z_alpha * math.sqrt(2 * p_pooled * q_pooled)
    alt_term = z_beta * math.sqrt(p1 * (1 - p1) + p2 * (1 - p2))
    n = (null_term + alt_term) ** 2 / (p1 - p2) ** 2

    return math.ceil(n)

# Example usage: detect a change from 30% to 40% with 80% power at alpha=0.05.
n = sample_size_two_proportions(p1=0.3, p2=0.4, power=0.8, alpha=0.05)
print(f"Required sample size per group: {n}")  # prints "Required sample size per group: 356"

# 3. Sample Size for Estimating a Mean

__Use case__: When you're trying to estimate an average value in a population (e.g., average mission duration).

In [None]:
import math

def sample_size_mean(confidence_level, margin_error, std_dev):
    """Return the sample size needed to estimate a population mean.

    Applies ``n = (z * sigma / e)^2`` and rounds the result up to a whole
    number of observations.

    Args:
        confidence_level: Two-sided confidence level, strictly between 0
            and 1 (e.g. 0.95 for 95% confidence).
        margin_error: Acceptable margin of error, in the same units as
            ``std_dev``. Must be positive.
        std_dev: Estimated population standard deviation. Must not be
            negative.

    Returns:
        The required sample size as an int.

    Raises:
        ValueError: If an argument is outside its valid range.
    """
    # Imported locally so the function does not rely on imports made elsewhere.
    from statistics import NormalDist

    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")
    if margin_error <= 0:
        raise ValueError("margin_error must be positive")
    if std_dev < 0:
        raise ValueError("std_dev must not be negative")

    # The conventional rounded z-scores are kept for the common levels so
    # results for those levels stay unchanged. Any other level is derived
    # from the standard normal distribution instead of silently being
    # treated as 95%.
    common_z_scores = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
    z_score = common_z_scores.get(confidence_level)
    if z_score is None:
        z_score = NormalDist().inv_cdf((1 + confidence_level) / 2)

    sample_size = (z_score * std_dev / margin_error) ** 2
    return math.ceil(sample_size)

# Example usage: 95% confidence, margin of error 0.5, standard deviation 2.5.
n = sample_size_mean(confidence_level=0.95, margin_error=0.5, std_dev=2.5)
print(f"Required sample size: {n}")  # prints "Required sample size: 97"

# 4. Sample Size for Detecting a Correlation

__Use case__: When you're trying to detect a relationship between two continuous variables (e.g., training hours and performance scores).

In [None]:
import math
from scipy import stats

def sample_size_correlation(r, alpha=0.05, power=0.8):
    """Return the sample size needed to detect a correlation of size ``r``.

    Uses the Fisher z-transformation approximation
    ``n = ((z_alpha + z_beta) / atanh(r))^2 + 3`` for a two-sided test and
    rounds the result up to a whole number of observations. The sign of
    ``r`` does not affect the result.

    Args:
        r: Expected correlation coefficient; non-zero and strictly between
            -1 and 1.
        alpha: Two-sided significance level, strictly between 0 and 1.
        power: Desired statistical power, strictly between 0 and 1.

    Returns:
        The required sample size as an int.

    Raises:
        ValueError: If an argument is outside its valid range.
    """
    if not -1 < r < 1:
        raise ValueError("r must be strictly between -1 and 1")
    if r == 0:
        # A zero correlation has a zero Fisher transform, so the formula
        # would divide by zero.
        raise ValueError("r must be non-zero to compute a sample size")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")

    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)

    # atanh(r) equals 0.5 * ln((1 + r) / (1 - r)), the Fisher z-transformation.
    fisher_z = math.atanh(r)
    n = ((z_alpha + z_beta) / fisher_z) ** 2 + 3

    return math.ceil(n)

# Example usage: detect a correlation of 0.3 with 80% power at alpha=0.05.
n = sample_size_correlation(r=0.3, alpha=0.05, power=0.8)
print(f"Required sample size: {n}")  # prints "Required sample size: 85"

# 5. Sample Size for Time Series Analysis

__Info__: For time series analysis, the sample size often refers to the number of time points. A general rule of thumb is to have at least 50 observations for basic time series analysis.

In [None]:
def sample_size_time_series(frequency='daily', duration_years=1):
    """Return the number of time points for a series, with a floor of 50.

    Args:
        frequency: One of 'daily', 'weekly', 'monthly' or 'quarterly'.
            Any other value is treated as 'daily'.
        duration_years: Length of the series in years.

    Returns:
        Observations per year multiplied by ``duration_years``, or 50 when
        that product is smaller than 50.
    """
    minimum_points = 50  # rule-of-thumb floor for basic time series analysis
    points_per_year = {'daily': 365, 'weekly': 52, 'monthly': 12, 'quarterly': 4}

    yearly = points_per_year.get(frequency, points_per_year['daily'])
    total_points = yearly * duration_years

    if minimum_points > total_points:
        return minimum_points
    return total_points

# Example usage: two years of weekly observations (52 * 2 time points).
n = sample_size_time_series(frequency='weekly', duration_years=2)
print(f"Required number of time points: {n}")  # prints "Required number of time points: 104"

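# 6. Sample Size for Cluster Sampling

__Use case__: When you're sampling whole groups (clusters) rather than individuals (e.g., surveying every member of randomly selected teams). The sample size for estimating a proportion is multiplied by the design effect (__deff__) to compensate for the similarity of observations within the same cluster.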

In [None]:
import math

def sample_size_cluster(confidence_level, margin_error, p=0.5, deff=2):
    """Return the sample size for estimating a proportion with cluster sampling.

    Computes the simple-random-sample size ``z^2 * p * (1 - p) / e^2``,
    multiplies it by the design effect and rounds the result up to a whole
    number of observations.

    Args:
        confidence_level: Two-sided confidence level, strictly between 0
            and 1 (e.g. 0.95 for 95% confidence).
        margin_error: Acceptable margin of error as a proportion
            (e.g. 0.05 for +/-5%). Must be positive.
        p: Estimated proportion, between 0 and 1 inclusive. The default of
            0.5 maximises ``p * (1 - p)`` and therefore the sample size.
        deff: Design effect used to inflate the sample size. Must be
            positive.

    Returns:
        The required total sample size as an int.

    Raises:
        ValueError: If an argument is outside its valid range.
    """
    # Imported locally so the function does not rely on imports made elsewhere.
    from statistics import NormalDist

    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")
    if margin_error <= 0:
        raise ValueError("margin_error must be positive")
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1")
    if deff <= 0:
        raise ValueError("deff must be positive")

    # The conventional rounded z-scores are kept for the common levels so
    # results for those levels stay unchanged. Any other level is derived
    # from the standard normal distribution instead of silently being
    # treated as 95%.
    common_z_scores = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
    z_score = common_z_scores.get(confidence_level)
    if z_score is None:
        z_score = NormalDist().inv_cdf((1 + confidence_level) / 2)

    srs_size = (z_score**2 * p * (1 - p)) / (margin_error**2)
    sample_size = srs_size * deff
    return math.ceil(sample_size)

# Example usage: 95% confidence, a 5% margin of error and a design effect of 2.
n = sample_size_cluster(confidence_level=0.95, margin_error=0.05, deff=2)
print(f"Required sample size: {n}")  # prints "Required sample size: 769"