# Setup

## Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Math
from sympy import latex, symbols, diff
import numpy as np
pd.set_option('display.precision', 2)
np.set_printoptions(precision=2)
random.seed(1738)

## Functions

In [2]:
def round_values(numbers, precision):
    """
    Rounds a list of numbers to a given precision.

    Parameters:
    numbers (list of float): The list of numbers to round.
    precision (int): The number of decimal places to round each number to.

    Returns:
    list of float: The list of rounded numbers.
    """
    return [round(num, precision) for num in numbers]

# Population Proportion

## One Proportion

### Estimating Population Proportion

A sample of 659 parents with a toddler was taken and asked if they used a car seat for all travel with their toddler. 540 parents responded 'yes' to this question. Calcualte the proportion and 95% CIs

#### Normal Approximation Method

$$
\hat{p} \pm z \sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}
$$

Breaking it down:

* $\hat{p}$ represents the sample proportion.
* $z$ is the z-score corresponding to the confidence level (1.96 for 95% confidence).
* $n$ is the sample size.
* The square root term $\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}$ calculates the standard error of the sample proportion.

In [3]:

# Given data
count = 540  # number of parents who responded 'yes'
nobs = 659   # total number of parents asked
# Calculate the proportion
proportion = count / nobs
# Proportion as per the new information
p_hat = proportion
# Total number of parents surveyed
n = nobs
# Z-score for 95% confidence
z = 1.96

# Calculate the standard error
se = np.sqrt(p_hat * (1 - p_hat) / n)

# Calculate the 95% confidence interval using the given formula
ci_lower = p_hat - z * se
ci_upper = p_hat + z * se

round_values([p_hat, ci_lower, ci_upper], 2)

[0.82, 0.79, 0.85]

#### Exact (Binomial) Method

In [4]:
from statsmodels.stats.proportion import proportion_confint

# Calculate the 95% confidence interval for the proportion
# Method 'binom_test' is used for exact CI calculation appropriate for binomial distribution
ci_low, ci_upp = proportion_confint(count, nobs, alpha=0.05, method='binom_test')

round_values([proportion, ci_low, ci_upp], 2)

[0.82, 0.79, 0.85]

### Sample Size Determination

#### Normal Approximation Method

$$MoE = \frac{{1}}{\sqrt{n}}$$

In [5]:
moe = 1/np.sqrt(232)
moe

0.06565321642986127

##### For 95% Confidence

In [6]:
# Margin of Error (MoE)
MoE = 0.03

# Calculate the sample size using the simplified formula from the image
# n = (1 / MoE)^2
sample_size = (1 / MoE)**2

# The sample size should be a whole number
sample_size = int(round(sample_size))

sample_size


1111

In [7]:
# Margin of Error (MoE)
MoE = 0.03

# Calculate the sample size using the simplified formula from the image
# n = (1 / MoE)^2
sample_size = (1 / MoE)**2

# The sample size should be a whole number
sample_size = int(round(sample_size))

sample_size

1111

##### For 99% Confidence


$$
\begin{align*}
& \hat{p} \pm Z^* \cdot \frac{1}{2\sqrt{n}} \\
& MoE = Z^* \cdot \frac{1}{2\sqrt{n}} \\
& n = \left( \frac{Z^*}{2 \cdot MoE} \right)^2 \\
\end{align*}
$$



In [8]:
# Values provided in the image
Z_star = 2.576  # Z-score for 99% confidence
MoE = 0.03      # Margin of Error

# Sample size calculation
n = (Z_star / (2 * MoE)) ** 2

# Since sample size must be a whole number, round up
import math
n_rounded = math.ceil(n)

print(n_rounded)


1844


#### Exact Method
$n = \left( \frac{Z_{\alpha/2} \times \sqrt{p(1-p)}}{MoE} \right)^2$

In [9]:
from scipy.stats import norm

# Desired margin of error
margin_of_error = 0.04

# For a 95% confidence interval, the z-score is approximately 1.96
z_score = norm.ppf(0.975)

# Since the population proportion is unknown, we use the most conservative estimate, p = 0.5
p = 0.5
q = 1 - p

# Calculate the sample size using the formula for margin of error
sample_size = (z_score ** 2 * p * q) / (margin_of_error ** 2)

# Since sample size must be a whole number, we round up
sample_size = int(sample_size) + 1 if sample_size % 1 > 0 else int(sample_size)

sample_size


601

What minimum sample size does the researcher need in order to create a 98% conservative confidence interval with a margin of error of no more than 3%?   

In [10]:
from scipy.stats import norm

# Desired margin of error
margin_of_error = 0.03

# For a 98% confidence interval, the z-score is approximately 2.33 (found using the ppf function)
z_score = norm.ppf(0.99)

# Since the population proportion is unknown, we use the most conservative estimate, p = 0.5
p = 0.5
q = 1 - p

# Calculate the sample size using the formula for margin of error
sample_size = (z_score ** 2 * p * q) / (margin_of_error ** 2)

# Since sample size must be a whole number, we round up
sample_size = int(sample_size) + (sample_size % 1 > 0)

sample_size

1504

## Two Proportions

### Estimating a Difference in Population Proportions
$$
\text{Best Estimate} \pm \text{MoE} \\
\hat{p}_1 - \hat{p}_2 \pm 1.96 \cdot \sqrt{\frac{\hat{p}_1(1-\hat{p}_1)}{n_1} + \frac{\hat{p}_2(1-\hat{p}_2)}{n_2}}
$$

In [11]:
import numpy as np
from scipy.stats import norm

# Given sample sizes and number of successes
n1, x1 = 988, 543 # sample of white children
n2, x2 = 247, 91  # sample of black children


# Calculate the sample proportions
p1 = x1 / n1
p2 = x2 / n2

# Calculate the standard error for the difference in proportions
se = np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)

# Calculate the z-score for 95% confidence
z = norm.ppf(0.975)
#z = 1.96

# Calculate the margin of error
moE = z * se

# Calculate the confidence interval
ci_lower = (p1 - p2) - moE
ci_upper = (p1 - p2) + moE
#ci_lower = 0.18 - moE
#ci_upper = 0.18 + moE

round_values([p1 - p2, p1, p2, z,moE, ci_lower, ci_upper], 4)


[0.1812, 0.5496, 0.3684, 1.96, 0.0677, 0.1135, 0.2489]

Interpreting the Confidence Interval
"range of reasonable values for our parameter"
With 95% confidence, the population proportion of parents with white children who have taken swimming lessons is 11.35 to 24.89% higher than the population proportion of parents with black children who have taken swimming lessons.

# Population Means

## One Mean
$$
\bar{x} \pm t^* \left(\frac{s}{\sqrt{n}}\right)
$$

Mean = 82.48 inches

Standard Deviation = 15.06 inches

$n$ = 25 observations > $t*$ = 2.064

$$
\begin{align*}
\bar{x} \pm t^* \left( \frac{s}{\sqrt{n}} \right) &= 82.48 \pm 2.064 \left( \frac{15.06}{\sqrt{25}} \right) \\
&= 82.48 \pm 2.064(3.012) \\
&= 82.48 \pm 6.22 \\
& \text{(76.26 inches, 88.70 inches)} 
\end{align*}
$$


In [12]:
from scipy.stats import t

# Define your confidence level and degrees of freedom
sample_size = 25
confidence_level = 0.95  # for a 95% confidence interval
degrees_of_freedom = sample_size - 1  # degrees of freedom

# Calculate the t-score
# The ppf function returns the inverse of the CDF (Cumulative Distribution Function)
# The argument for the ppf function should be 1 minus half of the alpha level (1 - alpha/2)
# because the t-distribution is symmetric, and we want the cumulative area from -t to t.
t_score = t.ppf((1 + confidence_level) / 2, degrees_of_freedom)
#t_score = 2.064

print(f"t-score for {confidence_level*100}% confidence level and {degrees_of_freedom} degrees of freedom: {t_score}")


t-score for 95.0% confidence level and 24 degrees of freedom: 2.0638985616280205


In [13]:
# Given values for the mean, t-score, standard deviation, and sample size
mean = 82.48

standard_deviation = 15.06


# Calculate the margin of error
margin_of_error = t_score * (standard_deviation / (sample_size ** 0.5))

# Calculate the confidence interval
lower_bound = mean - margin_of_error
upper_bound = mean + margin_of_error

print(f"Margin of Error: {margin_of_error:.2f}")
print(f"Mean (95% CI): {mean:.2f} ({lower_bound:.2f} to {upper_bound:.2f})")


Margin of Error: 6.22
Mean (95% CI): 82.48 (76.26 to 88.70)


# End