# Estimation

In [114]:
import numpy as np
from scipy.stats import norm, chi2
import pandas as pd

## Confidence Interval

In [27]:
# Confidence interval for a proportion, using the normal approximation.
m = 1 - 0.18  # Point estimate; here the "mean" is a proportion

# Standard deviation of a proportion: sqrt(p * (1 - p)).
sd = (m * (1 - m)) ** 0.5
# For a numeric mean, set the standard deviation directly instead
# (e.g. sd = 96.9), or derive it from a known variance:
# var = 96.9
# sd = var ** 0.5

n = 272  # Sample size
se = sd / (n ** 0.5)  # Standard error of the estimate

# NOTE: despite its name, `alpha` holds the confidence level (0.96), not the
# significance level; it is passed as the first argument of norm.interval.
alpha = 0.96
ci_min, ci_max = norm.interval(alpha, loc=m, scale=se)

print(f"Confidence interval: [{ci_min}, {ci_max}]")


Confidence interval: [0.7721583584511084, 0.8678416415488918]


In [122]:
# Chi-square confidence interval for a variance and the matching standard deviation.

# Sample summary statistics
n = 106      # number of observations
m = 183.8    # sample mean (not used by the variance interval below)
var = 132.9  # sample variance

df = n - 1             # degrees of freedom
se = (var / n) ** 0.5  # standard error of the mean (not used below)

alpha = 0.90  # confidence level handed to chi2.interval

# Lower and upper chi-square quantiles for the chosen confidence level
chi2_min, chi2_max = chi2.interval(alpha, df=df)

# Dividing by the upper quantile gives the lower variance bound, and vice versa
scaled_var = df * var
ci_var_min, ci_var_max = scaled_var / chi2_max, scaled_var / chi2_min

print(f"Confidence interval for variance: [{ci_var_min}, {ci_var_max}]")
print(f"Confidence interval for sd:[{ci_var_min ** 0.5}, {ci_var_max ** 0.5}]")

Confidence interval for variance: [107.41009561962488, 169.4458517021722]
Confidence interval for sd:[10.363884195591192, 13.017136847332143]


## Margin of Error
### For Proportion Estimation

In [83]:
# Sample size needed to estimate a proportion within a desired margin of error.
p = 52/200  # Proportion of population

alpha = 0.95  # Confidence level
z = norm.ppf((1 + alpha) / 2)  # Two-sided critical value of the standard normal

margin_error = 0.03  # Desired margin of error

# Margin of error: MOE = z * sqrt(p * (1 - p) / n). Solving for n gives the
# expression below. The result is fractional; round it up to the next whole
# number when an actual count of observations is required.
sample_size = (z ** 2 * p * (1 - p)) / margin_error ** 2
print(f"Sample size: {sample_size}")

Sample size: 821.2185301128332


## Delta in Confidence Interval
### For Proportion Estimation

In [124]:
# Sample size for which the confidence interval of a proportion has total width `delta`.
p = 0.4186   # Proportion of population
n = 474      # Sample size (not used in the calculation below)
delta = 0.1  # Desired interval width (upper bound minus lower bound)

alpha = 0.99  # Confidence level
z = norm.ppf((1 + alpha) / 2)  # Two-sided critical value

sd = (p * (1 - p)) ** 0.5  # Standard deviation of a proportion

# width = 2 * z * sd / sqrt(n)  =>  n = (2 * z * sd / width) ** 2
width_at_unit_n = 2 * z * sd
sample_size = (width_at_unit_n / delta)**2
print(f"Sample size: {sample_size}")

Sample size: 645.9046363091203


## Sample size for a target interval width, estimated from a data file

In [112]:
# Sample size for a target confidence-interval width, with the spread estimated from a CSV file.
# NOTE(review): decimal='\n' is an unusual value for read_csv's `decimal` option; confirm it is intentional.
df = pd.read_csv('../../data/dataset.csv', decimal='\n')

# Column holding the observations; change 'x' to match the file being analysed.
values = np.array(df['x'])

# Sample statistics (ddof=1 selects the sample standard deviation)
mean = values.mean()
sd = values.std(ddof=1)

delta = 10    # Desired interval width (upper bound minus lower bound)
alpha = 0.99  # Confidence level
z = norm.ppf((1 + alpha) / 2)  # Two-sided critical value

# width = 2 * z * sd / sqrt(n)  =>  n = (2 * z * sd / width) ** 2
width_at_unit_n = 2 * z * sd
sample_size = (width_at_unit_n / delta)**2
print(f"Sample size: {sample_size}")

Sample size: 548.8715053720226


In [None]:
# Share of 'high' quality ratings in Camden: point estimate, standard error,
# confidence interval, and the sample size needed for a target interval width.
url = "https://raw.githubusercontent.com/kflisikowski/ds/refs/heads/master/data_pizza.csv"
data_pizza = pd.read_csv(url)

# Select from the frame loaded above (data_pizza), not from `df`, which is
# whatever an earlier cell happened to leave behind.
# Despite its name, del_time_a holds the 'quality' values of the Camden rows.
del_time_a = data_pizza.loc[data_pizza['area'] == 'Camden', 'quality']

n_camden = del_time_a.shape[0]
if n_camden == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError below.
    raise ValueError("No rows with area == 'Camden' found in data_pizza")

proportion = del_time_a[del_time_a == 'high'].shape[0] / n_camden
print("prop:", proportion * 100)

sd = np.sqrt(proportion * (1-proportion))  # Standard deviation of a proportion

se = sd / np.sqrt(n_camden)  # Standard error of the estimated proportion

print(f"bias: {se}")

# Standard error relative to the estimate, in percent
print(se / proportion * 100)

alpha = 0.99  # Confidence level
z = norm.ppf((1 + alpha) / 2)  # Two-sided critical value
ci_min, ci_max = norm.interval(alpha, loc=proportion, scale=se)
print(f"Confidence Interval: [{ci_min * 100}, {ci_max * 100}]")

delta = 0.1  # Desired interval width (upper bound minus lower bound)

# width = 2 * z * sd / sqrt(n)  =>  n = (2 * z * sd / width) ** 2
sample_size = ((2 * z * sd) / delta)**2
print(f"Sample size: {sample_size}")