In [None]:
import re
import pandas as pd
import numpy as np
import scipy as sp
import sympy as sym
import sklearn as skl
from IPython.display import display, Math, Latex
import math
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from fractions import Fraction

# Observational Study
***

In [None]:
# Parameter: The corresponding characteristic of the population that is being studied.
# Observational Study: Looking at the information that is already present without adding or changing the population.
# Treatment: The action taken to improve or change a characteristic of the population.

In [None]:
# One-way table: A table that shows the frequency distribution of a single categorical variable from a sample.
# Two-way table: A table that shows the frequency distribution of two categorical variables from a sample.
# Correlation: A measure of the strength and direction of the relationship between two variables.
# Positive correlation: A relationship where one variable increases as the other variable increases.
# Negative correlation: A relationship where one variable increases as the other variable decreases.
# Causality: A relationship between cause and effect.
# Confounding variable: A variable that influences both the dependent and independent variables but is not accounted for in the study, so it can create a misleading association between them.

# Experimental Studies
***

In [None]:
# Experiment: A controlled and repeatable process that aims to establish causality between variables.
# Controlled variable: A variable that is held constant across all groups so that it cannot affect the outcome of the experiment.
# Treatment group: A group of subjects who receive a specific treatment.
# Placebo group: A group of subjects who receive a placebo (a control treatment).
# Explanatory variable: The independent variable that the researcher manipulates (or uses to group subjects) in order to explain changes in the response variable.
# Response variable: A variable that is measured to determine the outcome of the experiment.
# Blind (single-blind) experiment: The subjects do not know which treatment they receive, but the researchers do.
# Double-blind experiment: Neither the subjects nor the researchers know which treatment each subject receives.

# Matched pairs: A type of experimental design where two groups of subjects are matched on a set of characteristics to ensure that the effects of the treatment are not due to confounding variables.
# Blocking: When a researcher groups subjects into blocks based on a variable that is not related to the treatment, and the effects of the treatment are not confounded by this variable.

# Replication: The process of repeating an experiment to increase the reliability and validity of the results.

# Sampling Distribution of the Sample Mean
***

In [None]:
# Notation used in this section: mu and N describe the population (its mean
# and its size), x-bar and n describe a sample (its mean and its size).
display(Math(r'\text{Population: mean } \mu \text{, size } N'))
display(Math(r'\text{Sample: mean } \bar{x} \text{, size } n'))
print('\n')
# One population mean and three example sample means drawn from it.
display(Math(r'\mu \text{ = 10}'))
display(Math(r'\bar{x}_1 \text{ = 9.6}'))
display(Math(r'\bar{x}_2 \text{ = 11.2}'))
display(Math(r'\bar{x}_3 \text{ = 10.4}'))
print('\n')
display(Math(r'N = 30'))
display(Math(r'n = 3'))
print('\n')
# Number of possible ordered samples of size n drawn with replacement: N^n.
display(Math(r'N^n \text{ = 30}^3 \text{ = 27,000}'))

# Trailing None keeps the cell from echoing a value as its output.
None

In [None]:
# Central limit theorem: A fundamental result in probability theory that states that the distribution of a sample mean approaches a normal distribution as the sample size increases.
# Law of large numbers: A fundamental result in probability theory that states that the sample mean converges to the true population mean as the sample size increases.


In [None]:
# Question 1
# This cell holds a written answer only: the string below is a note on the
# Central Limit Theorem and is never used by any computation.

"""
The Central Limit Theorem tells us that, if the original population is
normally distributed, then the SDSM will also be normally distributed,
regardless of the sample size n that we use.
If the original population is not normally distributed, or if we don’t know
the shape of the population distribution, then the SDSM is only guaranteed
to be normally distributed when we use a sample size of at least n = 30.
"""
# Ending on None stops the notebook from echoing the string as cell output.
None

In [None]:
# Question 2

"""
A hospital finds that the average birth weight of a newborn is 7.5
lbs with a standard deviation of 0.4 lbs. What is the standard deviation of
the sampling distribution, if the hospital randomly selects 45 newborns to
test this claim?
"""

# Givens from the problem statement.
N_mean = 7.5    # population mean birth weight (lbs)
N_sd = 0.4      # population standard deviation (lbs)
n = 45          # number of newborns in the sample
Z_score = 1.96  # z-score for a 95% confidence level (not used in the calculation below)

# Background reminder on how a z-score is formed and what it measures.
print('\n')
display(Math(r'\text{Z-score => } \frac{X \ - \ \mu}{\sigma} \text{ => } \frac{\text{Sample Mean} - \text{Population Mean}}{\text{Standard Deviation}}'))
print('\n')
display(Math(
    r'\text{The Z-score describes the distance from the mean in terms of standard deviations}'
))
print('\n')

# Standard deviation of the sampling distribution (standard error): sigma / sqrt(n).
SD_sampling_distribution = np.divide(N_sd, np.sqrt(n))

print(f'The standard deviation of the sampling distribution is {round(SD_sampling_distribution, 5)}')


In [None]:
# Question 3
"""
A group of marathon runners have the following finishing times
in hours: 3.2, 3.5, 3.8, 4.2, 4.5. Given a sample of size 2 if we’re sampling with
replacement, find the standard error σ_x̄.
"""

times = [3.2, 3.5, 3.8, 4.2, 4.5]  # the full population of finishing times (hours)
n = 2                               # sample size
N = len(times)                      # population size, derived from the data

# Total number of possible ordered samples when sampling with replacement
# is N^n; the displayed numbers are filled in from N and n.
display(Math(rf'\text{{Sample Space: }}N^n \text{{ = }}{N}^{n} \text{{ = {N ** n}}}'))

In [None]:
# Table of every ordered two-runner sample mean: the entry in row `second`,
# column `first` is the mean of (first, second), rounded to 5 decimals.
# The table is built once and reused for both the printout and the DataFrame.
sample_means = [
    [round(float(np.mean([first, second])), 5) for first in times]
    for second in times
]

# Plain-text preview, one row per line.
for row_of_means in sample_means:
    print(row_of_means)

# Label both axes with the finishing times that were averaged.
df = pd.DataFrame(sample_means, columns=times, index=times)

In [None]:
# Display the DataFrame of sample means (rows and columns are labelled by finishing time).
df

In [None]:
# Total number of cells in the sample-means table (rows x columns).
n_table_rows, n_table_cols = df.shape
N_elements = n_table_rows * n_table_cols

In [None]:
# Count how many times each distinct sample mean appears in the table.
# Cells are visited row by row, so keys are inserted in first-seen order.
frequency_dict = {}
for value in df.to_numpy().ravel():
    frequency_dict[value] = frequency_dict.get(value, 0) + 1

print(f'Frequency distribution: {frequency_dict}')

In [None]:
# One row per distinct sample mean together with its count.
df_frequency = pd.DataFrame(list(frequency_dict.items()), columns=['Mean', 'Frequency'])

In [None]:
# Probability of each sample mean = its count divided by the number of cells in the table.
df_frequency['Probability'] = df_frequency['Frequency'].div(df.shape[0] * df.shape[1])

In [None]:
# Display the frequency table (columns: Mean, Frequency, Probability).
df_frequency

In [None]:
"""
mean of the sampling distribution of the sample
mean, μ¯x, where ¯xi is a given sample mean, P(¯xi) is the probability of that
particular sample mean occurring, and N is the number of samples
"""

# Probability-weighted mean of the sample means: sum over rows of Mean * Probability,
# rounded to 5 decimal places.
mean_of_sampling_distribution = round(sum(df_frequency['Mean'] * df_frequency['Probability']), 5)

In [None]:
# Display the mean of the sampling distribution computed in the previous cell.
mean_of_sampling_distribution

In [None]:
# Render the population-variance formula: the average squared deviation from mu over all N values.
display(Math(r'\text{Population Variance: } \sigma^2 \text{ = } \frac{\sum^N_{i=1} \ (x_i \ - \ \mu)^2}{N}'))

In [None]:
# Population variance of the finishing times, computed directly from the data:
# the mean of the squared deviations from the population mean. Using the data's
# own mean and length avoids relying on a rounded derived value and on whatever
# the global N is currently bound to.
population_mean = sum(times) / len(times)
pop_variance = sum((x - population_mean) ** 2 for x in times) / len(times)

In [None]:
# Display the population variance.
pop_variance

In [None]:
# Population standard deviation: square root of the population variance.
pop_std = np.sqrt(pop_variance)

In [None]:
# Standard error of the sample mean: population standard deviation divided by sqrt(n).
# The value is stored but not displayed by this cell.
# NOTE(review): n is a notebook-wide name that other cells rebind; this relies on
# the Question 3 value (n = 2) still being in effect - confirm run order.
pop_std_error = pop_std / math.sqrt(n)

# Conditions for Inference with the Sampling Distribution of the Sample Mean (SDSM)
***
- The sample must be randomly selected from the population.
- The sample size (n) must be greater than or equal to 30 for the Central Limit Theorem to hold.
- The sample must be independent and identically distributed (IID).

In [None]:
# Render the z-score of a sample mean: (x-bar - mu) divided by the standard error sigma / sqrt(n).
display(Math(
    r'Z-Score \ (z_{\bar{x}}) \ = \ \frac{\bar{x} - \mu}{\sigma_{\bar{x}}} \ = \ \frac{\bar{x} - \mu}{\frac{\sigma}{\sqrt{n}}}'))

In [None]:
# Question 1

"""
A restaurant wants to know the percentage of their customers
who order desert. The restaurant has 1,500 customers in one week and
finds by randomly surveying 100 of them that 35 order desert. What is the
standard error of the SDSP?
"""

N = 1500  # customers in the week (population size)
n = 100   # customers surveyed (sample size)
n_percentage_ordered_desert = 35 / n  # sample proportion that ordered dessert

# Independence: the sample is at most 10% of the population.
sample_share_percent = n / N * 100
print(f'Independence condition met: {sample_share_percent <= 10}')

# Successes: at least 5 expected.
expected_successes = n * n_percentage_ordered_desert
print(f'Successes condition met: {expected_successes >= 5}')

# Failures: at least 5 expected.
expected_failures = n * (1 - n_percentage_ordered_desert)
print(f'Failures condition met: {expected_failures >= 5}')

In [None]:
# Reference formulas for the sample proportion, its standard error and its z-score.
print('\n')
display(Math(
    r'\text{p-hat }(\hat{p}) \text{: The sample proportion}'
))
display(Math(
    r'\hat{p} \ = \ \frac{x}{n}'
))
display(Math(
    r'\text{x = the number of individuals in the sample with the studied characteristic}'
))
display(Math(
    r'\text{n = the size of the sample}'
))
print('\n')
display(Math(r'\sigma_{\hat{p}} \text{: The standard deviation of the sample proportion (standard error) = } \sqrt{\frac{p(1-p)}{n}}'))
print('\n')
display(Math(r'\text{z-score for sample proportion (p-hat): } z \ = \ \frac{\hat{p} - \mu_{\hat{p}}}{ \sigma_{\hat{p}}}'))

In [None]:
# Standard error of the sample proportion: sqrt(p_hat * (1 - p_hat) / n),
# using the surveyed proportion of dessert orders as p_hat.
variance_p_hat = n_percentage_ordered_desert * (1 - n_percentage_ordered_desert) / n
std_error_p_hat = np.sqrt(variance_p_hat)
print(f'The standard error of the SDSP is {round(std_error_p_hat, 5)}')

In [None]:
# Question 2

"""
A group of scientists is studying 10,000 manatees and finds that
20 % are calves. We want to verify their claim, but can’t conduct a study of
all 10,000, so we randomly sample just 500. What’s the probability that our
results are within 5 % of the scientists’ study?
"""

N = 10000                # population size (manatees studied)
p_calves = 0.20          # claimed population proportion of calves
n_calves = p_calves * N  # number of calves implied by the claim
n_sampled = 500          # our sample size
margin = 0.05            # "within 5 %": p-hat within +/- 0.05 of p

# Independence: the sample is at most 10% of the population.
print(f'Independence condition met: {n_sampled / N * 100 <= 10}')

# Successes: at least 5 expected.
print(f'Successes condition met: {n_sampled * p_calves >= 5}')

# Failures: at least 5 expected.
print(f'Failures condition met: {n_sampled * (1 - p_calves) >= 5}')

# The sampling distribution of p-hat is centred on the population proportion.
mu_phat = p_calves

std_error_phat = np.sqrt((p_calves * (1 - p_calves)) / n_sampled)
print(f'The standard error of the SDSP is {round(std_error_phat, 5)}')

# z-scores of the interval edges p +/- margin. The numerator is the margin
# itself; the sampling fraction n/N has nothing to do with the z-score.
z_score_high = round(margin / std_error_phat, 5)
z_score_low = -z_score_high

# P(z_low < Z < z_high) = Phi(z_high) - Phi(z_low). Adding the upper tail to
# Phi(z_high) instead would always give exactly 1.
probability_within_margin = sp.stats.norm.cdf(z_score_high) - sp.stats.norm.cdf(z_score_low)

print(
    f"The probability that our results are within 5% of the scientists' study is {round(probability_within_margin, 5)}")



In [None]:
# Upper-tail area: probability that a standard normal value exceeds z_score_high.
1 - sp.stats.norm.cdf(z_score_high)

In [None]:
# Standard normal CDF evaluated at -z_score_low. In the manatee cell z_score_low is
# the negative of z_score_high, so this is the CDF at the upper bound.
sp.stats.norm.cdf(-z_score_low)

In [None]:
# Probability that Z falls between the two bounds: Phi(z_high) - Phi(z_low).
# Computed from the variables so the cell stays correct when the inputs change.
round(sp.stats.norm.cdf(z_score_high) - sp.stats.norm.cdf(z_score_low), 5)

# Sampling Distribution of the Sample Proportion (SDSP)
***

In [None]:
# Definitions of the population proportion p and the sample proportion p-hat.
display(Math(
    r'\text{Population Proportion }(p) \text{: The probability of an event occurring in the entire population}'))
# p-hat is the ratio x / n, not the count x itself.
display(Math(
    r'\text{Sample Proportion (p-hat) or }(\hat{p}) \text{: } \frac{x}{n} \text{, where x is the number of subjects in the sample with the studied characteristic and n is the sample size}'))

## Finite Population Correction Factor (N-1)

In [None]:
# Standard error and variance of the sample proportion with the finite
# population correction (FPC) factor sqrt((N - n) / (N - 1)).
# n is the sample size and N the population size: the uncorrected term divides
# by n, and only the FPC factor involves N.
display(Math(
    r'\text{Standard Error of the Proportion (SE): } \sigma_{\hat{p}} \ = \  \sqrt{\frac{p(1 - p)}{n}} \ \sqrt{\frac{N - n}{N - 1}}'))
display(Math(r''))
display(Math(
    r'\text{Formula for the variance of the SDSP is: } \sigma^2_{\hat{p}} \text{ = } ( \frac{ p(1 - p) }{n} )  \ (\frac{N - n}{N - 1})'))
display(Math(r''))
display(Math(
    r'\text{If the population proportion is unknown, we approximate it with the sample proportion, and our formulas for standard error and variance with the FPC are: }'))
display(Math(r''))
# Same formulas with p replaced by p-hat; the FPC factor itself is unchanged.
display(Math(r'\sigma_{\hat{p}} \ = \ \sqrt{\frac{ \hat{p} (1 -  \hat{p} )}{n}} \ \sqrt{\frac{N - n}{N - 1}}'))
display(Math(r''))
display(Math(r'\sigma^2_{\hat{p}} \ = \ \frac{ \hat{p} (1 -  \hat{p} )}{n} \ \frac{N - n}{N - 1}'))

In [None]:
 # Question 2

"""
A population proportion is p = 0.7. Find the standard error of the
proportion for samples of size n = 100.
"""
# The denominator is the sample size n, matching the calculation below.
display(Math(r'\text{Standard Error of the Proportion (SE): } \sigma_{\hat{p}} \ = \  \sqrt{\frac{p(1 - p)}{n}}'))

p = 0.7  # population proportion
n = 100  # sample size

# SE = sqrt(p * (1 - p) / n)
std_error_p_hat = np.sqrt(p * (1 - p) / n)
print(f'The standard error of the SDSP is {round(std_error_p_hat, 5)}')


In [None]:
# Question 3

"""
A group of 3 siblings have the following eye color: blue, blue,
green. Find the mean μ̂p and standard error σ̂p of the sampling distribution
of the sample proportion for the proportion of siblings with blue eyes, if
we take 2-sibling samples, with replacement.
"""

N = 3  # population size (siblings)
n = 2  # sample size

Nn = N ** n  # number of possible ordered samples with replacement

# Sample proportion P(pi): each entry is [sample proportion of blue eyes, probability].
# Of the 9 ordered samples, 1 has no blue-eyed sibling, 4 have one and 4 have two.
P_pi = [
    [0, (1 / 9)],
    [(1 / 2), (4 / 9)],
    [1, (4 / 9)]
]

# Mean of the sampling distribution: sum of proportion * probability.
mu_p_hat = sum(sample_proportion * probability for sample_proportion, probability in P_pi)
print(f'The mean μ^p of the sampling distribution is {round(mu_p_hat, 5)}')

# Standard error: square root of the probability-weighted squared deviations from the mean.
sdsp = np.sqrt(sum((sample_proportion - mu_p_hat) ** 2 * probability for sample_proportion, probability in P_pi))
print(f'The standard error σ^p of the sampling distribution is {round(sdsp, 5)}')

# Conditions for inference with the SDSP
***
Random

Normal (large counts)

np >= 5
n(1-p) >= 5

In [None]:
"""
Conditions for inference with the SDSP
---------------------------------------------
Just like we did with the sampling distribution of the sample mean, we
have to meet specific sampling conditions in order to be able to use the
sampling distribution of the sample proportion to make inferences about
the population proportion.

The conditions for inference that apply to the sampling distribution of the
sample proportion are similar to the conditions we applied to the sampling
distribution of the sample mean.


Random
---------------------------------------------
Any sample we take needs to be a simple random sample. Often we’ll be
told in the problem that sampling was random.


Normal (large counts)
---------------------------------------------

The sampling distribution of the sample proportion can only be
guaranteed to be normal if np ≥ 5 and n(1 − p) ≥ 5, where n is the sample
size and p is the population proportion. If np ≥ 5 is true, it tells us that we
can expect to have at least 5 “successes” in the sample, and if n(1 − p) ≥ 5
is true, it tells us that we can expect to have at least 5 “failures.”

So if our sample size is n = 100 and the population proportion is p = 60 % ,
then we multiply 100 by 0.6 and by 1 − 0.6 = 0.4 to make sure both values
are at least 5.


"""


In [None]:
# Render the z-score of a sample proportion: (p-hat - p) divided by sqrt(p(1 - p) / n).
display(Math(r'\text{Z-score p-hat }(Z_{\hat{p}}) \text{:  } \frac{\hat{p} - p}{\sqrt{\frac{p(1 - p)}{n}}}'))

In [None]:
# Example

N = 1000       # population size
p = 0.4        # population proportion
n = 90         # sample size
margin = 0.05  # we want p-hat within +/- 0.05 of p

# 0.35 < p-hat < 0.45
low_limit = round(p - margin, 5)
print(f'low_limit {low_limit}')
high_limit = round(p + margin, 5)
print(f'high_limit {high_limit}')

# Normal (large counts): expect at least 5 successes ...
n_p = n * p
print(f'np {n_p} >= 5: {n_p >= 5}')

# ... and at least 5 failures (same >= threshold as the successes check).
n_1_minus_p = n * (1 - p)
print(f'n(1-p) {n_1_minus_p} >= 5: {n_1_minus_p >= 5}')

# Independence (the sample is at most 10% of the population)
print(f'Independence condition met: {n / N * 100 <= 10}')

# mean of p-hat is p
mu_p_hat = n_p / n
print(f'The mean μ^p of the sampling distribution is {round(mu_p_hat, 5)}')

# standard error of p-hat is sqrt(p(1-p)/n)
std_error_p_hat = math.sqrt(p * (1 - p) / n)
print(f'The standard error σ^p of the sampling distribution is {round(std_error_p_hat, 5)}')

# because we are sampling more than 5% of the total population, we must adjust
# using the Finite Population Correction factor sqrt((N - n) / (N - 1))
std_error_p_hat_corrected = std_error_p_hat * math.sqrt((N - n) / (N - 1))
print(f'The corrected standard error σ^p of the sampling distribution is {round(std_error_p_hat_corrected, 5)}')

# Z-score of the lower limit and the cumulative probability below it
z_score_low = (low_limit - p) / std_error_p_hat_corrected
print(f'Z-score p-hat low end: {round(z_score_low, 5)}')
print(f'The probability equivalent of our z-score is:  {sp.stats.norm.cdf(z_score_low)}')

# Z-score of the upper limit and the cumulative probability below it
z_score_high = (high_limit - p) / std_error_p_hat_corrected
print(f'Z-score p-hat high end: {round(z_score_high, 5)}')
print(f'The probability equivalent of our z-score is:  {sp.stats.norm.cdf(z_score_high)}')

# P(low < p-hat < high) = Phi(z_high) - Phi(z_low). Scale to percent before
# rounding so the printed value has no floating-point noise.
probability_within = sp.stats.norm.cdf(z_score_high) - sp.stats.norm.cdf(z_score_low)
print(
    f'The probability that our results are within 5% is: {round(probability_within * 100, 1)}%')




# T-Distribution
***

The t-distribution is used when the sample size is small (n < 30) or the population standard deviation is unknown. It is similar to the normal distribution, but with heavier tails. 

The t-distribution has one additional parameter called the degrees of freedom (df), which is equal to n - 1.


The t-distribution has the following properties:

1. It is symmetric about the mean μ.

2. Its shape depends on the degrees of freedom df. As df increases, the t-distribution approaches the normal distribution.

3. The t-distribution has heavier tails than the normal distribution. This means that values far from the mean are more likely than under the normal distribution; the probability of being more than a certain number of standard deviations from the mean decreases as the number of degrees of freedom increases.

In [None]:
# Words go inside \text{...} so they render upright with their spaces;
# in plain math mode the spaces between words are dropped.
display(Math(r'\text{Degrees of Freedom (df): } n - 1'))

In [None]:
# Question 2

'''
What value in the t-table is associated with a sample size n = 15
and an upper-tail probability of 0.025?
'''

n = 15
# Degrees of freedom. NOTE: this rebinds `df`, which an earlier cell uses for a DataFrame.
df = n - 1

# Using scipy to find the value in the t-table: an upper-tail area of 0.025
# corresponds to the quantile at cumulative probability 1 - 0.025.
cumulative_probability = 1 - 0.025
t_value = sp.stats.t.ppf(cumulative_probability, df)
t_table_message = f'The value in the t-table associated with a sample size n = 15 and an upper-tail probability of 0.025 is {round(t_value, 5)}'
print(t_table_message)

In [None]:
# Question 3

'''
Find the upper-tail probability associated with a confidence
level of 99 % .
'''

confidence_level = 0.99

# alpha is the total area outside the interval; half of it lies in each tail.
a = 1 - confidence_level
upper_tail_probability = a / 2

rounded_upper_tail = round(upper_tail_probability, 5)
print(f'The upper-tail probability associated with a confidence level of 99% is {rounded_upper_tail}')

# Confidence interval for the mean
***


## Point and Interval Estimates

In [None]:
# Point Estimate: In statistics, point estimation involves the use of sample data to calculate a single value which is to serve as a "best guess" or "best estimate" of an unknown population parameter.
# Interval Estimate: In statistics, interval estimation involves the use of sample data to calculate an interval that covers a population parameter with a certain level of confidence.
# Confidence Level: The probability that the calculated interval estimate will contain the true population parameter.

## Alpha a and the Region of Rejection

In [None]:
# Alpha (a): The probability of rejecting the null hypothesis (H0) when it is true. It is a measure of the risk of Type I error.
# Level of Significance (α): The probability of rejecting the null hypothesis when it is true, expressed as a decimal.

# For a 90% confidence level, z = +/- 1.645
# For a 95% confidence level, z = +/- 1.96
# For a 99% confidence level, z = +/- 2.576

## The Confidence Interval when the Population Standard Deviation is Known

In [None]:
# Confidence-interval formulas for the mean when sigma is known, rendered in order:
# the interval itself, the critical-value notation, and the standard-error form.
ci_known_sigma_formulas = (
    r'Confidence \ Interval \ (a,b): \ \bar{x} \ \pm \ z^{*} \frac{\sigma}{\sqrt{n}}',
    r'Critical \ Value \  \  =  (z^{*}) \ or \ z_{a/2}',
    r'\text{Sampling Distribution of the Sample Mean (std error): } \ \sigma_{\bar{x}} \ = \ \sigma/\sqrt{n} \text{ or } \ (a,b) \ = \ \bar{x} \ \pm \ z^{*} \sigma_{\bar{x}}',
)
display(*(Math(formula) for formula in ci_known_sigma_formulas))

## The Confidence Interval for the Mean

In [None]:
# Question 1

'''
The height of students in our school is normally distributed with
a standard deviation of σ = 4 inches. We sample 50 of our classmates (with
replacement) and get a sample mean of ¯x = 66 inches. What is the
confidence interval for a confidence level of 95 % ?
'''

n = 50                   # sample size
sigma = 4                # population standard deviation (inches)
mean_x = 66              # sample mean (inches)
confidence_level = 0.95
z_star = 1.96            # critical value for a 95% confidence level

display(Math(r'\text{Confidence Level of 95% is associated with z-scores of z = } \pm 1.96'))

display(Math(r'\text{Confidence Interval (a,b): } \bar{x} \ \pm \ z^{*} \frac{\sigma}{\sqrt{n}}'))

# Half-width of the interval: z* times the standard error sigma / sqrt(n).
half_width = z_star * (sigma / math.sqrt(n))

# a is the lower bound and b the upper bound, so (a, b) prints in ascending order.
a = mean_x - half_width
b = mean_x + half_width

print(f'The confidence interval for a confidence level of 95% is ({round(a, 2)}, {round(b, 2)}) inches')


In [None]:
# Question 2

'''
The weight of chickens on a farm is normally distributed with a
standard deviation of σ = 3.5 ounces. What is the smallest sample we can
take if we want a margin of error of ±2.5 ounces, and we want to be 99 %
confident?
'''

confidence_level = 0.99
sigma = 3.5            # population standard deviation (ounces)
margin_of_error = 2.5  # desired margin of error (ounces)
z_star = 2.576         # critical value for a 99% confidence level, as displayed below

display(Math(r'\text{Confidence Level of 99% is associated with z-scores of z = } \pm 2.576'))

# solve the margin of error for n:  ME = z* sigma / sqrt(n)  =>  n = (z* sigma / ME)^2
display(Math(r'\text{Margin of Error: } \text{ME = } z^{*} \frac{\sigma}{\sqrt{n}}'))

# Exact (fractional) sample size that gives the requested margin of error.
# Truncating z* to 2.57 puts this just under 13 and yields a sample that is
# one too small; with 2.576 it is just over 13.
n_exact = ((z_star * sigma) / margin_of_error) ** 2

# A sample size is a whole number, so round up to guarantee the margin is met.
n = math.ceil(n_exact)

print(
    f'The smallest sample size we can take with a margin of error of 2.5 ounces and a confidence level of 99% is {n}')

In [None]:
# Question 3

'''
We want to know the mean number of daylight hours (the time
between sunrise and sunset) in a day in our city over the course of a year.
We take a random sample of 30 days throughout the year and get a
sample mean of x = 13.15 hours and a sample standard deviation of s = 0.85
hours. What is the confidence interval for a confidence level of 90 % ?
'''

mean_x = 13.15  # sample mean (hours)
n = 30          # sample size
sigma = 0.85    # sample standard deviation s (hours); the population sigma is unknown
confidence_level = 0.90

# Because population standard deviation is unknown, we have to use the t-distribution instead of the z-distribution.

# A 90 % confidence level with n − 1 = 30 − 1 = 29 degrees of freedom is associated with t-scores of t = ± 1.699.
t_star = 1.699

# Half-width of the interval: t* times the estimated standard error s / sqrt(n).
half_width = t_star * (sigma / math.sqrt(n))

# a is the lower bound and b the upper bound, so (a, b) prints in ascending order.
a = mean_x - half_width
b = mean_x + half_width

print(f'The confidence interval for a confidence level of 90% is ({round(a, 2)}, {round(b, 2)}) hours')

## Confidence Interval for the Proportion

In [None]:
# Reference formulas for the confidence interval of a proportion.
# The numerator label needs \text{...}; without the backslash it is typeset
# as the product of the letters t, e, x, t.
display(Math(r'\hat{p} \text{ = } \frac{\text{Number of subjects that meet our criteria}}{n}'))
display(Math(r'Confidence \ Interval \ (a,b): \hat{p} \ \pm \ z^{*} \sqrt{\frac{\hat{p}(1-\hat{p})}{n}}'))
display(Math(r'z^{*} \text{ is the z-score for a given confidence level from z-tables}'))
display(Math(r'\hat{p} \text{ is the sample proportion}'))
display(Math(r'n \text{ is the sample size}'))
# Large-counts condition required for the normal approximation.
display(Math(r'\text{In order to use this formula, we need to have } n\hat{p} \geq 5 \text{ and } n(1-\hat{p}) \geq 5'))
display(
    Math(r'\text{Finite population correction factor applies without replacement of more than 5% of the population}'))
# Interval with the finite population correction factor applied.
display(Math(
    r'\text{Confidence Interval for the Population Proportion: (a,b) = } \hat{p} \pm z^{*}\sqrt{\frac{\hat{p}(1-\hat{p})}{n}} \sqrt{\frac{\text{N - n}}{\text{N - 1}}}'))

In [None]:
'''
A study shows that 78 % of patients who try a new medication
for migraines feel better within 30 minutes of taking the medicine. If the
study involved 120 patients, construct a 95 % confidence interval for the
proportion of patients who feel better within 30 minutes of taking the
medicine
'''

n = 120                  # patients in the study
p_hat = 0.78             # sample proportion who felt better
confidence_level = 0.95

# Two-sided critical value: the standard normal quantile that leaves
# (1 - confidence_level) / 2 in the upper tail.
z_star = sp.stats.norm.ppf(1 - (1 - confidence_level) / 2)

# Report the critical value only. p_hat - z_star is not an interval bound,
# because the standard error has not been applied yet.
print(f'The critical value z* for a 95% confidence level is {round(z_star, 3)}')

In [None]:
# Confidence interval for the proportion: p_hat +/- z* * sqrt(p_hat * (1 - p_hat) / n).
# The two bounds are written out separately: a is the lower one, b the upper one.
a = p_hat - z_star * math.sqrt((p_hat * (1 - p_hat)) / n)
b = p_hat + z_star * math.sqrt((p_hat * (1 - p_hat)) / n)

lower_bound_text = round(a, 4)
upper_bound_text = round(b, 4)
print(f'The 95% confidence interval for the proportion of patients who feel better within 30 minutes of taking the medication is ({lower_bound_text}, {upper_bound_text})')

In [None]:
# Question 2

'''
A study shows that 243 of 500 randomly selected households
were using a family member to care for their young children. Build a 90 %
confidence interval for the proportion of households using a family
member to care for young children.
'''

n = 243
