In [10]:
import re
import pandas as pd
import numpy as np
import scipy as sp
import sympy as sym
import sklearn as skl
from IPython.display import display, Math, Latex
import math
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from fractions import Fraction

# Inferential Statistics and Hypothesis

In [11]:
# Inferential statistics : using information we have about the sample to make inferences about the population.

# Proof vs Support: Inferential statistics does not provide proof, but it provides support. It tells us how much we can be confident that our conclusions are correct.

# Hypotheses for means and proportions: Hypotheses are statements about the population that we want to test.

# H_a (Alternative Hypothesis): The statement that we want to prove or disprove.
# H_0 (Null Hypothesis): The statement that we want to disprove.

In [12]:
# Significance level and type I and II errors: The significance level is the probability of rejecting the null hypothesis when it is true.

 # Test Statistics for One and Two-tailed Tests

In [13]:
'''
We’ve decided to give all of our friends a small box of
homemade cookies. 

We’ve already baked a variety of cookies and
randomly placed them into boxes. We want to make sure that each box is
close to 0.5 pounds, so we sample 10 boxes and find a mean of 0.54 pounds
and a standard deviation of s = 0.3 pounds. 

Assuming that the weights of
all the boxes are normally distributed, calculate the test statistic
'''

display(Math(r'\text{t = }\frac{\bar{x} - \mu_0}{\frac{s}{\sqrt{n}}}'))

x_bar = 0.54
mu_0 = 0.5
s = 0.3
n = 10

t = (x_bar - mu_0) / (s / np.sqrt(n))

print('Test statistic:', round(t, 2))

<IPython.core.display.Math object>

Test statistic: 0.42


# The p-value and rejecting the null hypothesis

In [14]:
'https://www.simplypsychology.org/p-value.html'

'https://www.simplypsychology.org/p-value.html'

# Hypothesis testing for the population proportion

In [15]:
# Question 1

'''
We’ve heard that 10 % of people are left-handed. We want to
verify this claim, so we collect a random sample of 500 people and find that
43 of them are left-handed. What can we conclude at a significance level of
α = 0.10?
'''

# p-value calculation

n = 500
a = 0.10
x = 43
H_0 = 0.10

p_hat = x / n

print('Sample proportion:', p_hat)

sigma_p_hat = np.sqrt((H_0 * (1 - H_0)) / n)

print('Standard error:', round(sigma_p_hat, 5))

z_test = (p_hat - H_0) / sigma_p_hat

print('Z-score:', round(z_test, 5))

'''

The critical z-values for 90 % confidence with a two-tailed test are
z = ± 1.65. Since the test statistic we found is negative (z = − 1.04), we’ll
compare it to z = − 1.65.

Our z-value of z = − 1.04 is not less than z = − 1.65, and therefore falls in the
region of acceptance, which means we’ll fail to reject the null hypothesis
and fail to conclude that the proportion of left-handed people is different
than 10 % .

'''



Sample proportion: 0.086
Standard error: 0.01342
Z-score: -1.0435


'\n\nThe critical z-values for 90 % confidence with a two-tailed test are\nz = ± 1.65. Since the test statistic we found is negative (z = − 1.04), we’ll\ncompare it to z = − 1.65.\n\nOur z-value of z = − 1.04 is not less than z = − 1.65, and therefore falls in the\nregion of acceptance, which means we’ll fail to reject the null hypothesis\nand fail to conclude that the proportion of left-handed people is different\nthan 10 % .\n\n'

In [16]:
# Question 2

H_0 = 80  # mean mL of fluid in a bottle
# H_a != 80 # mean mL of fluid in a bottle
# two tailed test

n = 40  # sample mL of fluid in a bottle
x_bar = 78  # sample mean mL of fluid in a bottle
s = 2.5  # sample std of x mL of fluid in a bottle

# confidence level is 95%
area_of_each_tail = (1 - 0.95) / 2

# critical values for a two-tailed test (point on the x axis at the start of the tail in both positive and negative directions
z_critical = sp.stats.norm.ppf(1 - area_of_each_tail)

# find reject region
z = (x_bar - H_0) / (s / np.sqrt(n))

# if |z| > z_critical, reject H_0
print('Reject H_0:', abs(z) > z_critical)


Reject H_0: True


In [17]:
# Question 3

# H_0 >= 2 # 2 or more years
H_0 = 2  # 2 years

n = 10  # sample size

x_bar = 2.8  # average years of service
s = 0.15  # std of years of service

# confidence level is 99%
area_of_each_tail = (1 - 0.99) / 2

# critical values for a two-tailed test (point on the x axis at the start of the tail in both positive and negative directions)
z_critical = sp.stats.norm.ppf(1 - area_of_each_tail)

# find reject region
z = (x_bar - H_0) / (s / np.sqrt(n))

# if |z| > z_critical, reject H_0
print('Reject H_0:', abs(z) > z_critical)

# using t-test since n < 30
t = (x_bar - H_0) / (s / np.sqrt(n))



Reject H_0: True


# Confidence Interval for the Difference of Means

In [18]:
# Question 1

'''
A professor is interested in whether exam scores differ between
two nearby colleges. He selects a simple random sample of 20 students
each from both colleges and finds a mean test score of 350 with a standard
deviation of 15 at the first college, and a mean test score of 390 with a
standard deviation of 30 at the second college. Assuming exam scores are
normally distributed at both colleges, find a 95 % confidence interval
around the difference in exam scores.
'''

n_1 = 20
n_2 = 20

x_bar_1 = 350
s_1 = 15

x_bar_2 = 390
s_2 = 30

s_pooled = np.sqrt(((n_1 - 1) * s_1 ** 2 + (n_2 - 1) * s_2 ** 2) / (n_1 + n_2 - 2))

t_critical = sp.stats.t.ppf(1 - 0.025, n_1 + n_2 - 2)

lower_bound = (x_bar_1 - x_bar_2) - t_critical * s_pooled * np.sqrt(1 / n_1 + 1 / n_2)
upper_bound = (x_bar_1 - x_bar_2) + t_critical * s_pooled * np.sqrt(1 / n_1 + 1 / n_2)

(lower_bound, upper_bound)


(np.float64(-55.182956229339766), np.float64(-24.817043770660234))

# Hypothesis testing for the difference of means