In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

## T-test, one sample
Small sample size (n < 30), unknown population standard deviation, population variance.

1. Data points are independent

2. Sample values are recored and taken accurately

3. Based on Student-T test distribution (t-table)

4. Critical value t(df, $\alpha$), commonly significant level $\alpha$= 0.05, 0.01, or 0.1

$t_{score} = \frac{\bar{x} - \mu}{SE} = \frac{\bar{x} - \mu}{\sigma/\sqrt{n}}$

Where:

*) df: degree of freedom df= n - 1
    
*) n: sample size
 
*) $\bar{x}$: the sample mean
    
*) $\mu$: the hypothesis mean
    
*) $\sigma$: standard deviation (std) $\sigma = \sqrt{\frac{1}{n - 1}\sum_{i=1}^n(x_i - \bar{x})^2}$
    
*) SE: standard error

## State null hypothesis (H0)
Null hypothesis is allways the accepted fact.

Testing the null-hypothesis, when it is going wrong, what happend?

The altenative hypothesis (H1) is the researcher's beliefs.

## Finding critical value (cv) at the significant level
Depend on the $\alpha$ level, we lookup on the t-table or z-table distribution to know exactly value to test the null-hypothesis. 

For $\alpha$ level at 0.05 (meaning 5% of region under normal distribution), we've got (50% - 5%) or (100% - 5%) of the confident interval (CI) to accept the null-hypothesis

Mapping the CI value on the t-table or z-table to find the t or z critical value.

## Calculate the statistical value
Depend on one-sample or two-sample of the t or z test, applying the formula for each test

$z_{score} = \frac{\bar{X} - \mu}{\sigma/\sqrt{n}}$

$t_{score} = \frac{\bar{x} - \mu}{s/\sqrt{n}}$

*) s: the sample std $s = \sqrt{\frac{1}{n - 1}\sum_{i=1}^n(x_i - \bar{x})^2}$

## Accept or Reject null-hypothesis
Depend on the state of H0:

*) left-tailed test: H1 $\mu < \mu_0$ : z (or t) < -cv : Reject H0

*) right-tailed test: H1 $\mu > \mu_0$ : z (or t) > cv : Reject H0

*) two-tailed test: H1 $\mu \neq \mu_0$ : -cv $\leq$ z (or t) $\leq$ cv : Accept H0

In [2]:
## T-test, one sample
x = [138, 130, 120, 130, 108, 135, 134, 122, 115, 118]
mu = 130
x_bar = np.mean(x)
s = np.std(x, ddof=1)     # std of sample with size (df) N - 1
se = s / np.sqrt(len(x))
t = (x_bar - mu)/se
cv = -1.38
# because left-tail test, t < cv to reject
if t < cv:
    print('reject H0')
else:
    print('accept H0')
print(t)
t_val, p_val = stats.ttest_1samp(x, mu)#, alternative='less')  # greater, two-sided (default)
print(t_val, p_val)

reject H0
-1.6063235132929476
-1.6063235132929476 0.1426652014799611


In [3]:
## Z-test, one sample
## H0: x_bar - mu >= 0
## H1: x_bar - mu < 0 (left-tail test)
mu = 80
sigma = 20
x_bar = 75
n = 60
z_cv = -1.645   # 5% of alpha level
se = sigma / np.sqrt(n)
z = (x_bar - mu) / se
# left-tail test
if z < z_cv:
    print('reject H0')
else:
    print('accept H0')
print(z)

reject H0
-1.9364916731037085


In [4]:
## T-test, two dependent samples (paired test)
# H0: mu_d = 0
# H1: mu_d != 0
x1 = [17, 17, 15, 19, 18, 14, 27, 20, 12, 21, 20, 24, 17, 17, 17]
x2 = [22, 21, 21, 26, 20, 14, 31, 18, 22, 20, 27, 23, 15, 24, 24]
mu = 0
x_dif = [v2 - v1 for v1, v2 in zip(x1, x2)]
x_bar = np.mean(x_dif)
s_dif = np.std(x_dif, ddof=1)   # std of sampling dist.
se = s_dif / np.sqrt(len(x1))
t = (x_bar - mu) / se
# print(x_dif, x_bar, s_dif, se)
print(t)
t_val, p_val = stats.ttest_rel(x1, x2)
print(t_val, p_val)
alpha = 0.05    # if p_value > alpha value, we cannot reject H0
if p_val < alpha:
    print('reject H0')
else:
    print('accept H0')

3.4860662073428483
-3.4860662073428483 0.0036344776518329644
reject H0


In [5]:
## T-test, two independent samples (unpaired test) and unequal variances
print('unpaired test...')
x1_bar = np.mean(x1)
x2_bar = np.mean(x2)
var_s1 = np.var(x1, ddof=1)
var_s2 = np.var(x2, ddof=1)
se = np.sqrt(var_s1/len(x1) + var_s2/len(x2))
t = (x1_bar - x2_bar - mu) / se
print(t, var_s1, var_s2)
t_val, p_val = stats.ttest_ind(x1, x2, equal_var=False, alternative='two-sided')
alpha = 0.05
if p_val < alpha:
    print('reject H0')
else:
    print('accept H0')
print(t_val, p_val)

unpaired test...
-2.3641605822510674 14.238095238095237 19.266666666666662
reject H0
-2.3641605822510674 0.025410959846806334


In [6]:
## T-test, 2 independent sample (unpaired), (equal variances)
x1 = [13, 17, 19, 11, 20, 15, 18, 9, 12, 16]
x2 = [12, 8, 6, 16, 12, 14, 10, 18, 4, 11]
mu = 0
x1_bar = np.mean(x1)
x2_bar = np.mean(x2)
df1 = len(x1) - 1
df2 = len(x2) - 1
s1 = np.var(x1, ddof=1)
s2 = np.var(x2, ddof=1)
# because testing with assuming equal variances, the pooled variance s**2
# we using std s to calculate
s = np.sqrt((df1*s1 + df2*s2)/(df1 + df2))
t = (x1_bar - x2_bar - mu)/(s * np.sqrt(1/len(x1) + 1/len(x2)))
print(t)
t_val, p_val = stats.ttest_ind(x1, x2, equal_var=True, alternative='two-sided')
print(t_val, p_val)
alpha = 0.05
if p_val <= alpha:
    print('reject H0')
else:
    print('accept H0')

2.176767731525196
2.176767731525196 0.04305271652983949
reject H0


In [7]:
## Z-test, by calc probabilities
## give sweets to students improving education by more than 10%
## H0: p_s - p <= 0
## H1: p_s - p > 0 (right-tail test)
p1 = 0.7
p2 = 0.4
n1 = 100
n2 = 100
p = (n1*p1 + n2*p2)/(n1 + n2)
se = np.sqrt(p*(1 - p)*(1/n1 + 1/n2))
z = (p1 - p2) / se
print(p, se, z)
# z_critical at alpha = 10%, lookup on z-table for CI = 50% - 10%
z_cv = 1.29
if z > z_cv:
    print('reject H0')
else:
    print('accept h0')

0.55 0.07035623639735145 4.264014327112208
reject H0


In [8]:
## Z-test, two samples proportions
# drug A: 41 out of 195
# drug B: 351 out of 605
# significant level: 5%. are the 2 drug comparable
# H0: proportions are the same
# H1: proportions are different
x1 = 41
x2 = 351
n1 = 195
n2 = 605
mu = 0
cv = 1.96
p1 = x1/n1
p2 = x2/n2
p = (x1 + x2) / (n1 + n2)
se = np.sqrt(p * (1 - p) * (1/n1 + 1/n2))
z = np.abs((p1 - p2 - mu) / se)
print(se, z)
if z > cv:
    print('reject H0')
else:
    print('accept H0')

0.041165474766825305 8.985900954503084
reject H0


In [9]:
## p-value for Z-test, two-tailed test
## H0: mu = x_bar
## H1: mu != x_bar
mu = 168
x_bar = 169.5
n = 36
sigma = 3.9
se = sigma / np.sqrt(36)
z = (x_bar - mu) / se
# based on z value to map it on the z-table to lookup prob. of reject region
print(z)
# got z = 2.30 and area of z(2.3,0.00) = 0.9893 (whole area from the left to the z value)
# rejection region = 1 - 0.9893 = 0.0107
# because this is the 2-tailed test, the rejection region on the left and right of bell curve
# p_value = 0.0107 * 2 = 0.0214
# p_value < critical value (alpha = 0.05)
# reject the H0

2.3076923076923075
