# Hypothesis Testing
Yang Xi <br>
22 Aug, 2021

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as ss

# Shapiro-Wilk Test: Examine Whether a Sample is Normally Distributed
* **Null hypothesis**: the sample is drawn from a normally distributed population

In [2]:
# Draw 100 values from a normal distribution (loc=5, scale=3) with a fixed
# seed, then run the Shapiro-Wilk test on them.
print("Large p-value ==> normally distributed:")
np.random.seed(1)
normal_sample = np.random.normal(loc=5, scale=3, size=100)
ss.shapiro(normal_sample)

Large p-value ==> normally distributed:


ShapiroResult(statistic=0.9920048713684082, pvalue=0.8215786218643188)

In [3]:
# Draw 100 values from a uniform distribution on [2, 4) with a fixed seed,
# then run the Shapiro-Wilk test on them.
print("Small p-value ==> significantly NOT normally distributed")
np.random.seed(1)
uniform_sample = np.random.uniform(low=2, high=4, size=100)
ss.shapiro(uniform_sample)

Small p-value ==> significantly NOT normally distributed


ShapiroResult(statistic=0.9471390247344971, pvalue=0.0005401436355896294)

# F-test: Whether the Variance(s) of Two Samples are Equal
* **Null hypothesis**: the two variances are equal
* **Assumption**: both samples are drawn from normally distributed populations

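The F-test is extremely sensitive to non-normality. **Levene's test** is a more robust alternative; **Bartlett's test** extends the comparison to more than two groups, but it is likewise sensitive to departures from normality.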

In [4]:
# This function is to match the implementation of var.test in R
def f_test(ar1, ar2, alternative="two_sided"):
    """Two-sample F-test for equality of variances.

    Parameters
    ----------
    ar1, ar2 : array-like exposing ``len()`` and ``.var(ddof=1)``
        The two samples (e.g. NumPy arrays). The test assumes both are drawn
        from normally distributed populations.
    alternative : {"two_sided", "less", "greater"}, default "two_sided"
        Alternative hypothesis. ``"less"`` is significant when
        var(ar1) < var(ar2); ``"greater"`` when var(ar1) > var(ar2).
        The spellings ``"two-sided"`` and ``"two.sided"`` are also accepted.

    Returns
    -------
    f : float
        Ratio of the unbiased sample variances, ``var(ar1) / var(ar2)``.
    p : float
        p-value of the test under the chosen alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not recognised, or if either sample has fewer
        than two observations.
    """
    # Normalise the separator so "two-sided" / "two.sided" map to "two_sided".
    requested = alternative
    alternative = str(alternative).replace("-", "_").replace(".", "_")
    if alternative not in ("two_sided", "less", "greater"):
        raise ValueError(
            "alternative must be 'two_sided', 'less' or 'greater', "
            f"got {requested!r}"
        )

    df1, df2 = len(ar1) - 1, len(ar2) - 1
    if df1 < 1 or df2 < 1:
        raise ValueError("each sample needs at least two observations")

    f = ar1.var(ddof=1) / ar2.var(ddof=1)
    lower_tail = ss.f.cdf(f, df1, df2)  # P(F <= f)
    upper_tail = ss.f.sf(f, df1, df2)   # P(F >= f)

    if alternative == "less":  # significant if var1 < var2
        p = lower_tail
    elif alternative == "greater":  # significant if var1 > var2
        p = upper_tail
    else:
        # Double the smaller tail. Choosing the tail by "f < 1" is wrong when
        # df1 != df2, because the median of the F distribution is then not 1
        # and the doubled tail can exceed 1.
        p = 2 * min(lower_tail, upper_tail)
    return f, p

In [5]:
# Both samples are drawn from the same uniform(0, 1) distribution, one after
# the other, from a freshly seeded generator.
print("Two-sided: large p-value ==> equal variances")
np.random.seed(1)
n = 10000
ar1, ar2 = (np.random.uniform(low=0, high=1, size=n) for _ in range(2))
f, p = f_test(ar1, ar2, alternative="two_sided")
print(f"f statistics = {f}, p value = {p}")

Two-sided: large p-value ==> equal variances
f statistics = 0.9894128128015265, p value = 0.5946288001782598


In [6]:
# v1 is drawn from uniform(0, 1) and v2 from uniform(0, 10), in that order,
# from a freshly seeded generator.
print("Two-sided: small p-value ==> significantly different variances")
np.random.seed(1)
n = 10000
v1, v2 = (np.random.uniform(low=0, high=upper, size=n) for upper in (1, 10))
f, p = f_test(v1, v2, alternative="two_sided")
print(f"f statistics = {f}, p value = {p}")

Two-sided: small p-value ==> significantly different variances
f statistics = 0.009894128128015265, p value = 0.0


In [7]:
# Alternative hypothesis: var(v1) < var(v2).
print("One-sided: small p-value ==> var1 significantly less than var2")
f, p = f_test(ar1=v1, ar2=v2, alternative="less")
print(f"f statistics = {f}, p value = {p}")

One-sided: small p-value ==> var1 significantly less than var2
f statistics = 0.009894128128015265, p value = 0.0


In [8]:
# Alternative hypothesis: var(v1) > var(v2). A large p-value means the test
# fails to reject the null hypothesis, i.e. there is no evidence that var1
# exceeds var2; it does not show that var1 is larger.
print("One-sided: large p-value ==> var1 NOT significantly greater than var2")
f, p = f_test(v1, v2, alternative="greater")
print(f"f statistics = {f}, p value = {p}")

One-sided: large p-value ==> var1 larger than var2
f statistics = 0.009894128128015265, p value = 0.9999999999999999


# Chi-Square Test of Independence between Two Categorical Variables

* **Null hypothesis**: the two variables are independent

Reference: *https://towardsdatascience.com/chi-square-test-for-independence-in-python-with-examples-from-the-ibm-hr-analytics-dataset-97b9ec9bb80a*

In [9]:
# Simulate two categorical variables that are sampled separately from each
# other: ar1 takes values {0, 1} and ar2 takes values {0, 1, 2}.
np.random.seed(0)
p1 = 0.2
p21, p22 = 0.2, 0.3
q1, q2 = 1-p1, 1-p21-p22  # remaining probability mass for the last category
ar1 = np.random.choice([0,1], size=10000, p=[p1,q1])
ar2 = np.random.choice([0,1,2], size=10000, p=[p21,p22,q2])

# Observed counts for every (ar1, ar2) combination.
dfContingency = pd.crosstab(ar1, ar2)
dfContingency

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,422,627,1011
1,1661,2355,3924


In [10]:
# Large p-value ==> independent
# Hand the observed counts to SciPy as a plain array.
arContingency = dfContingency.values
chi2, p, dof, expected = ss.chi2_contingency(observed=arContingency)
print(f"chi2 = {chi2}, pvalue = {p}, dof = {dof}")

chi2 = 0.517964767425645, pvalue = 0.7718366198194897, dof = 2


In [11]:
# Formulation (manual calculation)
print("manual calculation:")
seRowSum, seColSum = (dfContingency.sum(axis=ax) for ax in (1, 0))
itotal = seRowSum.sum()
# Expected count per cell: row total * column total / grand total.
dfExpected = (seRowSum.to_frame() @ seColSum.to_frame().T) / itotal
arExpected = np.array(dfExpected)
print(f"Expected: {arExpected}")

# Sum of (observed - expected)^2 / expected over all cells.
chi2_manual = ((dfContingency - arExpected)**2 / arExpected).sum().sum()
nRows, nCols = dfContingency.shape
dof_manual = (nRows - 1) * (nCols - 1)
p_manual = ss.chi2.sf(chi2_manual, dof_manual)
print(f"chi2 = {chi2_manual}, pvalue = {p_manual}, dof = {dof_manual}")

manual calculation:
Expected: [[ 429.098  614.292 1016.61 ]
 [1653.902 2367.708 3918.39 ]]
chi2 = 0.517964767425645, pvalue = 0.7718366198194897, dof = 2


# Appendix

Refer to **AB Testing** for
- t-test of two sample means
- z-test of two sample proportions