In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [2]:
np.random.seed(10)

In [3]:
voter_race = np.random.choice(a=["asian","black","hispanic","other","white"],

p=[0.05, 0.15 ,0.25, 0.05, 0.5],
size=1000)

In [4]:
voter_party = np.random.choice(a=["democrat","independent","republican"],

p=[0.4, 0.2, 0.4],
size=1000)

In [5]:
voters = pd.DataFrame({"race":voter_race,
"party":voter_party})

voters

Unnamed: 0,race,party
0,white,democrat
1,asian,republican
2,white,independent
3,white,republican
4,other,democrat
...,...,...
995,white,republican
996,hispanic,independent
997,black,independent
998,white,republican


In [6]:
voter_tab = pd.crosstab(voters.race, voters.party, margins=True)
voter_tab.columns = ["democrat", "independent", "republican", "row_totals"]
voter_tab.index = ["asian", "black", "hispanic", "other", "white", "col_totals"]
# You can check the data of CrossTab by calling it
voter_tab

Unnamed: 0,democrat,independent,republican,row_totals
asian,21,7,32,60
black,65,25,64,154
hispanic,107,50,94,251
other,15,8,15,38
white,189,96,212,497
col_totals,397,186,417,1000


In [7]:
observed = voter_tab.iloc[0:5, 0:3]
# You can check the data of observed table by calling it
observed

Unnamed: 0,democrat,independent,republican
asian,21,7,32
black,65,25,64
hispanic,107,50,94
other,15,8,15
white,189,96,212


In [8]:
expected = np.outer(voter_tab["row_totals"][0:5],

voter_tab.loc["col_totals"][0:3]) / 1000
# Now convert into a DataFrame, Assign the column names and row names
expected = pd.DataFrame(expected)
expected.columns = ["democrat", "independent", "republican"]
expected.index = ["asian", "black", "hispanic", "other", "white"]
# You can check the data of expected table by calling it
expected

Unnamed: 0,democrat,independent,republican
asian,23.82,11.16,25.02
black,61.138,28.644,64.218
hispanic,99.647,46.686,104.667
other,15.086,7.068,15.846
white,197.309,92.442,207.249


In [9]:
chi_squared_stat = (((observed-expected)**2)/expected).sum().sum()
print(chi_squared_stat)

7.169321280162059


In [10]:
crit = stats.chi2.ppf(q = 0.95, # Find the critical value for 95% confidence*

df = 8) # *

print("Critical value")
print(crit)
p_value = 1 - stats.chi2.cdf(x=chi_squared_stat, # Find the p-value

df=8)

print("P value")
print(p_value)

Critical value
15.50731305586545
P value
0.518479392948842


In [12]:
if chi_squared_stat < crit:
    print("""At 0.95 level of significance, we reject the null hypotheses and accept H1.
They are not independent.""" )
else:
    print("""At 0.95 level of significance, we accept the null hypotheses.
They are independent.""" )

At 0.95 level of significance, we reject the null hypotheses and accept H1.
They are not independent.


In [13]:
stats = stats.chi2_contingency(observed=observed)
# You can check the returned data by calling it
# The returned data includes: chi_squared_stat, p_value, df, expected_crosstab
print(stats)

(7.169321280162059, 0.518479392948842, 8, array([[ 23.82 ,  11.16 ,  25.02 ],
       [ 61.138,  28.644,  64.218],
       [ 99.647,  46.686, 104.667],
       [ 15.086,   7.068,  15.846],
       [197.309,  92.442, 207.249]]))
