In [4]:
import pandas as pd
import numpy as np

import scipy.stats as stats
import researchpy as rp

import statsmodels.api as sm
from statsmodels.formula.api import ols
    
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import seaborn as sns
sns.set(style="white", color_codes=True)

In [5]:
# Report the interpreter and core library versions used for this analysis,
# so the results can be reproduced in a matching environment.
import sys

import matplotlib
import numpy
import pandas
import scipy

print('Python: {}'.format(sys.version))

# Printed in a fixed order: scipy, numpy, matplotlib, pandas.
for label, module in (
    ('scipy', scipy),
    ('numpy', numpy),
    ('matplotlib', matplotlib),
    ('pandas', pandas),
):
    print('{}: {}'.format(label, module.__version__))

Python: 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
scipy: 1.4.1
numpy: 1.18.2
matplotlib: 3.2.1
pandas: 1.0.1


In [6]:
# Category labels for the simulated voters; the order lines up with the
# probability vectors passed to np.random.choice further down.
races = [
    "asian",
    "black",
    "hispanic",
    "other",
    "white",
]

#### random.choice

In [9]:
# Draw 3 integers uniformly at random (with replacement, the default)
# from np.arange(10), i.e. from 0..9:
np.random.choice(10, 3)

array([9, 5, 2])

In [10]:
# Simulate 1000 voters: a race label and an age for each one.
np.random.seed(12)  # fixed seed so the draws below are repeatable

# Race labels are drawn with unequal probabilities, one per entry of `races`.
voter_race = np.random.choice(races, size=1000, p=[0.05, 0.15, 0.25, 0.05, 0.5])

# Ages come from a single Poisson(mu=30) shifted by loc=18 for every voter,
# so the age distribution does not depend on race in this first experiment.
voter_age = stats.poisson.rvs(mu=30, loc=18, size=1000)

In [12]:
# Put race and age side by side in one table (one row per voter),
# then peek at five randomly chosen rows.
voter_frame = pd.DataFrame(dict(race=voter_race, age=voter_age))
voter_frame.sample(5)

Unnamed: 0,race,age
28,hispanic,45
351,white,49
433,white,49
481,white,51
83,hispanic,47


In [13]:
# Map each race label to the row labels of the voters belonging to that race.
groups = voter_frame.groupby("race").groups

In [14]:
# Show the race -> row-label mapping.
groups

{'asian': Int64Index([  4,   7,  14,  21,  49,  53,  59,  78,  95,  98, 135, 136, 162,
             203, 227, 264, 278, 289, 326, 335, 345, 373, 430, 480, 484, 491,
             516, 587, 602, 684, 692, 708, 715, 761, 776, 826, 828, 832, 853,
             897, 942, 951, 986, 996],
            dtype='int64'),
 'black': Int64Index([  0,   9,  19,  22,  23,  42,  50,  56,  62,  76,
             ...
             948, 956, 961, 965, 968, 972, 982, 984, 989, 990],
            dtype='int64', length=147),
 'hispanic': Int64Index([  2,  10,  24,  28,  31,  32,  38,  40,  44,  45,
             ...
             954, 955, 958, 959, 962, 964, 966, 974, 994, 999],
            dtype='int64', length=244),
 'other': Int64Index([ 17,  26,  39,  46,  48,  65,  67,  72, 146, 237, 246, 255, 284,
             302, 317, 322, 358, 370, 386, 413, 425, 446, 530, 542, 569, 571,
             573, 575, 583, 626, 629, 637, 662, 696, 700, 701, 728, 739, 756,
             757, 773, 813, 819, 880, 923, 936, 939, 971, 

In [18]:
# Split the age array into one sub-array per race, using each group's
# row labels as positions into voter_age.
asian, black, hispanic, other, white = (
    voter_age[groups[label]]
    for label in ("asian", "black", "hispanic", "other", "white")
)

In [20]:
# Perform a one-way ANOVA across the five race groups.
# Null hypothesis: all groups share the same mean age.
stats.f_oneway(asian, black, hispanic, other, white)

F_onewayResult(statistic=1.7744689357329695, pvalue=0.13173183201930463)

The test output yields an F-statistic of 1.7744689357329695 and a p-value of 0.13173183201930463, indicating that there is no statistically significant difference between the group means.

Since the p-value is NOT less than 0.05, we cannot reject the null hypothesis. That means we found no statistically significant difference between the mean ages of those classes/races.

#### Now let's make new age data where the group means do differ and run a second ANOVA:

In [21]:
# Same seed and same race draw as the first experiment.
np.random.seed(12)

voter_race = np.random.choice(races, size=1000, p=[0.05, 0.15, 0.25, 0.05, 0.5])

# Two candidate age arrays, both shifted by loc=18. The draw order
# (white_ages first, then voter_age) determines which random numbers each
# array receives, so it must not be swapped.
white_ages = stats.poisson.rvs(mu=32, loc=18, size=1000)  # higher mean, used for white voters
voter_age = stats.poisson.rvs(mu=30, loc=18, size=1000)   # baseline for everyone else

# Take the age from white_ages where the voter is white, else keep the baseline age.
voter_age = np.where(voter_race == "white", white_ages, voter_age)

In [22]:
# Rebuild the race/age table from the new draws, then recompute the
# mapping from each race label to the row labels of its voters.
voter_frame = pd.DataFrame(dict(race=voter_race, age=voter_age))
groups = voter_frame.groupby(by="race").groups

In [23]:
# Split the regenerated age array into one sub-array per race, using each
# group's row labels as positions into voter_age.
asian, black, hispanic, other, white = (
    voter_age[groups[label]]
    for label in ("asian", "black", "hispanic", "other", "white")
)

In [24]:
# Perform a one-way ANOVA on the regenerated data.
# Null hypothesis: all five race groups share the same mean age.
stats.f_oneway(asian, black, hispanic, other, white)

F_onewayResult(statistic=10.164699828386366, pvalue=4.5613242113994585e-08)

Since the p-value is __less than 0.05__, we __reject__ the null hypothesis. That means there is a significant difference among the means of those classes/races: at least one group's mean age differs from the others.