In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy import stats
import scipy.stats.mstats as mst
import statsmodels.formula.api as smf

%matplotlib inline

data = pd.read_csv('991_CleanedData.csv', low_memory=False)

In [22]:
black = data[data.RACE == 'Black']
asian = data[data.RACE == 'Asian']
white = data[data.RACE == 'Caucasian']
hispanic = data[data.RACE == 'Hispanic']

In [11]:
f_val, p_val = stats.f_oneway(black.DISC_SCORE0, asian.DISC_SCORE0, white.DISC_SCORE0, hispanic.DISC_SCORE0)

In [12]:
print p_val

2.08629804762e-99


In [15]:
disc_data.AGE0.isnull().sum()

4

In [16]:
age_data = disc_data[pd.notnull(disc_data['AGE0'])]

In [17]:
age_data.shape

(3292, 860)

In [19]:
age_data.groupby('RACE').BMI0.describe()

RACE            
Asian      count     527.000000
           mean       23.090833
           std         3.836002
           min        16.501198
           25%        20.515547
           50%        22.236840
           75%        24.686986
           max        44.563200
Black      count     909.000000
           mean       31.706355
           std         7.737673
           min        15.445714
           25%        26.105436
           50%        30.234375
           75%        36.265705
           max        64.838400
Caucasian  count    1532.000000
           mean       27.774711
           std         6.822342
           min        14.989652
           25%        22.868154
           50%        26.050400
           75%        31.294693
           max        59.127300
Hispanic   count     282.000000
           mean       29.440897
           std         5.913857
           min        17.578125
           25%        25.565714
           50%        28.346704
           75%        3

#Significance across Racial Categories#

In [24]:
black = data[data.RACE == 'Black']
asian = data[data.RACE == 'Asian']
white = data[data.RACE == 'Caucasian']
hispanic = data[data.RACE == 'Hispanic']

Testing for significance across racial categories (nominal variable) vs:
- Age (continuous)
- BMI (continuous)
- Discrimination score (continuous)
- Discrimination category (nominal)
- History of CVD (nominal)
- Education (greater than HS) (nominal)
- Income (greater than 50K) (nominal)
- CRP level (continuous)
- Perceived stress (continuous)

**ANOVA (normal) or Kruskal-Wallis (non-parametric) tests for continuous variables:**
- Age
- BMI
- Discrimination score
- CRP
- Perceived stress

In [66]:
cont_list = ['AGE0', 'BMI0', 'DISC_SCORE0', 'CRPRESU0', 'P_STRESS']

results_anova = []

for x in cont_list:
    f_val, p_val = stats.f_oneway(black[x], asian[x], white[x], hispanic[x])
    results_anova.append([x, f_val, p_val])

results_anova

[['AGE0', 1.8526219697166522, 0.13544527182025834],
 ['BMI0', 192.22532048436685, 8.9365946153013265e-115],
 ['DISC_SCORE0', 162.8801318715054, 2.2467131959013777e-98],
 ['CRPRESU0', 63.015209803718591, 1.3379718131206926e-39],
 ['P_STRESS', 23.567572160040672, 4.359403329133781e-15]]

In [67]:
results_kw = []

for x in cont_list:
    h_val, p_val = mst.kruskalwallis(black[x], asian[x], white[x], hispanic[x])
    results_kw.append([x, h_val, p_val])

results_kw

[['AGE0', 6.7611957141592844, 0.079911599310126355],
 ['BMI0', 614.63690588520717, 6.7646596232844907e-133],
 ['DISC_SCORE0', 439.11841853626896, 7.42785659032371e-95],
 ['CRPRESU0', 420.30343362258805, 8.8515674322415942e-91],
 ['P_STRESS', 73.909502800655403, 6.2059284032738504e-16]]

**Chi-Squared test for Categorical Data**

- Discrimination category (high v low)
- Income (% of participants that are above 50K, values = 3 and 4
- Education (% of participants that are greater than high school graduate, values = 3, 4, and 5
- History of CVD

*Discrimination Category*

In [34]:
data.groupby('RACE').DISC_CAT0.value_counts()

RACE       DISC_CAT0
Asian      LOW          278
           HIGH         251
Black      HIGH         601
           LOW          321
Caucasian  LOW          811
           HIGH         718
Hispanic   LOW          206
           HIGH          76
dtype: int64

In [37]:
# order is asian, black, caucasian, hispanic!! [high, low]

chi_array = np.array([[251, 278], [601, 321], [718, 811], [76, 206]])
chi2, p, dof, expected = stats.chi2_contingency(chi_array)
print chi2, p

151.733255584 1.11392792181e-32


*Income*

In [59]:
income_data = data[['RACE', 'INCOME0']]
income_data['NEWINCOME'] = np.where(income_data['INCOME0'] < 3, 0, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [61]:
income_pivot = pd.pivot_table(income_data, index=['RACE', 'NEWINCOME'], values=['INCOME0'], aggfunc=len)

income_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,INCOME0
RACE,NEWINCOME,Unnamed: 2_level_1
Asian,0,163
Asian,1,366
Black,0,554
Black,1,368
Caucasian,0,595
Caucasian,1,934
Hispanic,0,248
Hispanic,1,34


In [62]:
# order is asian, black, caucasian, hispanic!! [0, 1]

chi_array = np.array([[163, 366], [554, 368], [595, 934], [248, 34]])
chi2, p, dof, expected = stats.chi2_contingency(chi_array)
print chi2, p

347.457587839 5.2988177676e-75


*Education*

In [63]:
edu_data = data[['RACE', 'DEGREE']]
edu_data['NEWEDU'] = np.where(edu_data['DEGREE'] < 3, 0, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [64]:
edu_pivot = pd.pivot_table(edu_data, index=['RACE', 'NEWEDU'], values=['DEGREE'], aggfunc=len)

edu_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,DEGREE
RACE,NEWEDU,Unnamed: 2_level_1
Asian,0,122
Asian,1,407
Black,0,244
Black,1,678
Caucasian,0,246
Caucasian,1,1283
Hispanic,0,195
Hispanic,1,87


In [65]:
# order is asian, black, caucasian, hispanic!! [0, 1]

chi_array = np.array([[122, 407], [244, 678], [246, 1283], [195, 87]])
chi2, p, dof, expected = stats.chi2_contingency(chi_array)
print chi2, p

362.429460227 3.03520372358e-78
