In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy import stats
import scipy.stats.mstats as mst
import statsmodels.formula.api as smf

%matplotlib inline

data = pd.read_csv('991_CleanedData.csv', low_memory=False)

#Significance across Racial Categories#

In [2]:
black = data[data.RACE == 'Black']
asian = data[data.RACE == 'Asian']
white = data[data.RACE == 'Caucasian']
hispanic = data[data.RACE == 'Hispanic']

Testing for significance across racial categories (nominal variable) vs:
- Age (continuous)
- BMI (continuous)
- Discrimination score (continuous)
- Discrimination category (nominal)
- History of CVD (nominal)
- Education (greater than HS) (nominal)
- Income (greater than 50K) (nominal)
- CRP level (continuous)
- Perceived stress (continuous)

###ANOVA (normal) or Kruskal-Wallis (non-parametric) tests for continuous variables###
- Age
- BMI
- Discrimination score
- CRP
- Perceived stress

In [3]:
cont_race = ['AGE0', 'BMI0', 'DISC_SCORE0', 'CRPRESU0', 'MOD_PSTRESS']

mean_race = []
std_race = []

for x in cont_race:
    mean_race.append([x, black[x].mean(), asian[x].mean(), white[x].mean(), hispanic[x].mean()])
    std_race.append([x, black[x].std(), asian[x].std(), white[x].std(), hispanic[x].std()]) 

In [4]:
mr_array = np.array(mean_race)
stdr_array = np.array(std_race)

In [5]:
print "Average values, order = black, asian, white, hispanic"
print (mr_array)
print ""
print "Std values, order = black, asian, white, hispanic"
print (stdr_array)

Average values, order = black, asian, white, hispanic
[['AGE0' '45.7472885033' '46.0888468809' '45.8358404186' '45.8368794326']
 ['BMI0' '31.639420505' '23.1275045232' '27.7705893783' '29.4249467314']
 ['DISC_SCORE0' '1.92379790311' '1.72354547364' '1.68444153768'
  '1.2328999212']
 ['CRPRESU0' '5.87169303537' '1.478875817' '3.5022310661' '4.04400471128']
 ['MOD_PSTRESS' '8.45661605206' '8.49338374291' '8.43296272073'
  '9.97517730496']]

Std values, order = black, asian, white, hispanic
[['AGE0' '2.66071231702' '2.62953223336' '2.71362134458' '2.77924210329']
 ['BMI0' '7.69146586484' '3.85451094058' '6.78291193244' '5.91095945042']
 ['DISC_SCORE0' '0.533526767596' '0.492036160397' '0.432333073853'
  '0.371459131112']
 ['CRPRESU0' '7.56264085088' '3.76998057149' '5.78484417388'
  '5.39050182258']
 ['MOD_PSTRESS' '3.04878264488' '2.62787607345' '2.87881077728'
  '2.98920134845']]


In [6]:
results_anova = []

for x in cont_race:
    f_val, p_val = stats.f_oneway(black[x], asian[x], white[x], hispanic[x])
    results_anova.append([x, f_val, p_val])

results_anova

[['AGE0', 1.8526219697166522, 0.13544527182025834],
 ['BMI0', 192.22532048436685, 8.9365946153013265e-115],
 ['DISC_SCORE0', 162.8801318715054, 2.2467131959013777e-98],
 ['CRPRESU0', 63.015209803718591, 1.3379718131206926e-39],
 ['MOD_PSTRESS', 23.795581901876769, 3.1336573328681647e-15]]

In [7]:
results_kw = []

for x in cont_race:
    h_val, p_val = mst.kruskalwallis(black[x], asian[x], white[x], hispanic[x])
    results_kw.append([x, h_val, p_val])

results_kw

[['AGE0', 6.7611957141592844, 0.079911599310126355],
 ['BMI0', 614.63690588520717, 6.7646596232844907e-133],
 ['DISC_SCORE0', 439.11841853626896, 7.42785659032371e-95],
 ['CRPRESU0', 420.30343362258805, 8.8515674322415942e-91],
 ['MOD_PSTRESS', 74.045436824279861, 5.8033357671048311e-16]]

###Chi-Squared test for Categorical Data###

- Discrimination category (high v low)
- Income (% of participants that are above 50K, values = 3 and 4
- Education (% of participants that are greater than high school graduate, values = 3, 4, and 5
- History of CVD

In [8]:
data.groupby('RACE').DISC_CAT0.value_counts(normalize=True)

RACE       DISC_CAT0
Asian      LOW          0.525520
           HIGH         0.474480
Black      HIGH         0.651844
           LOW          0.348156
Caucasian  LOW          0.530412
           HIGH         0.469588
Hispanic   LOW          0.730496
           HIGH         0.269504
dtype: float64

In [9]:
data.groupby('RACE').NEWINCOME.value_counts(normalize=True)

RACE       NEWINCOME
Asian      1            0.691871
           0            0.308129
Black      0            0.600868
           1            0.399132
Caucasian  1            0.610857
           0            0.389143
Hispanic   0            0.879433
           1            0.120567
dtype: float64

In [10]:
data.groupby('RACE').NEWEDU.value_counts(normalize=True)

RACE       NEWEDU
Asian      1         0.769376
           0         0.230624
Black      1         0.735358
           0         0.264642
Caucasian  1         0.839111
           0         0.160889
Hispanic   0         0.691489
           1         0.308511
dtype: float64

In [11]:
data.groupby('RACE').CVD.value_counts(normalize=True)

RACE       CVD
Asian      0      0.992439
           1      0.007561
Black      0      0.950108
           1      0.049892
Caucasian  0      0.982995
           1      0.017005
Hispanic   0      0.946809
           1      0.053191
dtype: float64

In [12]:
cat_race = ['DISC_CAT0', 'NEWINCOME', 'NEWEDU', 'CVD']

results_cat = []

# test = pd.pivot_table(data, values='DEGREE', index='RACE', columns='NEWEDU', aggfunc=len)

for x in cat_race:
    chi_array = pd.pivot_table(data, values='SWANID', index='RACE', columns=x, aggfunc=len)
    chi2, p, dof, expected = stats.chi2_contingency(chi_array)
    results_cat.append([x, chi2, p])

results_cat

[['DISC_CAT0', 151.73325558443955, 1.1139279218091261e-32],
 ['NEWINCOME', 347.45758783873111, 5.2988177675961739e-75],
 ['NEWEDU', 362.42946022681332, 3.0352037235847402e-78],
 ['CVD', 37.856509954225558, 3.0312539262732809e-08]]

In [13]:
chi_array = pd.pivot_table(data, values='SWANID', index='RACE', columns='INCOME0', aggfunc=len)
stats.chi2_contingency(chi_array)

(662.0914104876623,
 9.7105207955470354e-137,
 9L,
 array([[  78.32832618,  174.65757204,  201.091355  ,   74.92274678],
        [ 136.5193133 ,  304.41263029,  350.48436542,  130.58369099],
        [ 226.39699571,  504.82311465,  581.22624157,  216.55364807],
        [  41.75536481,   93.10668302,  107.19803801,   39.93991416]]))

In [14]:
chi_array = pd.pivot_table(data, values='SWANID', index='RACE', columns='DEGREE', aggfunc=len)
stats.chi2_contingency(chi_array)

(868.55418710984952,
 3.2398544382360211e-178,
 12L,
 array([[  38.11005518,   92.76149601,  173.36020846,  105.57296137,
          119.19527897],
        [  66.42244022,  161.67504598,  302.15144083,  184.00429185,
          207.74678112],
        [ 110.15174739,  268.11404047,  501.07326793,  305.14377682,
          344.51716738],
        [  20.3157572 ,   49.44941754,   92.41508277,   56.27896996,
           63.54077253]]))

#Signficance between High/Low Discrimination Classification#

In [15]:
high = data[data.DISC_CAT0 == 'HIGH']
low = data[data.DISC_CAT0 == 'LOW']

Testing for significance between High and Low Discrimination classification (nominal variable) vs:
- Age (continuous)
- BMI (continuous)
- History of CVD (nominal)
- Education (greater than HS) (nominal)
- Income (greater than 50K) (nominal)
- CRP level (continuous)
- Perceived stress (continuous)

###ANOVA (normal) or Kruskal-Wallis (non-parametric) tests for continuous variables:###

- Age
- BMI
- CRP
- Perceived Stress

In [16]:
cont_hl = ['AGE0', 'BMI0', 'CRPRESU0', 'MOD_PSTRESS']

mean_hl = []
std_hl = []

for x in cont_hl:
    mean_hl.append([x, high[x].mean(), low[x].mean()])
    std_hl.append([x, high[x].std(), low[x].std()]) 

In [17]:
mhl_array = np.array(mean_hl)
stdhl_array = np.array(std_hl)

In [18]:
print "Average values, order = high, low"
print (mhl_array)
print ""
print "Std values, order = high, low"
print (stdhl_array)

Average values, order = high, low
[['AGE0' '45.6780072904' '46.0290841584']
 ['BMI0' '29.2915615061' '27.1974947383']
 ['CRPRESU0' '4.2923509287' '3.48152059765']
 ['MOD_PSTRESS' '9.07047387606' '8.08601485149']]

Std values, order = high, low
[['AGE0' '2.63358526459' '2.73987107836']
 ['BMI0' '7.54831677555' '6.60042302557']
 ['CRPRESU0' '6.6231227088' '5.76719934717']
 ['MOD_PSTRESS' '3.00312147944' '2.76589623874']]


In [19]:
results_anova = []

for x in cont_hl:
    f_val, p_val = stats.f_oneway(high[x], low[x])
    results_anova.append([x, f_val, p_val])

results_anova

[['AGE0', 13.922986199645944, 0.0001937023530168299],
 ['BMI0', 71.041983699927641, 5.1765059445336388e-17],
 ['CRPRESU0', 13.88434075475412, 0.00019770891514352256],
 ['MOD_PSTRESS', 94.749523102730123, 4.3062471265115382e-22]]

In [20]:
results_kw = []

for x in cont_hl:
    h_val, p_val = mst.kruskalwallis(high[x], low[x])
    results_kw.append([x, h_val, p_val])

results_kw

[['AGE0', 12.240688266485645, 0.00046758578234873762],
 ['BMI0', 73.48293482326568, 1.0151000839236607e-17],
 ['CRPRESU0', 25.863630179251899, 3.6640819276179106e-07],
 ['MOD_PSTRESS', 88.228710189333171, 5.8307441711376673e-21]]

###Chi-Squared test for Categorical Data###

- Income (% of participants that are above 50K, values = 3 and 4
- Education (% of participants that are greater than high school graduate, values = 3, 4, and 5
- History of CVD

In [21]:
data.groupby('DISC_CAT0').NEWINCOME.value_counts(normalize=True)

DISC_CAT0  NEWINCOME
HIGH       0            0.524909
           1            0.475091
LOW        1            0.569307
           0            0.430693
dtype: float64

In [22]:
data.groupby('DISC_CAT0').NEWEDU.value_counts(normalize=True)

DISC_CAT0  NEWEDU
HIGH       1         0.745443
           0         0.254557
LOW        1         0.759901
           0         0.240099
dtype: float64

In [23]:
data.groupby('DISC_CAT0').CVD.value_counts(normalize=True)

DISC_CAT0  CVD
HIGH       0      0.968408
           1      0.031592
LOW        0      0.975866
           1      0.024134
dtype: float64

In [24]:
cat_hl = ['NEWINCOME', 'NEWEDU', 'CVD']

results_cat = []

for x in cat_hl:
    chi_array = pd.pivot_table(data, values='SWANID', index='DISC_CAT0', columns=x, aggfunc=len)
    chi2, p, dof, expected = stats.chi2_contingency(chi_array)
    results_cat.append([x, chi2, p])

results_cat

[['NEWINCOME', 28.631696027909694, 8.7537528767456565e-08],
 ['NEWEDU', 0.83940828081304131, 0.35956605763980687],
 ['CVD', 1.4088083578938995, 0.23525431335003533]]

In [25]:
chi_array = pd.pivot_table(data, values='SWANID', index='DISC_CAT0', columns='INCOME0', aggfunc=len)
stats.chi2_contingency(chi_array)

(37.453845235414747,
 3.688520994033088e-08,
 3L,
 array([[ 243.72103004,  543.45248314,  625.7020233 ,  233.12446352],
        [ 239.27896996,  533.54751686,  614.2979767 ,  228.87553648]]))

In [26]:
chi_array = pd.pivot_table(data, values='SWANID', index='DISC_CAT0', columns='DEGREE', aggfunc=len)
stats.chi2_contingency(chi_array)

(10.265939725152036,
 0.036178482248029449,
 4L,
 array([[ 118.58062538,  288.63028817,  539.41569589,  328.49356223,
          370.87982833],
        [ 116.41937462,  283.36971183,  529.58430411,  322.50643777,
          364.12017167]]))

In [27]:
high.shape

(1646, 869)

In [28]:
low.shape

(1616, 869)