In [10]:
# Import pandas package and functions from statsmodels
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.proportion import proportion_confint

In [11]:
# Read the NHANES survey data from 'nhanes.csv' (path is relative to the
# notebook's working directory) into a DataFrame
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks
# like a row index saved into the CSV -- confirm whether index_col=0 is wanted
nhanes = pd.read_csv(filepath_or_buffer='nhanes.csv')

# Leave the DataFrame as the last expression so the notebook renders it;
# a frame this large is shown truncated (first/last 5 rows and the
# first/last 10 columns)
nhanes

Unnamed: 0,ID,SurveyYr,Gender,Age,AgeDecade,AgeMonths,Race1,Race3,Education,MaritalStatus,...,RegularMarij,AgeRegMarij,HardDrugs,SexEver,SexAge,SexNumPartnLife,SexNumPartYear,SameSex,SexOrientation,PregnantNow
0,51624,2009_10,male,34,30-39,409.0,White,,High School,Married,...,No,,Yes,Yes,16.0,8.0,1.0,No,Heterosexual,
1,51624,2009_10,male,34,30-39,409.0,White,,High School,Married,...,No,,Yes,Yes,16.0,8.0,1.0,No,Heterosexual,
2,51624,2009_10,male,34,30-39,409.0,White,,High School,Married,...,No,,Yes,Yes,16.0,8.0,1.0,No,Heterosexual,
3,51625,2009_10,male,4,0-9,49.0,Other,,,,...,,,,,,,,,,
4,51630,2009_10,female,49,40-49,596.0,White,,Some College,LivePartner,...,No,,Yes,Yes,12.0,10.0,1.0,Yes,Heterosexual,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,71909,2011_12,male,28,20-29,,Mexican,Mexican,9 - 11th Grade,NeverMarried,...,,,,,,,,,,
9996,71910,2011_12,female,0,0-9,5.0,White,White,,,...,,,,,,,,,,
9997,71911,2011_12,male,27,20-29,,Mexican,Mexican,College Grad,Married,...,No,,No,Yes,21.0,1.0,1.0,No,Heterosexual,
9998,71915,2011_12,male,60,60-69,,White,White,College Grad,NeverMarried,...,,,No,Yes,19.0,2.0,,No,,


In [12]:
# One-sample z-test: is the population proportion of U.S. adults diagnosed
# with diabetes in 2012 equal to 0.07 (two-sided alternative)?

# Restrict the full nhanes data to the rows of the 2011_12 survey year
# NOTE(review): the filter uses SurveyYr only; the loaded data also has rows
# with Age under 18, so if those rows carry a Diabetes value the sample is
# not limited to adults -- confirm against the intended population
nhanes2012 = nhanes.loc[nhanes['SurveyYr'] == "2011_12"]

# Tally the Diabetes responses in the 2012 sample
# (value_counts leaves missing values out by default)
countDiabetes = nhanes2012['Diabetes'].value_counts()
print(countDiabetes)

# Sample size for the test: the 'No' and 'Yes' tallies added together
totalInstances2012 = countDiabetes.loc[['No', 'Yes']].sum()
print('2012 total:', totalInstances2012)

# Sample proportion: share of those instances answering 'Yes'
sampleProp2012 = countDiabetes['Yes'] / totalInstances2012
print('sample proportion with diabetes =', sampleProp2012)

# Run the z-test; as the last expression, the returned
# (test statistic, p-value) pair becomes the cell output.
# The hypothesized proportion 0.07 is given both as the null value and as
# prop_var, the proportion used for the variance of the estimate
proportions_ztest(
    countDiabetes['Yes'],
    totalInstances2012,
    value=0.07,
    alternative='two-sided',
    prop_var=0.07,
)

Diabetes
No     4563
Yes     373
Name: count, dtype: int64
2012 total: 4936
sample proportion with diabetes = 0.07556726094003241


(np.float64(1.53298765871851), np.float64(0.12527889571862286))

In [15]:
# proportions_ztest returns a pair: the z test statistic first, then the
# p-value. Unpack both so each can be reported separately.
# Relies on countDiabetes and totalInstances2012 from the one-sample cell.

testStat, pvalue = proportions_ztest(
    countDiabetes['Yes'],
    totalInstances2012,
    value=0.07,
    alternative='two-sided',
    prop_var=0.07,
)

# Report both results rounded to three decimal places
print('z test statistic =', round(testStat, 3))
print('p-value =', round(pvalue, 3))

z test statistic = 1.533
p-value = 0.125


In [6]:
# 95% confidence interval (alpha=0.05) for the proportion of all U.S. adults
# in 2012 with diabetes, using the 'normal' method of proportion_confint.
# Relies on countDiabetes and totalInstances2012 from the one-sample cell;
# the returned (lower, upper) pair is displayed as the cell output.

proportion_confint(
    countDiabetes['Yes'],
    totalInstances2012,
    alpha=0.05,
    method='normal',
)

(0.06819390813559631, 0.08294061374446851)

In [7]:
# Two-sample z-test: are the population proportions of U.S. adults diagnosed
# with diabetes equal for the 2009_10 and 2011_12 survey years?
# NOTE(review): rows are not filtered by Age here -- confirm that the
# tallies below match the intended (adult) population

# Tally every (SurveyYr, Diabetes) combination across both survey years
countDiabetes2Yrs = nhanes[['SurveyYr', 'Diabetes']].value_counts()
print(countDiabetes2Yrs)

# Per-year sample sizes: 'No' plus 'Yes' responses within each survey year
totalInstances2010 = sum(
    countDiabetes2Yrs['2009_10', answer] for answer in ('No', 'Yes')
)
totalInstances2012 = sum(
    countDiabetes2Yrs['2011_12', answer] for answer in ('No', 'Yes')
)

# Per-year sample proportions answering 'Yes'
sampleProp2010 = countDiabetes2Yrs['2009_10', 'Yes'] / totalInstances2010
sampleProp2012 = countDiabetes2Yrs['2011_12', 'Yes'] / totalInstances2012

# Difference in sample proportions, taken as 2012 minus 2010
sampleDiff = sampleProp2012 - sampleProp2010
print('2010 sample proportion with diabetes =', sampleProp2010)
print('2012 sample proportion with diabetes =', sampleProp2012)
print('2012 proportion - 2010 proportion =', sampleDiff)

# Overall (pooled) proportion: all 'Yes' responses over all instances from
# both years; passed to the test as prop_var
overallSampleProp = (
    countDiabetes2Yrs['2011_12', 'Yes'] + countDiabetes2Yrs['2009_10', 'Yes']
) / (totalInstances2012 + totalInstances2010)

# Run the z-test for a difference of 0 between the two proportions.
# Both lists are ordered [2011_12, 2009_10], matching the direction of
# sampleDiff; the returned (test statistic, p-value) pair is the cell output
proportions_ztest(
    [countDiabetes2Yrs['2011_12', 'Yes'], countDiabetes2Yrs['2009_10', 'Yes']],
    [totalInstances2012, totalInstances2010],
    value=0,
    alternative='two-sided',
    prop_var=overallSampleProp,
)

SurveyYr  Diabetes
2011_12   No          4563
2009_10   No          4535
          Yes          387
2011_12   Yes          373
Name: count, dtype: int64
2010 sample proportion with diabetes = 0.0786265745631857
2012 sample proportion with diabetes = 0.07556726094003241
2012 proportion - 2010 proportion = -0.003059313623153287


(np.float64(-0.5693732052338896), np.float64(0.5691028967054365))

In [8]:
# When the counts and sample sizes are already known, they can be passed
# directly to proportions_ztest() without going back to the dataset.
# Both lists are ordered [2011_12, 2009_10], using the tallies printed by
# the two-sample cell
knownCounts = [373, 387]
knownNobs = [4936, 4922]

# Overall (pooled) proportion: total 'Yes' count over total sample size
knownOverallSampleProp = sum(knownCounts) / sum(knownNobs)

# Same two-sided test for a difference of 0; the returned
# (test statistic, p-value) pair is displayed as the cell output
proportions_ztest(
    knownCounts,
    knownNobs,
    value=0,
    alternative='two-sided',
    prop_var=knownOverallSampleProp,
)

(np.float64(-0.5693732052338896), np.float64(0.5691028967054365))