In [1]:
# Import pandas and numpy packages and functions from scipy.stats 
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind
from scipy.stats import t


In [2]:
# Read nhanes.csv (expected in the working directory) into a DataFrame
nhanes = pd.read_csv(filepath_or_buffer='nhanes.csv')

# Show the DataFrame; the notebook truncates the rendering to the
# first/last 5 rows and the first/last 10 columns
nhanes

Unnamed: 0,ID,SurveyYr,Gender,Age,AgeDecade,AgeMonths,Race1,Race3,Education,MaritalStatus,...,RegularMarij,AgeRegMarij,HardDrugs,SexEver,SexAge,SexNumPartnLife,SexNumPartYear,SameSex,SexOrientation,PregnantNow
0,51624,2009_10,male,34,30-39,409.0,White,,High School,Married,...,No,,Yes,Yes,16.0,8.0,1.0,No,Heterosexual,
1,51624,2009_10,male,34,30-39,409.0,White,,High School,Married,...,No,,Yes,Yes,16.0,8.0,1.0,No,Heterosexual,
2,51624,2009_10,male,34,30-39,409.0,White,,High School,Married,...,No,,Yes,Yes,16.0,8.0,1.0,No,Heterosexual,
3,51625,2009_10,male,4,0-9,49.0,Other,,,,...,,,,,,,,,,
4,51630,2009_10,female,49,40-49,596.0,White,,Some College,LivePartner,...,No,,Yes,Yes,12.0,10.0,1.0,Yes,Heterosexual,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,71909,2011_12,male,28,20-29,,Mexican,Mexican,9 - 11th Grade,NeverMarried,...,,,,,,,,,,
9996,71910,2011_12,female,0,0-9,5.0,White,White,,,...,,,,,,,,,,
9997,71911,2011_12,male,27,20-29,,Mexican,Mexican,College Grad,Married,...,No,,No,Yes,21.0,1.0,1.0,No,Heterosexual,
9998,71915,2011_12,male,60,60-69,,White,White,College Grad,NeverMarried,...,,,No,Yes,19.0,2.0,,No,,


In [3]:
# Summarise the SleepHrsNight column: count, mean, std, min, quartiles, max.
# The reported count (7755 in the output below) is the number of rows that
# actually have a value for this feature.
nhanes.loc[:, 'SleepHrsNight'].describe()

count    7755.000000
mean        6.927531
std         1.346729
min         2.000000
25%         6.000000
50%         7.000000
75%         8.000000
max        12.000000
Name: SleepHrsNight, dtype: float64

In [4]:
# Keep only the rows that have a recorded SleepHrsNight value; rows where
# the feature is missing are left out of the new DataFrame
hasSleepHrs = nhanes['SleepHrsNight'].notna()
nhanesSleep = nhanes[hasSleepHrs]
nhanesSleep

Unnamed: 0,ID,SurveyYr,Gender,Age,AgeDecade,AgeMonths,Race1,Race3,Education,MaritalStatus,...,RegularMarij,AgeRegMarij,HardDrugs,SexEver,SexAge,SexNumPartnLife,SexNumPartYear,SameSex,SexOrientation,PregnantNow
0,51624,2009_10,male,34,30-39,409.0,White,,High School,Married,...,No,,Yes,Yes,16.0,8.0,1.0,No,Heterosexual,
1,51624,2009_10,male,34,30-39,409.0,White,,High School,Married,...,No,,Yes,Yes,16.0,8.0,1.0,No,Heterosexual,
2,51624,2009_10,male,34,30-39,409.0,White,,High School,Married,...,No,,Yes,Yes,16.0,8.0,1.0,No,Heterosexual,
4,51630,2009_10,female,49,40-49,596.0,White,,Some College,LivePartner,...,No,,Yes,Yes,12.0,10.0,1.0,Yes,Heterosexual,
7,51647,2009_10,female,45,40-49,541.0,White,,College Grad,Married,...,No,,No,Yes,13.0,20.0,0.0,Yes,Bisexual,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,71909,2011_12,male,28,20-29,,Mexican,Mexican,9 - 11th Grade,NeverMarried,...,,,,,,,,,,
9995,71909,2011_12,male,28,20-29,,Mexican,Mexican,9 - 11th Grade,NeverMarried,...,,,,,,,,,,
9997,71911,2011_12,male,27,20-29,,Mexican,Mexican,College Grad,Married,...,No,,No,Yes,21.0,1.0,1.0,No,Heterosexual,
9998,71915,2011_12,male,60,60-69,,White,White,College Grad,NeverMarried,...,,,No,Yes,19.0,2.0,,No,,


In [5]:
# One-sample, lower-tailed t-test on the self-reported hours of sleep
# per night:
#   H0: the population mean is 7
#   Ha: the population mean is less than 7
hypothesizedMean = 7
sleepHours = nhanesSleep['SleepHrsNight']

ttest_1samp(sleepHours, hypothesizedMean, alternative='less')


TtestResult(statistic=np.float64(-4.738764444811445), pvalue=np.float64(1.094270021990124e-06), df=np.int64(7754))

In [6]:
# Construct a 95% confidence interval for the population mean
# self-reported number of hours of sleep per night

# Find sample mean, sample standard deviation, and sample size
# (count() only counts non-missing values)
sleepHours=nhanesSleep['SleepHrsNight']
sampleMean=sleepHours.mean()
sampleStDev=sleepHours.std()
sampleSize=sleepHours.count()

# Find multiplier using Confidence Level and t-distribution:
# the t quantile that leaves (1-confLevel)/2 in the upper tail,
# with sampleSize-1 degrees of freedom
confLevel=0.95
tMult=t.ppf(q=1-((1-confLevel)/2), df=sampleSize-1)

# Standard error of the sample mean = sample standard deviation / sqrt(n).
# Computed once and reused for both bounds.
standardError=sampleStDev/sampleSize**0.5
marginOfError=tMult*standardError

# Construct interval using general equation:
# estimate +/- multiplier * standard error
# (the spread used here is the standard error of the mean, not the raw
# sample standard deviation)
lowerBound=sampleMean-marginOfError
upperBound=sampleMean+marginOfError

print(lowerBound, upperBound)

6.897552444244942 6.957508806560989


In [7]:
# Two-sided, two-sample t-test: is the population mean self-reported number
# of hours of sleep per night the same for the 2009_10 and 2011_12 survey
# years?

# Descriptive statistics of SleepHrsNight within each survey year give a
# first look at the two samples (note how close the two means are)
sleepByYear = nhanes.groupby(by=['SurveyYr'])['SleepHrsNight']
statsByYear = sleepByYear.describe()
print(statsByYear)

# Pull out the SleepHrsNight values for each survey year
sleep2009_10 = nhanes.loc[nhanes['SurveyYr'] == '2009_10', 'SleepHrsNight']
sleep2011_12 = nhanes.loc[nhanes['SurveyYr'] == '2011_12', 'SleepHrsNight']

# Statistic and p-value from ttest_ind(): equal_var=False does not assume
# equal population variances, and nan_policy='omit' ignores missing values
ttest_ind(sleep2009_10, sleep2011_12,
          equal_var=False, nan_policy='omit', alternative='two-sided')



           count      mean       std  min  25%  50%  75%   max
SurveyYr                                                      
2009_10   3921.0  6.948483  1.367031  2.0  6.0  7.0  8.0  12.0
2011_12   3834.0  6.906103  1.325481  2.0  6.0  7.0  8.0  12.0


TtestResult(statistic=np.float64(1.3860618020279534), pvalue=np.float64(0.16576789777027626), df=np.float64(7752.449941521515))