In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## More Examples

In [None]:
# Read the births sample from the CSV file in the working directory and
# display it as the cell's output.
baby_csv = 'baby.csv'
births = Table.read_table(baby_csv)
births

In [None]:
# Histogram of pregnancy lengths, with a large red dot drawn on the
# horizontal axis at 258 days.
births.hist("Gestational Days")
marker_x, marker_y = 258, 0
plots.scatter(marker_x, marker_y, s=400, color='red')

In [None]:
# Rescale 'Birth Weight' by 1/16 (presumably ounces -> pounds, per the new
# column label) and plot the distribution of the rescaled values. The
# `births` table itself is not modified.
weight_lbs = births.column('Birth Weight') / 16
births_lbs = births.with_column('Birth Weight (lbs)', weight_lbs)
births_lbs.hist('Birth Weight (lbs)')

In [None]:
# Keep only the two columns needed for the weight-per-day ratio.
# Order matters: later cells refer to these columns by index (0 and 1).
kept_columns = ('Birth Weight', 'Gestational Days')
babies = births.select(*kept_columns)
babies

In [None]:
# Element-wise ratio of the first column of `babies` to the second
# (birth weight per gestational day), added as a new column.
birth_weight = babies.column(0)
gestation = babies.column(1)
ratios = babies.with_column('Ratio BW/GD', birth_weight / gestation)
ratios

In [None]:
# Distribution of the birth-weight / gestational-days ratio in the sample.
ratio_label = 'Ratio BW/GD'
ratios.hist(ratio_label)

In [None]:
# Median of the BW/GD ratio over every row of `ratios`. Later cells mark
# this value on the bootstrap histogram.
all_ratios = ratios.column('Ratio BW/GD')
pop_median = np.median(all_ratios)
pop_median

In [None]:
# Birth weight implied by the median ratio for a 258-day pregnancy,
# divided by 16 (the same factor used above for the 'lbs' column).
gestation_days = 258
weight_units_per_lb = 16
pop_median * gestation_days / weight_units_per_lb

In [None]:
# Bootstrap a 95% confidence interval for the median BW/GD ratio in the
# population: take 1000 resamples of `ratios` (sample() is called with its
# defaults), record each resample's median, and use the 2.5th and 97.5th
# percentiles of those medians as the interval endpoints.

num_resamples = 1000
resampled_medians = np.array([
    np.median(ratios.sample().column('Ratio BW/GD'))
    for _ in range(num_resamples)
])

lower_bound = percentile(2.5, resampled_medians)
upper_bound = percentile(97.5, resampled_medians)
interval_95 = [lower_bound, upper_bound]

In [None]:
# Histogram of the bootstrap medians, with the 95% interval drawn as a gold
# bar on the horizontal axis and the full-sample median as a red dot.
median_label = 'Resampled median'
medians_table = Table().with_column(median_label, resampled_medians)
medians_table.hist(median_label)
plots.plot(interval_95, [0, 0], lw=5, color='gold')
plots.scatter(pop_median, 0, s=400, color='red')

# Maternal Ages

In [None]:
# Distribution of mothers' ages in the sample.
age_label = "Maternal Age"
births.hist(age_label)

In [None]:
# Mean maternal age over every row of `births`. The bootstrap plot below
# marks this value.
maternal_ages = births.column("Maternal Age")
pop_mean = np.mean(maternal_ages)
pop_mean

In [None]:
# Bootstrap a 95% confidence interval for the mean age of mothers in the
# population: 1000 resamples of `births`, one mean per resample, then the
# 2.5th and 97.5th percentiles of those means.

bootstrap_means = []
for _ in range(1000):
    resampled_ages = births.sample().column('Maternal Age')
    bootstrap_means.append(np.mean(resampled_ages))
resampled_means = np.array(bootstrap_means)

lower_bound = percentile(2.5, resampled_means)
upper_bound = percentile(97.5, resampled_means)
interval_95 = [lower_bound, upper_bound]

# Histogram of the bootstrap means; gold bar = 95% interval,
# red dot = mean of the original sample.
mean_label = 'Resampled mean'
Table().with_column(mean_label, resampled_means).hist(mean_label)
plots.plot(interval_95, [0, 0], lw=5, color='gold')
plots.scatter(pop_mean, 0, s=400, color='red')

* **Null:** The mean maternal age in the population is 28.
* **Alternative:** The mean maternal age in the population is not 28.

# Does smoking affect fetal growth?

* **Null:** There is no difference in growth rate (birth weight per gestational day) between smokers and nonsmokers.
* **Alternative:** The growth rate for smokers is different from the growth rate for nonsmokers.

In [None]:
def growth_rate(bw, gd):
    """Return birth weight per gestational day.

    bw: birth weight (numerator).
    gd: gestational days (denominator).
    The result is simply ``bw / gd``; no unit conversion is applied.
    """
    ratio = bw / gd
    return ratio

In [None]:
# Apply growth_rate row by row to the two named columns, then attach the
# result to `births` as 'Ratio BW/GD'. Note that this rebinds `births`, so
# the table has an extra column from this cell onward.
ratio_inputs = ("Birth Weight", "Gestational Days")
bw_per_gd = births.apply(growth_rate, *ratio_inputs)
births = births.with_column("Ratio BW/GD", bw_per_gd)
births

In [None]:
# Split the sample by the 'Maternal Smoker' flag and report the median
# BW/GD ratio within each group.
smokers = births.where("Maternal Smoker", True)
nonsmokers = births.where("Maternal Smoker", False)

smoker_median = np.median(smokers.column("Ratio BW/GD"))
nonsmoker_median = np.median(nonsmokers.column("Ratio BW/GD"))

print("Median BW/GD for smokers is", smoker_median)
print("Median BW/GD for nonsmokers is", nonsmoker_median)

In [None]:
# Birth weight implied by the nonsmokers' median ratio for a 258-day
# pregnancy, divided by 16 (the factor used earlier for the 'lbs' column).
gestation_days = 258
weight_units_per_lb = 16
nonsmoker_median * gestation_days / weight_units_per_lb

In [None]:
def test_statistic(sample):
    smoker_median = np.median(sample.where("Maternal Smoker", True).column("Ratio BW/GD"))
    nonsmoker_median = np.median(sample.where("Maternal Smoker", False).column("Ratio BW/GD"))
    return nonsmoker_median - smoker_median

In [None]:
# Value of the test statistic on the original sample
# (nonsmoker median minus smoker median of BW/GD).
observed = test_statistic(sample=births)
observed

In [None]:
# Bootstrap a 95% confidence interval for the test statistic: 1000
# resamples of `births`, the statistic recomputed on each, then the 2.5th
# and 97.5th percentiles of the recomputed values.

num_resamples = 1000
sample_stats = np.array([
    test_statistic(births.sample())
    for _ in range(num_resamples)
])

lower_bound = percentile(2.5, sample_stats)
upper_bound = percentile(97.5, sample_stats)
interval_95 = [lower_bound, upper_bound]

# Histogram of the bootstrap statistics; gold bar = 95% interval,
# red dot = statistic observed on the original sample.
diff_label = 'Difference in growth ratio'
Table().with_column(diff_label, sample_stats).hist(diff_label)
plots.plot(interval_95, [0, 0], lw=5, color='gold')
plots.scatter(observed, 0, s=400, color='red')

What is the conclusion of the hypothesis test? (Hint: check whether the 95% confidence interval for the difference contains 0.)