In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

# Normal Distributions

## Standard Units ##

In [None]:
# Read the births data set from the local CSV file into a Table
births = Table.read_table('data/baby.csv')
# Preview only the first 3 rows instead of rendering the whole table
births.show(3)

In [None]:
# A function to convert an array to standard units
def standard_units(x):
    """Convert the values in x to standard units.

    A value in standard units says how many standard deviations the
    original value lies above (positive) or below (negative) the mean of x.

    Parameters
    ----------
    x : array-like of numbers
        A NumPy array, or a plain sequence such as a list or tuple.

    Returns
    -------
    Array with one entry per element of x: ``(x - mean(x)) / SD(x)``, where
    the SD is ``np.std`` with its default settings (population SD).  When
    all values of x are equal the SD is 0, so the division yields NaN, as
    NumPy reports it.
    """
    # np.subtract accepts any array-like; the bare `x - mean` form raises
    # TypeError when x is a plain list or tuple.
    deviations = np.subtract(x, np.mean(x))
    return deviations / np.std(x)

In [None]:
# Pull the 'Maternal Age' column out of the births table as an array
ages = births.column('Maternal Age')
# Bare last expression: the notebook renders the array as the cell's output
ages

In [None]:
# Convert the maternal ages to standard units
ages_standard_units = standard_units(ages)
# Show the converted values as plain printed text
print(ages_standard_units)

In [None]:
# Sanity check: in standard units the mean should be 0 and the SD 1
# (up to floating-point rounding)
ages_standard_units.mean(), ages_standard_units.std()

In [None]:
# Pair each age in years with the same age expressed in standard units
both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units,
)
both

In [None]:
# Display the average and standard deviation in the original units (years)
ages.mean(), ages.std()

In [None]:
# Histogram of the ages in years; bin edges run 15, 17, ..., 45
both.hist('Age in Years', bins = np.arange(15, 46, 2))

In [None]:
# Confirm the distribution keeps the same shape when drawn in standard units
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
# Set the visible x-range; the trailing ';' keeps the returned limits out of the output
plots.xlim(-2, 3.1);

## The SD and Bell-Shaped Curves

In [None]:
# Histogram of maternal heights; unit-wide bins with edges on the .5 marks
# (56.5, 57.5, ..., 72.5) so each whole-number height falls mid-bin
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))

In [None]:
# Mean and standard deviation of the maternal heights
heights = births.column('Maternal Height')
heights.mean(), heights.std()

In [None]:
# Estimate the inflection points as one SD above and one SD below the mean
heights.mean() + heights.std(), heights.mean() - heights.std()

## Central Limit Theorem ##

In [None]:
# Read the United data from the local CSV file into a Table
united = Table.read_table('data/united.csv')
# Bin edges for the delay histogram: -20, -10, ..., 290
united_bins = np.arange(-20, 300, 10)
# Bare last expression: the notebook renders the table as the cell's output
united

In [None]:
# Inspect the histogram of the Delay column, drawn with the united_bins edges
united.hist('Delay', bins=united_bins)

In [None]:
# Mean and SD of the Delay column over every row of the united table
delays = united.column('Delay')
delay_mean, delay_sd = delays.mean(), delays.std()
delay_mean, delay_sd

In [None]:
def one_sample_mean(sample_size):
    """Return the mean delay of one random sample of flights.

    Draws `sample_size` rows from the `united` table with `Table.sample`
    (called with its default options) and averages the 'Delay' column of
    the rows drawn.
    """
    return np.mean(united.sample(sample_size).column('Delay'))

In [None]:
# Try it once: the mean delay of a single sample of 100 rows
one_sample_mean(100)

In [None]:
def ten_thousand_sample_means(sample_size):
    """Simulate 10,000 sample means.

    Calls ``one_sample_mean(sample_size)`` 10,000 times and collects the
    results.

    Parameters
    ----------
    sample_size : int
        Size of each sample; passed unchanged to ``one_sample_mean``.

    Returns
    -------
    numpy.ndarray
        Float array of length 10,000 holding the sample means in the order
        they were generated.
    """
    repetitions = 10000
    # Preallocate and fill in place: np.append returns a fresh copy of the
    # whole array on every call, which makes a grow-by-append loop quadratic.
    means = np.empty(repetitions)
    for i in range(repetitions):
        means[i] = one_sample_mean(sample_size)
    return means

In [None]:
# Create an array with 10000 sample means of the Delays,
# each computed from a sample of size 100
sample_means_100 = ten_thousand_sample_means(100)

In [None]:
# Inspect the array of sample means (bare expression, rendered as the cell's output)
sample_means_100

In [None]:
# Verify there are 10000 sample means, one per call to one_sample_mean
len(sample_means_100)

In [None]:
# Inspect the histogram of the 10,000 sample means (each from a sample of size 100)
Table().with_column('Mean of 100 flight delays', sample_means_100).hist(bins=20)
# Print the mean of the full Delay column for comparison with the histogram
print('Population Average:', delay_mean)

In [None]:
# How does increasing the sample size impact the distribution?
# Repeat the simulation with a sample size of 400 and plot the sample means
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column('Mean of 400 flight delays', sample_means_400).hist(bins=20)
# Print the mean of the full Delay column for comparison with the histogram
print('Population Average:', delay_mean)

In [None]:
# Increase the sample size again, this time to 900, and plot the sample means
sample_means_900 = ten_thousand_sample_means(900)
Table().with_column('Mean of 900 flight delays', sample_means_900).hist(bins=20)
# Print the mean of the full Delay column for comparison with the histogram
print('Population Average:', delay_mean)

In [None]:
# Put the three arrays of sample means (sample sizes 100, 400 and 900) in one
# table and draw them with a single hist call so their spreads can be compared
Table().with_columns('100', sample_means_100, '400',sample_means_400, '900', sample_means_900).hist(bins=40)