This notebook seeks to convey a core concept of inferential thinking - given some set of observations about a small sample of a population, attempt to draw robust conclusions about the (unobservable) population.

Here we create a hypothetical population through simulation. It is based on the historical discussion in the Data 8 lecture about estimating the size of foreign bomber fleets from observations of their tail markings.

In [None]:
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')
# datascience version number of last run of this notebook
version.__version__

In [None]:
# The magic number - size of the population that (in the real world) 
# we don't know and want to estimate

def createPopulation(size=37*55 - 1):
    """Build the simulated population as a one-column table of serial numbers.

    Parameters
    ----------
    size : int, optional
        Number of items in the population; serial numbers run from 1 to
        ``size`` inclusive. The default (37*55 - 1 = 2034) reproduces the
        population this notebook has always used: the original code called
        ``np.arange(1, 37*55)``, and ``arange`` excludes its stop value.

    Returns
    -------
    Table
        A table with a single "Ser No" column holding the integers
        1..size, with a formatter attached that renders each value
        zero-padded to five digits.
    """
    def serNo(x):
        # Formatter for the "Ser No" column, e.g. 42 -> "00042"
        return "{:05d}".format(x)

    # arange's stop is exclusive, hence size + 1 to include the last serial number
    p = Table([np.arange(1, size + 1)], ["Ser No"])
    p.set_format("Ser No", serNo)
    return p

In [None]:
# Create a simulation of the population as a table - ordered collection of named columns
population = createPopulation()
population

In [None]:
# computational thinking - simulate observing a sample of the population
sample_size = 10

In [None]:
# Draw one sample of `sample_size` rows from the population, with replacement
population.sample(sample_size,with_replacement=True)

In [None]:
# Simulate observing multiple samples
nsamples = 30

In [None]:
# Use iteration to build a table of samples: each pass draws `sample_size`
# rows (with replacement) and stores their serial numbers as a new column
# labelled "sample-0", "sample-1", ...
samples = Table()
for sample_index in range(nsamples):
    drawn_rows = population.sample(sample_size, with_replacement=True)
    column_label = "sample-{}".format(sample_index)
    samples[column_label] = drawn_rows["Ser No"]
samples

In [None]:
# gracefully transition between tables and arrays
samples['sample-0']

In [None]:
# Define a function that formally captures one idea about how to do the estimation
def estimateA(smpl):
    """Estimate the population size as the largest value in the sample.

    Parameters
    ----------
    smpl : array-like
        The observed sample values (serial numbers in this notebook).

    Returns
    -------
    The maximum element of ``smpl``, as computed by ``np.max``.
    """
    largest_seen = np.max(smpl)
    return largest_seen

In [None]:
# You might come up with lots of other estimators
def estimateB(smpl):
    """Estimate the population size as twice the mean of the sample.

    Parameters
    ----------
    smpl : array-like
        The observed sample values (serial numbers in this notebook).

    Returns
    -------
    ``2 * np.mean(smpl)``, a floating-point estimate.
    """
    sample_mean = np.mean(smpl)
    return 2 * sample_mean

In [None]:
#verify it works
estimateA(samples["sample-0"])

In [None]:
# illustrate list comprehension to explore data
[estimateA(samples[s]) for s in samples]

In [None]:
# Build a table of estimates: one estimateA value per sample column
per_sample_estimates = [estimateA(samples[label]) for label in samples]
estA = Table([per_sample_estimates], ['ests'])
estA

In [None]:
# Look at the behavior of this estimator as a histogram
estA.hist(range=(1,np.max(estA['ests'])),bins=20)

In [None]:
# Computational thinking: the estimator is passed in as a function
# (a higher-order function) and applied to build a table of estimates
def estimate(estimator, sample_table=None):
    """Apply an estimator to every sample and collect the results in a table.

    Parameters
    ----------
    estimator : callable
        Function that takes one sample (a column of values) and returns a
        single estimate, e.g. ``estimateA`` or ``estimateB``.
    sample_table : Table, optional
        Table whose columns are the samples to estimate from. When omitted,
        the notebook-level ``samples`` table is used (looked up at call
        time), which matches the original behavior.

    Returns
    -------
    Table
        A one-column table labelled 'ests' holding one estimate per column
        of ``sample_table``.
    """
    if sample_table is None:
        # Fall back to the notebook global so existing calls keep working
        sample_table = samples
    return Table([[estimator(sample_table[s]) for s in sample_table]], ['ests'])

In [None]:
# Apply the second estimator (twice the sample mean) to every sample
estB = estimate(estimateB)

In [None]:
# Histogram of the estimateB results: 20 bins spanning 1 up to the largest estimate
estB.hist(range=(1,np.max(estB['ests'])),bins=20)

In [None]:
# Combine both sets of estimates into one table, one column per estimator ('estA', 'estB')
comp = Table([estA['ests'],estB['ests']],['estA','estB'])

In [None]:
comp

In [None]:
# Overlay the two estimators' histograms on shared bins of width 50
# (bin edges 1000, 1050, ..., 2450 -- np.arange excludes the stop value 2500)
comp.hist(overlay=True, bins=np.arange(1000,2500,50))

In [None]:
# How do these estimates compare with the true size of the population?
population.num_rows

In [None]:
# Produce a table containing the data associated with a histogram
ebins = comp.bin(bins=np.arange(1000,2500,50))
ebins.show()