In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

In [None]:
# Quick demo that may help with Homework 7: Part 2, question 14.
# Suppose we have a short array of common greetings...
common_greetings = make_array("Hi", "Howdy", "Hello", "Hey")

# ...and a second, longer array of greetings.
large_array_greetings = make_array("Hi", "Bonjour", "Hi", "Hello", "Hola", "Cheers", "Hi",
                                   "Yo", "What's up", "Hello", "Hi", "Howdy",
                                   "Hi", "Hey", "Hi", "Hello", "Hola", "Cheers", "Hello")

# Put all of the retrieved greetings into a one-column table.
all_greetings = Table().with_column("Greetings", large_array_greetings)
# A bare expression is only rendered when it is the last line of a cell,
# so the preview has to be requested explicitly here.
all_greetings.show(5)

# Use are.contained_in() to count the rows whose greeting is also
# an element of the common_greetings array.
all_greetings.where("Greetings", are.contained_in(common_greetings)).num_rows




## REVIEW: Estimation

**Please run all cells before this cell, including the previous example cells and the import cell at the top of the notebook.**

In [None]:
#We looked at SB data from 2020 on Wages of SB City Government Employees
# Read the CSV and keep only the columns at positions 0, 1 and 4.
sb = Table.read_table('santa_barbara_2020.csv').select(0, 1, 4)
# Attach a display formatter to the wage column
# (NumberFormatter(0): presumably zero decimal places -- TODO confirm).
sb.set_format('TotalWages', NumberFormatter(0))
# Keep only the rows whose TotalWages is above 10,000.
sb = sb.where('TotalWages', are.above(10000))
# Preview the first 3 rows.
sb.show(3)

In [None]:
# 50th percentile of the population's TotalWages, using `percentile`
# (presumably the helper brought in by the datascience star-import).
fiftieth = percentile(50, sb.column('TotalWages'))
fiftieth

In [None]:
# Median of the same TotalWages column, this time computed with NumPy.
median = np.median(sb.column('TotalWages'))
median

#### What is the difference between percentile and median?

In [None]:
# Population distribution of total wages.
# Bin edges run from 10,000 up to (not including) 350,000 in steps of 10,000.
comp_bins = np.arange(10000, 350000, 10000)
sb.hist('TotalWages', unit="dollar", bins=comp_bins)
# Red dot on the horizontal axis at the value stored in `median`.
plots.scatter(x=median, y=0, s=30, color='red', zorder=3)


In [None]:
# Here we have the whole population. If we were not able to get the whole population,
# we would have to work with a sample instead.

In [None]:
# Draw a random sample of 200 rows from the population table, without replacement.
sample_from_population = sb.sample(200, with_replacement=False)
# Preview the first 3 rows of the sample.
sample_from_population.show(3)

In [None]:
#What is the median and 50%ile of this sample?
# This cell shows the 50th percentile of the sample's TotalWages.
percentile(50, sample_from_population.column('TotalWages'))

In [None]:
# What is the true 50th percentile, i.e. the one computed from the whole population table `sb`?
percentile(50, sb.column('TotalWages'))

In [None]:
# Median of the sample's TotalWages, computed with NumPy.
np.median(sample_from_population.column('TotalWages'))

What is the difference between the median and the 50th percentile?

If we keep drawing samples like this over and over again from the population, how close are we to the true parameter?  Is there a pattern that emerges? Can we capture this uncertainty in the way we describe our findings?

In [None]:
# Let's simulate this!
# Draw 100 samples of 200 rows each (without replacement) from the
# population and record the median wage of every sample.
medians = make_array()
repetitions = np.arange(100)
for i in repetitions:
    sample = sb.sample(200, with_replacement=False)
    # Use a loop-specific name so that the population `median` computed in
    # an earlier cell is not overwritten by the last sample's median.
    sample_median = np.median(sample.column('TotalWages'))
    medians = np.append(medians, sample_median)


In [None]:
# Visualize the variability of our estimate in a scatterplot.
# The population median is the parameter the samples are trying to estimate.
parameter = np.median(sb.column('TotalWages'))

# One point per trial: trial number on x, that trial's sample median on y.
Table().with_columns('trial', repetitions, 'median', medians).scatter('trial')
plots.ylim(60000, 100000)
# Green horizontal line at the parameter. plot() takes the x-values and then
# the y-values; a further positional number is interpreted as a second,
# one-point data series, so only the two coordinate lists are passed.
plots.plot([0, 100], [parameter, parameter], color='green', lw=3, zorder=1);

In [None]:
#Visualize the variability of our estimate in a histogram

# Histogram of column 0 of a one-column table holding the values in `medians`.
Table().with_column('medians', medians).hist(0)
# Red dot on the horizontal axis at `fiftieth`.
# NOTE(review): the values in the histogram come from np.median while this
# marker comes from percentile(50, ...) -- confirm that this is intended.
plots.scatter(fiftieth, 0, color='red', s=30, zorder=3)

## The Bootstrap

**Please run all cells before this cell, including the previous example cells and the import cell at the top of the notebook.**

In [None]:
# Bootstrap: resample the one sample we have 1000 times and record the
# median of every resample.
medians = make_array()

for i in np.arange(1000):
    # sample() is called with its defaults here
    # (presumably same number of rows, drawn with replacement -- TODO confirm).
    resample = sample_from_population.sample()
    # Use a loop-specific name so that the population `median` computed in
    # an earlier cell is not overwritten by the last resample's median.
    resampled_median = np.median(resample.column('TotalWages'))
    medians = np.append(medians, resampled_median)

Table().with_column('Resampled median', medians).hist()

In [None]:
# Repeat the whole bootstrap procedure 50 times, each time starting from a
# fresh sample of the population, and collect one interval per repetition.
intervals = Table(['Lower', 'Upper'])

for j in np.arange(50):
    # Names specific to this experiment: reassigning `sample_from_population`,
    # `medians` or `median` here would silently change what the earlier
    # cells see if they are run again.
    fresh_sample = sb.sample(200, with_replacement=False)
    bootstrap_medians = make_array()
    for i in np.arange(1000):
        resample = fresh_sample.sample()
        resampled_median = np.median(resample.column('TotalWages'))
        bootstrap_medians = np.append(bootstrap_medians, resampled_median)

    # Interval from the 5th to the 95th percentile of the bootstrap medians.
    interval_90 = [percentile(5, bootstrap_medians),
                   percentile(95, bootstrap_medians)]

    # The result of append() is not reassigned; later cells read the rows
    # back from `intervals` itself.
    intervals.append(interval_90)

In [None]:
# Count the intervals that contain the population median:
# lower end not above the parameter AND upper end not below it.
parameter = np.median(sb.column('TotalWages'))
correct = (
    intervals
    .where('Lower', are.not_above(parameter))
    .where('Upper', are.not_below(parameter))
)
correct.num_rows

In [None]:
# Draw each of the 50 intervals as a horizontal blue segment at height i,
# plus a vertical green line at the parameter.
Lower = intervals.column(0)
Upper = intervals.column(1)
plots.ylim(-1, 50)
plots.xlim(60000, 110000)

for i in np.arange(50):
    plots.plot([Lower[i], Upper[i]], [i, i], color='blue', lw=1, zorder=1)
# plot() takes the x-values and then the y-values; a further positional
# number is interpreted as a second, one-point data series, so only the two
# coordinate lists are passed.
plots.plot([parameter, parameter], [0, 50], color='green', lw=3, zorder=1);

## Bootstrap: example 2: Mean maternal age

In [None]:
#Remember this table?
births = Table.read_table('baby.csv')
births.show(5)

In [None]:
#TODO: Can we first have a look at the distribution focussing on 'Maternal Age' in this sample?
# Histogram of the column at index 2 (presumably 'Maternal Age' -- verify against baby.csv).
births.hist(2)

In [None]:
#TODO: Can we estimate the average maternal age from this sample?

In [None]:
#TODO: Let's calculate the average/mean maternal age of this sample. Is this a good estimate?
# Mean of the column at index 2 (presumably 'Maternal Age' -- verify against baby.csv).
mean_age = np.mean(births.column(2))
mean_age

In [None]:
# Let's look at the distribution and add a dot for where the mean is in this sample
births.hist(2)
# Red dot at x = mean_age, y = 0.002.
plots.scatter(mean_age,0.002,color="red");

This is just a sample. We know that it was a convenience/deterministic sample (we just sampled a convenient group of babies who were born at a specific hospital, etc.).
But let's now assume it is a random sample that represents the population. Can we estimate the average age
and provide a confidence interval using the bootstrap technique?

In [None]:
# Step 1: Resample
#Step 2: Look at the average
# Step 3: Do this many times over (repeat!)
# Step 4: Decide on an acceptable confidence interval for our process (in %) decided 99%
# Step 5: Look at the data

In [None]:
# Simulate the bootstrap: resample the births table 1000 times and keep the
# average of column 2 from every resample, to see how much the averages vary.
averages = make_array()
for i in np.arange(1000):
    bootstrap_sample = births.sample()
    bootstrap_mean = np.average(bootstrap_sample.column(2))
    averages = np.append(averages, bootstrap_mean)

averages


In [None]:
# Before we look at the data, let's decide what an acceptable confidence level would be (90, 95, 97, 99??)
# Here: the 95% confidence interval, bounded by the 2.5th and 97.5th percentiles of the averages.
left, right = percentile(2.5, averages), percentile(97.5, averages)
(left, right)


In [None]:
#TODO: Calculate left and right values for our confidence interval (use percentile)

In [None]:
#Visualize
Table().with_column('Bootstrap Average', averages).hist()

plots.plot([left,right], [0,0], color="gold", lw=10, zorder=1);
plots.scatter(mean_age,0.05,color="blue", zorder=2);
plots.title('Bootstrap Means (1K Bootstraps from our Sample)');
