# Confidence Intervals

In [None]:
from datascience import *
from cs104 import *
import numpy as np
%matplotlib inline

## 1. Pea plants

Population: all 2nd generation plants 

Sample: Mendel's garden: 929 plants, 709 of which had purple flowers

Statistic: Percent Purple

### Load Data 

In [None]:
# Load the sample of Mendel's garden from CSV.  read_table is called on the
# Table class itself, so no empty Table() has to be constructed first.
mendel_garden = Table.read_table('data/mendel_garden_sample.csv')
# Preview the table, limited to 4 rows.
mendel_garden.show(4)

In [None]:
# Row count of the sample table (presumably one row per plant in the sample).
mendel_garden.num_rows

In [None]:
# Pull out the "Color" column; this is the data the statistic is computed on
# and the data we resample from when bootstrapping.
color_array = mendel_garden.column("Color")

Our statistic is the percent purple. 

In [None]:
def percent_purple(color):
    """
    Return the percentage (0 to 100) of entries in color equal to "Purple".

    color may be an array or any other sequence of labels (list, tuple, ...).
    It is converted to a NumPy array first so that the comparison below is
    always element-wise, even for a plain Python list.
    """
    colors = np.asarray(color)
    # Count the True entries of the boolean mask in one vectorized pass.
    proportion = np.count_nonzero(colors == "Purple") / len(colors)
    return proportion * 100

In [None]:
# The statistic computed on the original sample: percent of plants that are purple.
observed_stat = percent_purple(color_array)
observed_stat

### Bootstrapping 

Now we're ready for our `bootstrap_statistic` function from our [inference library](https://www.cs.williams.edu/~cs104/auto/inference-library-ref.html).

In [None]:
# Apply percent_purple to bootstrap resamples of color_array.
# presumably 1000 is the number of resamples -- confirm against the inference library reference
results = bootstrap_statistic(color_array, percent_purple, 1000)

In [None]:
# Histogram of the bootstrapped statistics, with the observed statistic marked on it.
table = Table().with_columns("Bootstrap Samples Percent Purple", results)
plot = table.hist("Bootstrap Samples Percent Purple")
plot.dot(observed_stat)

## 2. Confidence Intervals

### Percentiles

In [None]:
# A small example array of four values for trying out percentile by hand.
tiny_purple_stat = make_array(78, 70, 88, 82)
tiny_purple_stat

In [None]:
# 50th percentile of the four example values.
percentile(50, tiny_purple_stat)

In [None]:
# 75th percentile of the four example values.
percentile(75, tiny_purple_stat)

### Confidence Intervals for Pea Plants 

In [None]:
# Confidence level, expressed as a percent.
ci_percent = 95

In [None]:
# The interval keeps the middle ci_percent of the values, so the remaining
# percent is split evenly between the left and right tails.
percent_in_each_tail = (100 - ci_percent) / 2
percent_in_each_tail

In [None]:
# Lower endpoint: the percentile of the bootstrap statistics that cuts off the left tail.
left_end = percentile(percent_in_each_tail, results)
left_end

In [None]:
# Upper endpoint: the percentile of the bootstrap statistics that cuts off the right tail.
right_end = percentile(100 - percent_in_each_tail, results)
right_end

This function, which is also in our [inference library](https://www.cs.williams.edu/~cs104/auto/inference-library-ref.html), computes the desired confidence interval for an array of statistics. 

In [None]:
def confidence_interval(ci_percent, statistics):
    """
    Compute the ci_percent confidence interval for an array of statistics.

    The interval is the middle ci_percent of the values; whatever is left over
    is split evenly between the two tails.  The result is an array holding the
    lower bound followed by the upper bound.
    """
    tail = (100 - ci_percent) / 2
    lower_percent, upper_percent = tail, 100 - tail
    bounds = [percentile(p, statistics) for p in (lower_percent, upper_percent)]
    return make_array(*bounds)

In [None]:
# 95% confidence interval of the bootstrap statistics, as an array of (lower, upper).
ci_95 = confidence_interval(95, results)
ci_95

In [None]:
# Same histogram of the bootstrapped statistics as before, now with the
# 95% confidence interval drawn on it alongside the observed statistic.
table = Table().with_columns("Bootstrap Samples Percent Purple", results)
plot = table.hist("Bootstrap Samples Percent Purple")
plot.interval(ci_95)
plot.dot(observed_stat)

### Different Confidence Intervals

We can use confidence levels other than 95% too!  Here is how the level impacts the size of the interval.

Our starting point:

In [None]:
# 2.5% in each tail: from the 2.5th to the 97.5th percentile.
confidence_interval(95, results)

If we're okay with less confidence:

In [None]:
# 5% in each tail: from the 5th to the 95th percentile.
confidence_interval(90, results)

If we want more confidence:

In [None]:
# 0.5% in each tail: from the 0.5th to the 99.5th percentile.
confidence_interval(99, results)

We can see the impact of confidence level on the width of the interval more easily in the plots below.

In [None]:
def visualize_ci(ci_percent):
    """
    Draw the histogram of the Mendel bootstrap statistics computed above,
    marking the ci_percent confidence interval and the observed statistic.
    """
    column_label = "Bootstrap Samples Percent Purple"
    title = str(ci_percent) + "% Confidence Interval"
    interval = confidence_interval(ci_percent, results)

    histogram = Table().with_columns(column_label, results).hist(column_label)
    histogram.set_title(title)
    histogram.interval(interval)
    histogram.dot(observed_stat)

In [None]:
# Draw the same bootstrap histogram at four confidence levels in one 1x4 figure.
with Figure(1, 4, figsize=(5, 4)):
    for level in (50, 90, 95, 99):
        visualize_ci(level)

In [None]:
# Interactive version: visualize_ci is re-run with ci_percent taken from the slider.
# presumably Slider(0,100,1) means min 0, max 100, step 1 -- confirm against the cs104 reference
interact(visualize_ci, ci_percent=Slider(0,100,1))

### Interpreting Confidence

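Here are 25 runs of our process, each on a new random sample.  We expect about 95% of such runs to produce confidence intervals containing the true parameter (75%), so typically only one or two of the 25 plots will miss it (those plots are shaded red).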

In [None]:
def one_interval(ci_percent):
    """
    Simulate one complete run of the process on a fresh sample and plot it.

    Draws a new sample of 929 plants from a population that is 75% purple,
    bootstraps percent_purple on that sample, and plots the histogram of the
    bootstrap statistics together with the ci_percent confidence interval,
    the sample's own statistic and the true parameter.  The plot background
    is made reddish when the interval misses the true parameter.
    """
    true_percent = 75
    column_label = "Bootstrap Samples Percent Purple"

    # three 'Purple' entries to one 'White' make the population 75% purple
    population = make_array('Purple', 'Purple', 'Purple', 'White')
    sample = np.random.choice(population, 929)

    bootstrapped = bootstrap_statistic(sample, percent_purple, 1000)
    plot = Table().with_columns(column_label, bootstrapped).hist(column_label)

    lower, upper = confidence_interval(ci_percent, bootstrapped)
    plot.interval(lower, upper)
    plot.dot(percent_purple(sample))
    plot.square(true_percent)

    # a miss: the true parameter falls below the lower end or above the upper end
    missed = lower > true_percent or upper < true_percent
    if missed:
        plot.ax.set_facecolor("#F2DDD9")

In [None]:
# Repeat the whole experiment 25 times, one plot per cell of a 5x5 figure.
with Figure(5, 5, figsize=(6, 4)):
    for i in range(25):
        one_interval(95)