In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 14 ##

## Random Sampling ##

In [None]:
# Read the flight records from the CSV file into a Table.
united = Table.read_table('united_summer2015.csv')
# Bin edges -50, -40, ..., 240 (width 10); shared by the 'Delay' histograms in this notebook.
united_bins = np.arange(-50,250,10)
# Histogram of the 'Delay' column over every row of the table.
united.hist('Delay', bins=united_bins)

In [None]:
# Two deterministic samples: rows are chosen by a fixed rule, so there is no randomness.
deterministic = united.where("Date", are.equal_to("7/4/15"))   # rows whose Date equals 7/4/15
deterministic2 = united.where('Delay', are.below(0))           # rows whose Delay is below 0

# Plot the 'Delay' histogram of each subset, in the order they were defined.
for chosen_rows in (deterministic, deterministic2):
    chosen_rows.hist('Delay', bins=united_bins)

In [None]:
# Random samples of 1000 rows each: the first is drawn with replacement
# (the default of Table.sample), the second without replacement.
for replace in (True, False):
    united.sample(1000, with_replacement=replace).hist('Delay', bins=united_bins)

## Distributions ##

In [None]:
# One-column table whose 'Face' column holds the integers 1 through 6.
die = Table().with_column('Face', np.arange(1, 7))
# Bare last expression so the notebook displays the table.
die

In [None]:
# Bin edges 0.5, 1.5, ..., 6.5: each unit-wide bin is centered on one face value.
die_bins = np.arange(0.5,7,1) #to make these histograms pretty

In [None]:
#Probability Distribution of the result of rolling a die
# `die` lists each face exactly once, so all six bars have the same height.
die.hist('Face',bins=die_bins)

In [None]:
#rolling a die = sampling from the probability distribution of the faces
# Empirical distribution of 10 sampled rolls.
die.sample(10).hist(bins=die_bins)

In [None]:
# Empirical distribution of 100 sampled rolls.
die.sample(100).hist(bins=die_bins)

In [None]:
# Empirical distribution of 1000 sampled rolls.
die.sample(1000).hist(bins=die_bins)

In [None]:
# Empirical distribution of 50000 sampled rolls.
die.sample(50000).hist(bins=die_bins)

## Large Random Samples ##

In [None]:
#original data
# Histogram of 'Delay' over every row of `united`, as a reference for the sampled histograms.
united.hist('Delay', bins=united_bins)

In [None]:
# 'Delay' histograms for random samples of increasing size.
for sample_size in (10, 100, 500, 5000):
    united.sample(sample_size).hist('Delay', bins=united_bins)


## Distributions of Statistics ##

In [None]:
#statistic: proportion of 6s in 100 rolls of a die
# Simulate the 100 rolls by sampling 100 rows from `die`.
hundred_rolls = die.sample(100)
# Bare last expression so the notebook displays the sampled rolls.
hundred_rolls

In [None]:
# Count the rolls whose 'Face' equals 6, then divide by the number of rolls.
proportion_sixes = np.count_nonzero(hundred_rolls.column('Face') == 6)/hundred_rolls.num_rows
proportion_sixes

In [None]:
#let's not think about the distribution of the rolls
#hundred_rolls.hist(bins=die_bins)

#instead, we write a function that calculates a statistic from 100 rolls
def prop_sixes():
    """Sample 100 rows from `die` and return the fraction whose 'Face' is 6."""
    faces = die.sample(100).column('Face')
    # Number of sixes divided by the number of rolls in the sample.
    return np.count_nonzero(faces == 6) / len(faces)

In [None]:
# Every call draws a new sample of 100 rolls, so the displayed value changes between runs.
prop_sixes() #run this cell a bunch of times

In [None]:
# make python run this for us lots of times
# Start from an empty array and append one simulated proportion per repetition.
# The repetition count (5000) is a choice; any large number serves the same purpose.
simulated_proportions = make_array()
for i in np.arange(5000):
    one_result = prop_sixes()
    simulated_proportions = np.append(simulated_proportions, one_result)

In [None]:
# Histogram of the simulated proportions; bin edges start at 0.05 and step by 0.03, stopping before 0.3.
Table().with_column('Simulated Proportions', simulated_proportions).hist('Simulated Proportions', bins=np.arange(0.05,0.3,0.03))

In [None]:
#this time, 15000 repetitions
#so we should be even closer to the theoretical distribution of this statistic
# One call to prop_sixes() per repetition, gathered into a single array.
simulated_proportions2 = np.array([prop_sixes() for _ in range(15000)])

# Same column label and bin edges as the earlier simulated-proportions histogram.
proportions_table = Table().with_column('Simulated Proportions', simulated_proportions2)
proportions_table.hist('Simulated Proportions', bins=np.arange(0.05,0.3,0.03))


### A different statistic ###

In [None]:
#a weird statistic: absolute value of difference between mean and median in 100 rolls
def simulate_and_get_weird_stat():
    """Simulate 100 die rolls and return |mean - median| of the rolled faces.

    Returns:
        The absolute difference between the mean and the median of the
        'Face' values in a fresh sample of 100 rows drawn from `die`.
    """
    hundred_rolls = die.sample(100)
    faces = hundred_rolls.column('Face')
    one_stat = abs(np.mean(faces) - np.median(faces))
    return one_stat


In [None]:
# Run the simulation 5000 times, keeping one statistic per run.
stats = np.array([simulate_and_get_weird_stat() for _ in range(5000)])

# Empirical distribution of the simulated statistic.
stats_table = Table().with_column('Simulated Stats', stats)
stats_table.hist('Simulated Stats')

### Another statistic distribution - back to flights ###

In [None]:
def flight_sample_mean():
    """Draw a random sample of 500 rows from `united` and return its mean 'Delay'.

    Returns:
        The mean of the 'Delay' column of the sampled rows.
    """
    sample = united.sample(500)
    return np.mean(sample.column('Delay'))

In [None]:
# Collect the sample mean from 1000 independent calls to flight_sample_mean().
means = np.array([flight_sample_mean() for _ in range(1000)])
    

In [None]:
# Histogram of the simulated sample means with unit-wide bins whose edges run from 11 to 22.
Table().with_column('Sample Means', means).hist('Sample Means', bins=np.arange(11,23,1))

### Gary's Model ###

In [None]:
#question: what's the distribution of the # of heads in 10 flips of a fair coin?

#experiment: 10 flips of a fair coin

#statistic of interest: number of heads

#strategy: approximate the probability distribution by simulating the experiment lots of times

In [None]:
def ten_flips():
    '''a function that returns the number of heads in ten flips of a fair coin

    The first entry of the model is treated as "heads". The result is a whole
    number between 0 and 10.
    '''
    num_flips = 10
    model = make_array(0.5, 0.5)
    ten_flips_proportions = sample_proportions(num_flips, model) #will explain tomorrow
    # proportion * 10 is not always exact in floating point
    # (0.7 * 10 == 7.000000000000001), so round back to the whole count.
    num_heads = round(ten_flips_proportions.item(0) * num_flips)
    return num_heads

In [None]:
repetitions = 500 #does this count as "lots of times?"

# Start from an empty array and append the number of heads from each repetition.
coin_stats = make_array()

for i in np.arange(repetitions):
    one_statistic = ten_flips()
    coin_stats = np.append(coin_stats, one_statistic)

# The statistic only takes the whole values 0..10. Default bins would lump
# neighbouring counts together or leave gaps, so use unit-wide bins with
# edges -0.5, 0.5, ..., 10.5, each centered on one possible count.
coin_bins = np.arange(-0.5, 11, 1)
Table().with_column('Coin Statistics', coin_stats).hist('Coin Statistics', bins=coin_bins)