BerkeleyX: Data8.2x

Foundations of Data Science: Inferential Thinking by Resampling

In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Lec 4.1 Probability & Sampling

In [None]:
# Population of 100 people, Mo & Jo among them; two are sampled without repetition

# P(both Mo & Jo are in the sample): two possible draw orders, each with chance 1/100 * 1/99
p_both = 2*(1/100 * 1/99)
print(p_both)
# P(neither Mo nor Jo is in the sample): both draws have to miss the two of them
p_neither = 98/100 * 97/99
print(p_neither)


Lec 4.2 Sampling

In [None]:
# Read the movies table, add a 'Row Index' column (0 .. num_rows - 1) and move it to the front,
# then apply NumberFormatter to the two gross columns.
top = Table.read_table('../../data/top_movies_2017.csv')
row_index = np.arange(top.num_rows)
top = top.with_column('Row Index', row_index).move_to_start('Row Index')
top.set_format(['Gross', 'Gross (Adjusted)'], NumberFormatter)

In [None]:
# deterministic sample: always the rows at positions 3, 5 and 7
top.take(np.array([3, 5, 7]))

In [None]:
# another deterministic sample, this time selected by content:
# keep the rows whose 'Title' contains 'and the'
title_has_and_the = are.containing('and the')
top.where('Title', title_has_and_the)

In [None]:
# probability sample, but not a uniform one:
#  only the starting row is chosen at random (one of the first 10 positions),
#  so just a few films can ever be selected and the next four rows follow automatically
start = np.random.choice(np.arange(10))
top.take(np.arange(start, start + 5))

In [None]:
# uniform random sample of 5 rows, drawn with repetition (spelled out here; it is also the default)
top.sample(5, with_replacement=True)

In [None]:
# the default sampling is _with_ repetition:
# count the distinct titles among 100 sampled rows
top.sample(100).group('Title').num_rows

In [None]:
# uniform random sample of 5 rows, drawn without repetition
top.sample(k=5, with_replacement=False)

In [None]:
# check that sampling is now without repetition:
# count the distinct titles among 100 rows sampled without replacement
top.sample(100, with_replacement=False).group('Title').num_rows

Be careful not to rely on _samples of convenience_ - they are not random samples

Lec 4.3 Distributions

In [None]:
# table w/ the outcomes of throwing a die: one row per face, 1 through 6
# (range(7) would produce 0..6, i.e. an extra face 0 that no die has)
die = Table().with_column('face', np.arange(1, 7))
die

In [None]:
# 'normed' is explicitly switched off (None) and 'density' is used instead
def face_hist(t):
    """Draw the histogram of the 'face' column of table `t`.

    The bin edges are 0.5, 1.5, ..., 6.5, i.e. one unit-wide bin centred on
    each of the values 1 to 6. The x axis is labelled 'Face'.
    """
    face_bins = np.arange(0.5, 7, 1)
    t.hist('face', bins=face_bins, unit='face', normed=None, density=True)
    plt.xlabel('Face')

# histogram of the die table itself, every row counted once
face_hist(die)

In [None]:
# empirical histogram: the faces observed in a random sample of 10 rows
ten_rolls = die.sample(10)
face_hist(ten_rolls)

In [None]:
# with many more sampled rows, the empirical histogram gets closer to the probability histogram
many_rolls = die.sample(10000)
face_hist(many_rolls)

Lec 4.4 Large Random Samples

If the sample size is large, the empirical distribution of a uniform random sample
resembles the distribution of the population with high probability

In [None]:
# delays of United flights over a year, read from CSV
united_csv = '../../data/united.csv'
united = Table.read_table(united_csv)
united

In [None]:
# actual population distribution
def delay_hist(t):
    """Draw the histogram of the 'Delay' column of table `t`.

    Bins are 10 wide with edges running from -30 to 300; the unit label is
    'minute'. 'normed' is switched off (None) and 'density' is used instead.
    """
    delay_bins = range(-30, 301, 10)
    t.hist('Delay', bins=delay_bins, unit='minute', normed=None, density=True)
    
# plot every row of `united`, not a sample of it
delay_hist(united)

In [None]:
# the sample distribution varies with the sample size, tending to the actual distribution:
# one histogram for a sample of 10 rows, then one for a sample of 1000 rows
for n_flights in (10, 1000):
    delay_hist(united.sample(n_flights))

Lec 4.5 Simulation

In [None]:
# Roll a fair die k = 4 times: what is the chance of getting at least one six?

# by calculation:
#   P(no six in one roll) = 5/6, and the rolls are independent,
#   so P(no six in k rolls) = (5/6) ** k and the answer is its complement
k = 4
no_sixes = (5/6) ** k  # use k rather than a second hard-coded 4, so the formula follows k
1 - no_sixes

In [None]:
# a single simulation: roll the die k times and count how many sixes came up
dice = np.arange(1, 7)
rolls = np.random.choice(dice, size=k)
(rolls == 6).sum()

In [None]:
# the experiment: repeat the k-roll simulation many times;
# the fraction of repetitions showing at least one six estimates the chance
trials = 1000
successes = 0

for _ in range(trials):
    rolls = np.random.choice(dice, k)
    if np.any(rolls == 6):
        successes += 1

successes / trials

Lec 4.6 Statistics

In [None]:
# empirical distribution of a statistic

def estimate_by_simulation(trials):
    """Estimate the chance of at least one six by simulation.

    Each trial draws `k` values from `dice` (both are notebook-level names
    defined in earlier cells) and counts as a success when at least one of
    them equals 6.

    trials: number of simulated trials.
    Returns the fraction of trials that were successes.
    """
    hits = sum(
        1
        for _ in range(trials)
        if (np.random.choice(dice, k) == 6).any()
    )
    return hits / trials

# 1000 independent estimates, each computed from 1000 simulated trials
estimates = [estimate_by_simulation(1000) for _ in range(1000)]

In [None]:
# histogram of the simulated estimates, 50 bins
estimate_table = Table().with_column('Estimate', estimates)
estimate_table.hist(bins=50, normed=None, density=False)