In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Configure for presentation
# Summarize arrays with more than 50 elements and wrap printed arrays at 50 characters per line.
np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
# Use font size 16 as the matplotlib default so plot text is readable when presented.
mpl.rc('font', size=16)

#Table.interactive_plots()
# Use static (non-interactive) Table plots; the interactive alternative is left commented out above.
Table.static_plots()

## Sample size & its relation to distributions

In [None]:
# Read the salary CSV into a Table; the bare name on the last line displays it as the cell output.
nba_salaries = Table.read_table("nba_salaries.csv")
nba_salaries

In [None]:
## This is population-level data: it covers all of the players in the NBA
# Let's look at the true (population) values:
def nba_dist(tbl):
    """Print the mean salary and row count of `tbl`, then plot its salary histogram.

    Parameters
    ----------
    tbl : Table
        Must contain a "'15-'16 SALARY" column. The printed message describes
        its values as millions of dollars.

    Returns
    -------
    None
        The two summary lines are printed and the histogram is drawn as side
        effects.
    """
    # Refer to the salary column by label for both the mean and the histogram,
    # so the two always describe the same column regardless of column order.
    salary_label = "'15-'16 SALARY"
    mean_salary = np.mean(tbl.column(salary_label))
    print("The average salary in millions of dollars for NBA players is " + str(mean_salary))
    print(str(tbl.num_rows) + " players are shown in this distribution.")
    # Bin edges run from 0 to 25 in steps of 2.5.
    tbl.hist(salary_label, bins = np.arange(0, 26, 2.5))

nba_dist(nba_salaries)

In [None]:
# Draw 10 rows at random without replacement, so no row can be picked twice.
sample = nba_salaries.sample(10, with_replacement = False)

# Same summary as for the full table, now computed on the 10-row sample only.
nba_dist(sample)

In [None]:
## In general: as the sample size n approaches the population size N, the empirical/sampled
# distribution more closely resembles the theoretical/probability distribution

## Coin flipping - simulations

In [None]:
# A one-column Table with two rows, one for each side of the coin.
coin = Table().with_column("Side", make_array(0, 1))
coin #Let's treat 1 as a "Heads" and 0 as a "Tails"

In [None]:
## The probability distribution: both heads and tails are equally likely, so...
# np.arange(1+2) gives the bin edges 0, 1, 2: one bin starting at 0 (tails) and one starting at 1 (heads).
coin.hist(0, bins = np.arange(1+2))

In [None]:
def sample_hist(sample_size):
    """Flip a coin `sample_size` times and plot a histogram of the outcomes.

    The flips are rows drawn from the `coin` table with `Table.sample`'s
    default settings. The histogram is drawn over the first column using
    bin edges 0, 1, 2. Nothing is returned; the plot is the only result.
    """
    unit_edges = np.arange(1+2)
    flips = coin.sample(sample_size)
    flips.hist(0, bins=unit_edges)
    ## Why do I not need to return anything in this function? 

In [None]:
## Flip a coin once and see how many heads/tails we get.
sample_hist(1)

In [None]:
sample_hist(5)

In [None]:
sample_hist(20)

In [None]:
sample_hist(100)

In [None]:
sample_hist(1000)

In [None]:
sample_hist(1000000)

## Many experiments - the law of large numbers

In [None]:
## Let's continue using the coin example.
# Make sure you differentiate between the sample size and the number of trials/experiments.
# In this case, sample size = num_coins (coins flipped in one experiment) and trials = num_flips (number of experiments)

def coin_experiment(num_coins, num_flips):
    """Repeat a coin-flipping experiment and plot the proportions of heads.

    Each experiment samples `num_coins` rows from the `coin` table and records
    the mean of its first column, i.e. the proportion of rows equal to 1.

    Parameters
    ----------
    num_coins : int
        Sample size: how many rows are drawn from `coin` in one experiment.
    num_flips : int
        Number of trials: how many times the experiment is repeated.

    Returns
    -------
    None
        The histogram of the recorded proportions is drawn as a side effect.
    """
    # Collect every proportion first and build the array once. Growing an
    # array with np.append copies the whole array on every iteration, which
    # is quadratic in the number of trials.
    results = np.array(
        [np.mean(coin.sample(num_coins).column(0)) for _ in np.arange(num_flips)],
        dtype=float,
    )
    # Bins are 0.05 wide and start at 0.25.
    Table().with_column("Proportion Heads", results).hist(0, bins = np.arange(0.25, 0.8, 0.05))

So why do we want to use computation in this case? For example, imagine we are flipping a coin 5 times and measuring the proportion of heads.

To find the exact probability distribution, we would need to count every sequence of flips that produces each possible proportion of heads.

For example, exactly 1 head can occur as HTTTT, THTTT, TTHTT, TTTHT, or TTTTH; exactly 2 heads can occur as HHTTT, THHTT, TTHHT, TTTHH, HTTTH, ... (to list a few).

This isn't too bad for small numbers of flips, but it quickly becomes very difficult when the setup is less simple or when there are many flips. Simulation lets us approximate the distribution instead!

The issue is that there is random chance involved, so although we know a coin should be 50/50, there is some variation when we do an experiment.

In [None]:
coin_experiment(50, 10)

In [None]:
coin_experiment(50, 100)

In [None]:
coin_experiment(50, 1000)

In [None]:
coin_experiment(50, 10000)

## Mendelian genetics

In [None]:
# Three "Pink" rows and one "White" row, so a uniformly random draw is pink with probability 3/4.
flowers = Table().with_columns("Color", make_array("Pink", "Pink", "Pink", "White"))

def proportion_in_one_experiment(sample_size):
    """Return the fraction of "Pink" rows in one random sample of `flowers`.

    Draws `sample_size` rows from the `flowers` table, keeps the rows whose
    "Color" equals "Pink", and divides that row count by `sample_size`.
    """
    drawn = flowers.sample(sample_size)
    pink_only = drawn.where("Color", are.equal_to("Pink"))
    return pink_only.num_rows / sample_size

def proportions_in_many_experiments(sample_size, num_experiments):
    """Run `num_experiments` experiments of size `sample_size` and tabulate them.

    Returns a Table with one row per experiment and two columns:
    "Sample size" (the same value repeated) and "Proportion pink" (the result
    of `proportion_in_one_experiment` applied to that row's sample size).
    """
    repeated_sizes = np.repeat(sample_size, num_experiments)
    size_tbl = Table().with_columns("Sample size", repeated_sizes)
    pink_props = size_tbl.apply(proportion_in_one_experiment, "Sample size")
    return size_tbl.with_columns("Proportion pink", pink_props)

def proportions_hist(sample_size, num_experiments):
    """Plot a histogram of "Proportion pink" across repeated experiments.

    The experiments come from `proportions_in_many_experiments`. The bin width
    is 4 / sample_size, clamped so it is never below .005 or above .1, and the
    bin edges start at 0 and stop before 1 + 2 * bin_width. Nothing is returned.
    """
    results_tbl = proportions_in_many_experiments(sample_size, num_experiments)
    # Clamp in two steps: enforce the lower bound first, then the upper bound.
    bin_width = max(.005, 4 / sample_size)
    bin_width = min(.1, bin_width)
    edges = np.arange(0, 1+2*bin_width, bin_width)
    results_tbl.hist("Proportion pink", bins=edges)

In [None]:
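## How does sample size play a role in changing these graphs?
# Because we run 10,000 trials, the Law of Large Numbers makes each empirical histogram
# a close approximation of the probability distribution of the proportion for that sample size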
proportions_hist(1, 10000)

In [None]:
proportions_hist(2, 10000)

In [None]:
proportions_hist(100, 10000)