<h1>Lecture 15</h1>

Data Science 8, Summer 2021

In [None]:
# datascience supplies Table, make_array and the other course helpers used below.
from datascience import *
import numpy as np
import warnings
# Silence all warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Force an edge color on patches (e.g. histogram bars).
plots.rcParams["patch.force_edgecolor"] = True

#The following allows porting images into a Markdown window
from IPython.display import Image

## Random Sampling ##

Load in the dataset.

All national United flights from 6/1/15 to 8/9/15, with their destinations and how long they were delayed, in minutes.

In [None]:
# Read the flight data from the CSV file into a Table and preview its first 3 rows.
united = Table.read_table('united.csv')
united.show(3)

<h3>How large is this data set?</h3>

In [None]:
# Number of rows in the table.
united.num_rows

<h3>Deterministic Sampling</h3>
<h4>Example 1: All flights to JFK Airport in NY<br><br>
No matter how many times we run this code, we get the same set of rows/flights.</h4>

In [None]:
# Deterministic selection: keep only the rows whose 'Destination' equals 'JFK'.
united.where('Destination', are.equal_to('JFK'))

<h3>Deterministic Sampling</h3>
<h4>Example 2: Take every $1000^\textsf{th}$ row/flight in the data set.
</h4>

In [None]:
# Row indices 0, 1000, 2000, ... up to (but not including) united.num_rows.
united.take(np.arange(0, united.num_rows, 1000)).show()

<h3>Deterministic Sampling</h3>
<h4>Example 3: Take specific rows in the data set.
</h4>

<h5>Recall: <tt>.take</tt> takes the rows at the specified row indices.</h5>

In [None]:
# Take the rows at indices 34, 6321 and 10040.
united.take(make_array(34, 6321, 10040))

<h3>Random Sampling</h3>
<h4>SLIDE: Sampling<br><br>
Each time we run this code the produced sample set can be different.</h4>

In [None]:
# Systematic sample: choose a random starting row from 0 to 999, inclusive
# (an int argument n makes np.random.choice draw from np.arange(n)), then
# keep every 1000th row from that starting point to the end of the table.
start = np.random.choice(1000)
systematic_sample = united.take(
    np.arange(start, united.num_rows, 1000)
)
systematic_sample.show()

### tbl.sample(n, with_replacement=...)

In [None]:
# 100 rows drawn with replacement: the same row may be drawn more than once.
united.sample(100, with_replacement=True)

In [None]:
# 25 rows drawn without replacement.
united.sample(25, with_replacement=False)

In [None]:
# Too big: sampling WITHOUT replacement cannot draw more rows than the table
# holds (100000 is presumably larger than united.num_rows -- compare with the
# row count shown earlier). The failure is the point of this cell, so catch
# the error and print its message instead of letting the traceback stop
# "Restart & Run All".
try:
    oversized_sample = united.sample(100000, with_replacement=False)
except ValueError as sampling_error:
    oversized_sample = None
    print('Cannot draw this sample:', sampling_error)
# Shows the sampled table if the draw succeeded; shows nothing when it is None.
oversized_sample

## Distributions ##

In [None]:
# A one-column table holding the six faces of a die: 1 through 6.
die = Table().with_column('Face', np.arange(1, 7))
die

In [None]:
# Simulate ten rolls: sample 10 rows from the die table (Table.sample defaults).
die.sample(10)

<h3>Let's count how many times each face comes up<br> in a set of ten rolls of the die:</h3>

In [None]:
# Roll ten times, then count how many times each face came up.
die.sample(10).group('Face')

In [None]:
# Histogram of the six faces using the default bins.
die.hist()

In [None]:
# One unit-wide bin per face: edges at 0.5, 1.5, ..., 6.5, so each integer
# face 1-6 sits at the center of its own bar. roll_bins was used without ever
# being defined, which raises NameError on a fresh kernel.
roll_bins = np.arange(0.5, 6.6, 1)
die.hist(bins=roll_bins)

In [None]:
# Histogram of the faces from ten simulated rolls, binned by roll_bins.
die.sample(10).hist(bins=roll_bins)

In [None]:
# Try the sampling with 100, 1000, 10000, 100000, and 1000000 rolls
# by changing the argument to sample.
die.sample(1000).hist(bins=roll_bins)

In [None]:
# Same histogram for 100000 simulated rolls.
die.sample(100000).hist(bins=roll_bins)

<h3>Alternatively, we can define an array of numbers denoting the six faces of the die, and run <tt>np.random.choice</tt> on that array.</h3>

In [None]:
# The six faces as a plain array: 1, 2, 3, 4, 5, 6.
die_faces_array=np.arange(1,7)
die_faces_array

In [None]:
# Draw 10 values at random from die_faces_array
# (np.random.choice samples with replacement by default).
rolls_outcome=np.random.choice(die_faces_array,10)
rolls_outcome

SLIDE: Law of Large Numbers

## Large Random Samples ##

In [None]:
# Display the flights table again.
united 

<h4>Median Delay</h4>

Median: the value separating the higher half from the lower half of a dataset. There are a few ways to break ties when there is an even number of data values.

"The middle value"

### np.median(array)

Our population is the `united` table

In [None]:
# The parameter: median of the 'Delay' column over the whole united table.
np.median(united.column("Delay"))

In [None]:
# A statistic: median 'Delay' of a random sample of 10 rows.
np.median(united.sample(10).column("Delay"))

In [None]:
# A statistic: median 'Delay' of a bigger random sample (1000 rows).
np.median(united.sample(1000).column("Delay"))

## Simulating Statistics ##

In [None]:
# Median of the 'Delay' column over the whole table, for comparison with the simulation below.
np.median(united.column('Delay'))

In [None]:
# One simulated value of the statistic: median 'Delay' of 10 randomly sampled rows.
np.median(united.sample(10).column('Delay'))

In [None]:
def sample_median(size):
    """Return the median 'Delay' of a random sample of `size` rows from `united`.

    The rows are drawn with `united.sample(size)`, i.e. with Table.sample's
    default settings, so repeated calls generally return different values.
    """
    sampled_delays = united.sample(size).column('Delay')
    return np.median(sampled_delays)

In [None]:
# Try the function once with a sample of 10 rows.
sample_median(10)

In [None]:
# Simulate the statistic 1000 times: each entry is the median delay of a
# fresh random sample of 10 rows, collected into one array in draw order.
sample_medians = np.array(
    [sample_median(10) for _ in np.arange(1000)]
)

In [None]:
# Histogram of the simulated sample medians, with unit-wide bins from -10 to 30.
Table().with_column('Sample medians', sample_medians).hist(bins = np.arange(-10,31))

In [None]:
# Repeat the simulation with larger samples: 1000 sample medians, each taken
# from a fresh random sample of 1000 rows, collected in draw order.
sample_medians = np.array(
    [sample_median(1000) for _ in np.arange(1000)]
)

In [None]:
# Histogram of the current sample_medians array, with unit-wide bins from -10 to 30.
Table().with_column(
    'Sample medians', sample_medians).hist(bins = np.arange(-10,31))