In [1]:

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt


# Sampling Distribution of a Statistic

## Random Sampling ##

We load a dataset of all United national flights from 6/1/15 to 8/31/15, giving each flight's destination and how long it was delayed, in minutes.

In [2]:
# Read the flight table from the CSV file in the working directory.
united = pd.read_csv('united.csv')
# Show the loaded DataFrame as the cell output.
united

Unnamed: 0,Date,Flight Number,Destination,Delay
0,6/1/15,73,HNL,257
1,6/1/15,217,EWR,28
2,6/1/15,237,STL,-3
3,6/1/15,250,SAN,0
4,6/1/15,267,PHL,64
...,...,...,...,...
13820,8/31/15,1978,LAS,-4
13821,8/31/15,1993,IAD,8
13822,8/31/15,1994,ORD,3
13823,8/31/15,2000,PHX,-1


A random sample:

In [7]:
# Draw 6 rows at random. DataFrame.sample defaults to replace=False, so
# sampling with replacement has to be requested explicitly.
united.sample(n=6, replace=True)

Unnamed: 0,Date,Flight Number,Destination,Delay
11792,8/17/15,1727,IND,22
7517,7/21/15,642,SEA,-1
3426,6/23/15,1717,ORD,66
13344,8/28/15,1252,IAH,1
1570,6/11/15,1087,BWI,5
9082,7/31/15,760,JFK,10


## Population Distribution ##

In [5]:
# Bin edges every 5 minutes from -20 to 200 (the stop value 201 is exclusive).
# The same edges are reused by the sample histograms further down.
united_bins = np.arange(-20, 201, 5)

# Density histogram of every delay in the population table.
plt.hist(united["Delay"], bins=united_bins, density=True)
plt.gca().set(
    title="Time of Delayed United Flights:  POPULATION DISTRIBUTION",
    xlabel="Time (min)",
    ylabel="Density (Percent per min)",
    ylim=(0, 0.06),
);

## Calculating Population Parameters

In [7]:
# Population median of the "Delay" column.
united["Delay"].median()

2.0

In [9]:
# Population minimum of the "Delay" column.
# Series.min() is vectorized and skips missing values, whereas the builtin
# min() iterates element by element and gives order-dependent results if a
# NaN is present.
united['Delay'].min()

-16

In [11]:
# Population maximum of the "Delay" column.
# Series.max() is vectorized and skips missing values, whereas the builtin
# max() iterates element by element and gives order-dependent results if a
# NaN is present.
united['Delay'].max()

580

In [19]:
# Population mean of the "Delay" column (np.average without weights is the arithmetic mean).
np.average(united['Delay'])  

16.658155515370705

In [21]:
# Population standard deviation of the "Delay" column.
# np.std uses ddof=0 by default, i.e. it divides by N (the population formula).
np.std(united["Delay"])

39.48019985160957

## Sample Distributions ##

In [92]:
# First random sample: 500 flights drawn with replacement.
a = united.sample(500, replace=True)

# Density histogram on the same bins and y-range as the population plot,
# so the two shapes can be compared directly.
plt.hist(a["Delay"], bins=united_bins, density=True)

# n is read from the sample itself so the title always matches the draw size.
plt.title(f"1st Sample Distribution, n={len(a)}")

plt.xlabel("Time (min)")
plt.ylabel("Density (Percent per min)")
plt.ylim(0, 0.06);

In [94]:
# Second random sample: 500 flights drawn with replacement.
b = united.sample(500, replace=True)

# Density histogram on the same bins and y-range as the population plot,
# so the two shapes can be compared directly.
plt.hist(b["Delay"], bins=united_bins, density=True)

# n is read from the sample itself so the title always matches the draw size.
plt.title(f"2nd Sample Distribution, n={len(b)}")

plt.xlabel("Time (min)")
plt.ylabel("Density (Percent per min)")
plt.ylim(0, 0.06);

In [96]:
# Third random sample: 500 flights drawn with replacement.
c = united.sample(500, replace=True)

# Density histogram on the same bins and y-range as the population plot,
# so the two shapes can be compared directly.
plt.hist(c["Delay"], bins=united_bins, density=True)

# n is read from the sample itself so the title always matches the draw size.
plt.title(f"3rd Sample Distribution, n={len(c)}")

plt.xlabel("Time (min)")
plt.ylabel("Density (Percent per min)")
plt.ylim(0, 0.06);

## Sampling Distribution of Sample Median

In [98]:
# Statistic (median delay) computed from the 1st sample, `a`.
a["Delay"].median()

3.0

In [100]:
# Statistic (median delay) computed from the 2nd sample, `b`.
b["Delay"].median()

2.5

In [102]:
# Statistic (median delay) computed from the 3rd sample, `c`.
c["Delay"].median()

3.0

In [104]:
def sample_median(size):
    """Return the median delay of one random sample of flights.

    Draws ``size`` rows from the ``united`` table with replacement and
    returns the median of the drawn rows' "Delay" values.
    """
    drawn = united.sample(n=size, replace=True)
    return np.median(drawn["Delay"])

In [106]:
# Median delay of one fresh random sample of 500 flights; the value changes from run to run.
sample_median(500)

2.0

In [108]:
# Simulate 100,000 samples of size 500 and record the median of each one.
# The medians are gathered in a list and converted to an array once;
# calling np.append inside the loop would copy the whole array on every
# iteration. The result is still a float array, so `.std()` works on it.
sample_medians = np.array([sample_median(500) for _ in range(100000)])

In [109]:
# Density histogram of the simulated medians, using one-minute-wide bins.
plt.hist(sample_medians, bins=np.arange(-10, 31), density=True)
plt.gca().set(
    title="Sampling Distribution of Sample Median",
    xlabel="Time (min)",
    ylabel="Density (Percent per min)",
    xlim=(-1, 10),
);


### Standard Error

Definition:  The standard error of a statistic is the standard deviation of the sampling distribution of that statistic.

**Practice**:  Estimate the standard error of the sample median of a sample of size 500 using your simulation above.  

In [110]:
# Standard error estimate: the standard deviation of the simulated sample medians.
std_error = np.std(sample_medians)
std_error

0.7454591363045999

## Sampling Distribution of Sample Mean

Let's repeat the same process as above, however this time with the sample mean:

In [114]:
def sample_mean(size):
    """Return the mean delay of one random sample of flights.

    Draws ``size`` rows from the ``united`` table with replacement and
    returns the mean of the drawn rows' "Delay" values.
    """
    drawn = united.sample(n=size, replace=True)
    return np.mean(drawn["Delay"])

In [116]:
# Simulate 100,000 samples of size 500 and record the mean of each one.
# The means are gathered in a list and converted to an array once;
# calling np.append inside the loop would copy the whole array on every
# iteration.
sample_means = np.array([sample_mean(500) for _ in range(100000)])

In [117]:
# Density histogram of the simulated means, using one-minute-wide bins.
plt.hist(sample_means, bins=np.arange(-10, 31), density=True)
plt.gca().set(
    title="Sampling Distribution of Sample Mean",
    xlabel="Time (min)",
    ylabel="Density (Percent per min)",
    xlim=(-1, 31),
);


### Standard Error


Estimate the standard error of the sample mean of a sample of size 500 using your simulation above.

In [121]:

# Estimated standard error of the sample mean: the standard deviation of the simulated sample means.
np.std(sample_means)


1.766725331993701

We showed in the slides that the standard error of the sample mean is equal to
$\frac{\sigma_{pop}}{\sqrt{n}}$, where $\sigma_{pop}$ is the population standard deviation and $n$ is the sample size.

In [123]:
# The simulated estimate above should be close to (not exactly equal to)
# sigma_pop / sqrt(n), computed here with n = 500:
np.std(united['Delay']) /np.sqrt(500) 


1.765608212669522

## Sampling Distribution of Sample Proportions



Let's consider a new scenario: suppose the population follows a Bernoulli distribution with $p = 0.7$.

In [42]:
# Possible outcomes of X and their probabilities.
x = [0, 1]
x_pmf = pd.Series([0.3, 0.7], index=x)

# Bar chart of the PMF. The title and both axis labels are set on the
# Axes returned by pandas so the figure can stand on its own.
x_pmf.plot.bar(rot=0, width=1, ec='white').set(
    title="Distribution (aka PMF) of X",
    xlabel="k",
    ylabel="P(X=k)",
);


In [125]:
# Simulate one sample of size 500 from this distribution:
# each draw is 0 with probability 0.3 and 1 with probability 0.7.
sample = np.random.choice([0, 1], size=500, p=[0.3, 0.7])



In [127]:
# Density histogram of the 0/1 sample; bin edges -0.5, 0.5, 1.5 give each outcome its own bar.
plt.hist(sample,width=1, ec='white' , density =True, bins=[-.5,.5,1.5])

(array([0.29, 0.71]),
 array([-0.5,  0.5,  1.5]),
 <BarContainer object of 2 artists>)

In [129]:
# Calculate the sample proportion: the mean of 0/1 values is the fraction of 1s.
np.mean(sample)

0.71

In [131]:
# Repeat: a second independent sample of size 500 from the same distribution.
sample2 = np.random.choice([0, 1], size=500, p=[0.3, 0.7])

# One unit-wide bin per outcome (edges at -0.5, 0.5 and 1.5).
plt.hist(sample2, bins=np.arange(-0.5, 2.5), density=True, width=1, ec='white')



(array([0.304, 0.696]),
 array([-0.5,  0.5,  1.5]),
 <BarContainer object of 2 artists>)

In [54]:
# Sample proportion of 1s in the second sample.
np.mean(sample2)

0.7

In [133]:
# A third independent sample of size 500 from the same distribution.
sample3 = np.random.choice([0, 1], size=500, p=[0.3, 0.7])

# One unit-wide bin per outcome (edges at -0.5, 0.5 and 1.5).
plt.hist(sample3, bins=np.arange(-0.5, 2.5), density=True, width=1, ec='white')



(array([0.316, 0.684]),
 array([-0.5,  0.5,  1.5]),
 <BarContainer object of 2 artists>)

In [144]:
# Sample proportion of 1s in the third sample.
np.mean(sample3)

0.64

In [135]:
# Simulate 10,000 samples of size 500 from the 0/1 population (P(1) = 0.7)
# and keep each sample's mean, which is the proportion of 1s in that sample.
sample_means = np.array([
    np.random.choice([0, 1], size=500, p=[0.3, 0.7]).mean()
    for _ in range(10000)
])
    

In [137]:
# Density histogram of the simulated sample means (each is a sample proportion of 1s).
plt.hist(sample_means, density=True)
plt.title("Sampling Distribution of Sample Means")

# Label both axes so the figure can be read on its own; the trailing
# semicolon keeps the Text(...) repr out of the cell output.
plt.xlabel("Sample mean (proportion of 1s)")
plt.ylabel("Density");


Text(0.5, 1.0, 'Sampling Distribution of Sample Means')