<h1> Lecture 23 

Data Science 8, Summer 2021 </h1>

<h3>
<b>
<ul>
<li>Measures of Spread</li><br>
    <ul> 
        <li>Variance</li><br>
        <li>Standard Deviation</li><br>
        <li>Chebyshev's Inequality</li><br>
        <li>Standard Units</li><br>
    </ul>
    <li>The Normal Distribution</li>
</ul>
</b>
</h3>

In [None]:
from datascience import *
import numpy as np
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
plots.rcParams["patch.force_edgecolor"] = True

#The following allows porting images into a Markdown window
#Syntax: ![title](image_name.png)
from IPython.display import Image

<h3>Measures of Spread: Variance and Standard Deviation </h3>

In [None]:
# Tiny example data set used to work through the SD calculation by hand.
values = make_array(2, 3, 3, 9)
values

In [None]:
# Put the values in a one-column table; the cells below add columns to it.
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
# Average of the example values; 'Value' is the first column of sd_table.
average_value = np.average(sd_table.column('Value'))
average_value

In [None]:
# Deviation from average: each value minus the mean of all the values.
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table

Let's square the deviations:

In [None]:
# Squaring makes every deviation non-negative, so positive and negative
# deviations cannot cancel each other when they are averaged.
sd_table = sd_table.with_columns('Squared Deviation', 
                                 deviations ** 2)
sd_table

In [None]:
# Variance of the data: the mean of the squared deviations from average

variance = np.mean(sd_table.column('Squared Deviation'))
variance

In [None]:
# Standard Deviation (SD) is the square root of the variance,
# i.e. the root mean square of the deviations from average

sd = variance ** 0.5
sd

One-Stop Tool for Standard Deviation: NumPy Function np.std

In [None]:
# With its default arguments np.std divides by n (not n - 1), so it
# reproduces the by-hand `sd` computed in the cells above.
np.std(values)

<h4>Units of Standard Deviation: Same as the units of the underlying data.</h4>

## Standard Units ##

In [None]:
# Load the births data; 'baby.csv' is a relative path, so the file must be
# in the notebook's working directory.
births = Table.read_table('baby.csv')

In [None]:
def standard_units(x):
    """Return x rescaled to standard units.

    Each entry becomes the number of standard deviations it lies above
    (positive) or below (negative) the average of x.
    """
    center = np.mean(x)
    spread = np.std(x)
    return (x - center) / spread

In [None]:
# Array of the values in the 'Maternal Age' column of births.
ages = births.column('Maternal Age')

In [None]:
# Each age expressed as a number of SDs above or below the average age.
ages_standard_units = standard_units(ages)
ages_standard_units

In [None]:
# After conversion the mean should be 0 and the SD 1, up to floating-point
# rounding error.
print('Mean Ages in Standard Units:', np.mean(ages_standard_units))
print('SD of Ages in Standard Units:', np.std(ages_standard_units))

<h3>Converting to Standard Units is also called "Normalizing" Random Data (Variables): Convert to Zero-Mean, Unit-Variance Data.</h3>

## Standard Units: Quiz Scores Example

In [None]:
# Load the quiz scores and preview the first 5 rows.
scores = Table.read_table('scores.csv')
scores.show(5)

In [None]:
# overlay=False draws a separate histogram for each column instead of
# superimposing them on one set of axes.
scores.hist(overlay=False)

Quiz 1 Average

In [None]:
# Average of all the Quiz 1 scores in the table.
np.mean(scores.column('Quiz 1'))

Quiz 1 Std Dev

In [None]:
# SD of all the Quiz 1 scores, in the same units as the scores themselves.
np.std(scores.column('Quiz 1'))

Quiz 2 Average

In [None]:
# Average of all the Quiz 2 scores in the table.
np.mean(scores.column('Quiz 2'))

Quiz 2 Std Dev

In [None]:
# SD of all the Quiz 2 scores, in the same units as the scores themselves.
np.std(scores.column('Quiz 2'))

<h4>Define a function that converts values in an array to standard units</h4>

In [None]:
def standard_units(x):
    """Convert array x to standard units.

    Returns (x - mean of x) / (SD of x), so each entry is the number of
    standard deviations it lies above or below the average of x.

    This repeats the definition given earlier in the notebook, so it is
    written with the same np.mean / np.std expression as that one.
    """
    return (x - np.mean(x)) / np.std(x)

In [None]:
# Standardize each quiz separately, then attach both results to `scores`.
quiz1_su = standard_units(scores.column('Quiz 1'))
quiz2_su = standard_units(scores.column('Quiz 2'))

scores = scores.with_columns(
    'Quiz 1 in Standard Units', quiz1_su,
    'Quiz 2 in Standard Units', quiz2_su
)

scores.show(10)

<h3>Now let's look at the histograms of the standard units</h3>

In [None]:
# Histogram of the Quiz 1 scores in standard units, using 10 bins.
scores.hist('Quiz 1 in Standard Units', bins=10)

In [None]:
# Histogram of the Quiz 2 scores in standard units, using 10 bins.
scores.hist('Quiz 2 in Standard Units', bins=10)

<h3>Discussion Question</h3>

In [None]:
# Recompute the maternal ages and their standard units (the same two
# assignments appear earlier in the notebook).
ages = births.column('Maternal Age')
ages_standard_units = standard_units(ages)

In [None]:
# Side-by-side table: each age in years next to the same age in standard units.
both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units
)
both

In [None]:
# Average and SD of the ages, displayed together as a pair.
np.mean(ages), np.std(ages)

<h4>Accordingly, a value in standard units converts back to years as: mean + (value in standard units) &times; SD</h4>

In [None]:
# Convert the standard-units value at index 1 (the second row of `both`)
# back to the original units: original value = mean + z * SD.
np.mean(ages) + both.column('Age in Standard Units').item(1) * np.std(ages)

## Chebyshev's Bound

In [None]:
# Re-read baby.csv (it was also loaded earlier in the notebook) and preview
# the first 3 rows.
births = Table.read_table('baby.csv')
births.show(3)

In [None]:
# Histogram of every column except 'Maternal Smoker', one plot per column.
# NOTE(review): presumably 'Maternal Smoker' is dropped because it is not a
# numeric measurement -- confirm against baby.csv.
births.drop('Maternal Smoker').hist(overlay = False)

<h4>Let's take a closer look at the Maternal Pregnancy Weights</h4>

In [None]:
# Summary statistics for the 'Maternal Pregnancy Weight' column.
# `mean` and `sd` keep their names because the within-3-SDs cell below
# reads them.
mpw = births.column('Maternal Pregnancy Weight')
median_percentile_50 = percentile(50, mpw)
mean = np.mean(mpw)
sd = np.std(mpw)

print('Median MPW:', median_percentile_50)
print('Mean MPW:', np.round(mean, 2))
print('MPW Standard Deviation:', np.round(sd, 2))

<h4>Let's consider data within $\pm 3$ SDs:</h4>

In [None]:
# Rows whose pregnancy weight lies between mean - 3*sd and mean + 3*sd,
# using the `mean` and `sd` computed in the cell above.
within_3_SDs = births.where(
    'Maternal Pregnancy Weight', 
    are.between(mean - 3*sd, mean + 3*sd))

<h4>What fraction of the total cases are within that range?</h4>

In [None]:
# Proportion within 3 SDs of the mean:
# number of rows selected above divided by the number of rows in births

within_3_SDs.num_rows / births.num_rows

<h4>What does Chebyshev say?</h4>

In [None]:
# Chebyshev's bound: 
# The proportion we calculated above should be at least 1 - 1/z^2,
# evaluated here for z = 3 SDs

1 - 1/(3**2)

In [None]:
# Column labels of births; the loop in the next cell iterates over these.
births.labels

In [None]:
# See if Chebyshev's bounds work for distributions with various shapes
#
# For every column of births, print the percent of rows that births.where
# selects with are.between(average - z*SD, average + z*SD), for
# z = 2, 3, 4, 5.
#
# The loop uses its own feature_* names so that it does not overwrite the
# notebook-level `values`, `mean`, and `sd` assigned by earlier cells;
# re-running those earlier cells after this one would otherwise silently
# pick up the statistics of the last column processed here.

for feature in births.labels:
    feature_values = births.column(feature)
    feature_mean = np.mean(feature_values)
    feature_sd = np.std(feature_values)
    print()
    print(feature)
    for z in make_array(2, 3, 4, 5):
        chosen = births.where(
            feature,
            are.between(feature_mean - z*feature_sd,
                        feature_mean + z*feature_sd))
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '% of the data')

<h3> The SD and Bell-Shaped Curves</h3>

In [None]:
# Bin edges 56.5, 57.5, ..., 72.5: width-1 bins centered on the whole
# numbers 57 through 72.
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))

In [None]:
# Average and SD of the 'Maternal Height' column, displayed as a pair.
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

In [None]:
# One SD below the average height.
np.mean(heights) - np.std(heights)

In [None]:
# One SD above the average height.
np.mean(heights) + np.std(heights)

<h2> Central Limit Theorem </h2>

In [None]:
# Load the flight-delay table. united_bins holds the histogram bin edges
# -20, -10, ..., 300 (width-10 bins).
united = Table.read_table('united.csv')
united_bins = np.arange(-20, 301, 10)
united

In [None]:
# Histogram of the 'Delay' column using the width-10 bins defined above.
united.hist('Delay', bins=united_bins)

<h4>Compute the Median, Mean, and Standard Deviation of the Delays</h4>

In [None]:
# Summary statistics of the 'Delay' column. `delay_mean` is printed again
# under the sample-mean histograms in later cells.
delays = united.column('Delay')
delay_mean = np.mean(delays)
delay_sd = np.std(delays)
delay_median = percentile(50, delays)

print('Median Delay:', np.round(delay_median, 2))
print('Mean Delay:', np.round(delay_mean, 2))
print('Delay Standard Deviation:', np.round(delay_sd, 2))

<h4><u>Question:</u> Why is the Mean greater than the Median? </h4>

<h4>Assume we can only sample the data&mdash;that is, we don't have practical access to the complete data.</h4>

In [None]:
def one_sample_mean(sample_size):
    """Return the mean delay of one random sample of flights.

    Draws sample_size rows from the `united` table and averages their
    'Delay' values.
    """
    # `sample` is called without a with_replacement argument, so it uses
    # its default behavior, which is to sample with replacement.
    return np.mean(united.sample(sample_size).column('Delay'))

<h4>Run the function <tt>one_sample_mean</tt> several times:</h4>

In [None]:
# Each run draws a new random sample of 100 flights (no random seed is set
# in this notebook), so the result changes from run to run.
one_sample_mean(100)

<h4>To understand the variability of the Sample Mean, run a large number of trials&mdash;that is, take a large number of samples:</h4>

In [None]:
def many_sample_means(sample_size, num_simulations):
    """Simulate the sample mean many times.

    Calls one_sample_mean(sample_size) num_simulations times and returns
    the results as a float array with one entry per simulation, in the
    order the samples were drawn.
    """
    # Collect the means in a list and convert once at the end. np.append
    # copies the whole array on every call, which made the original loop
    # quadratic in num_simulations.
    means = [one_sample_mean(sample_size) for _ in np.arange(num_simulations)]
    return np.array(means, dtype=float)

In [None]:
# 10,000 simulations, each the mean delay of a sample of 100 flights.
sample_means_100 = many_sample_means(100,10000)

In [None]:
# Display the array of simulated sample means.
sample_means_100

In [None]:
# One sample mean per simulation, so this equals the number of simulations.
len(sample_means_100)

In [None]:
# Histogram of the simulated sample means (sample size 100), followed by
# the average delay of the full `united` table for comparison.
Table().with_column(
    'Mean of 100 flight delays', sample_means_100).hist(bins=20)

print('Population Average:', np.round(delay_mean,2))

<h4>What happens if we take 10,000 random samples (10,000 trials), each containing 400 flights (sample size of 400)?</h4>

In [None]:
# 10,000 simulations, each the mean delay of a sample of 400 flights.
sample_means_400 = many_sample_means(400,10000)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

print('Population Average:', np.round(delay_mean,2))

In [None]:
# Same experiment with only 10 simulations.
# Note: this overwrites sample_means_400 from the previous cell.
sample_means_400 = many_sample_means(400,10)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

print('Population Average:', np.round(delay_mean,2))

In [None]:
# Same experiment with 50 simulations (overwrites sample_means_400).
sample_means_400 = many_sample_means(400,50)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

print('Population Average:', np.round(delay_mean,2))

In [None]:
# Same experiment with 100 simulations (overwrites sample_means_400).
sample_means_400 = many_sample_means(400,100)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

print('Population Average:', np.round(delay_mean,2))

In [None]:
# Same experiment with 1,000 simulations (overwrites sample_means_400).
sample_means_400 = many_sample_means(400,1000)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

print('Population Average:', np.round(delay_mean,2))

In [None]:
# Back to 10,000 simulations: the same settings as the first 400-flight
# cell in this section (overwrites sample_means_400).
sample_means_400 = many_sample_means(400,10000)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

print('Population Average:', np.round(delay_mean,2))

In [None]:
# Same experiment with 50,000 simulations, the largest run in this notebook
# (overwrites sample_means_400).
sample_means_400 = many_sample_means(400,50000)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

print('Population Average:', np.round(delay_mean,2))