In [None]:
# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points: x and an independent noise term z are both sampled
    from a normal distribution with mean 0 and SD 1, and
    y = r*x + sqrt(1 - r**2)*z. The points are drawn in dark blue on a new
    5x5 figure with both axes fixed to [-4, 4].

    Raises ValueError if r is outside [-1, 1], because sqrt(1 - r**2)
    would be NaN there and the plot would silently come out empty.
    """
    # Validate before creating the figure so a bad r leaves no blank plot behind.
    if np.any(np.abs(r) > 1):
        raise ValueError("r must be between -1 and 1, got " + str(r))
    plots.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    y = r*x + (np.sqrt(1-r**2))*z
    plots.scatter(x, y, color='darkblue', s=20)
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)

## Review: Central Limit Theorem

In [None]:
# Load the flight data from united.csv and draw a histogram of the 'Delay'
# column, with bin edges running from -20 up to (not including) 300 in steps of 10.
united = Table.read_table('united.csv')
united.hist('Delay', bins = np.arange(-20, 300, 10))

In [None]:
# Compute the mean and SD of every delay in the `united` table;
# the names mark these as the population values.
delays = united.column('Delay')
population_mean = np.mean(delays)
population_sd = np.std(delays)

# Show both values as the cell's output.
population_mean, population_sd

In [None]:
def one_sample_mean(sample_size):
    """Return the mean 'Delay' of one sample of `sample_size` rows from `united`.

    The rows are drawn with the table's `sample` method, so each call
    produces a new sample and (in general) a different mean.
    """
    delays_in_sample = united.sample(sample_size).column('Delay')
    return np.mean(delays_in_sample)

In [None]:
#TODO: try sampling with sample size 100


In [None]:
def ten_thousand_sample_means(sample_size):
    """Return an array of 10,000 sample means.

    Each entry is the result of one call to one_sample_mean(sample_size),
    i.e. the mean delay of a freshly drawn sample of that size.
    """
    repetitions = 10000
    # Collect all the means and build the array once; calling np.append
    # inside a loop copies the whole array on every iteration.
    return np.array([one_sample_mean(sample_size) for _ in range(repetitions)])

In [None]:
#TODO: call the function above to draw 10,000 samples and collect their means in an array


In [None]:
"""Empirical distribution of random sample means"""

def plot_sample_means(sample_size):
    """Plot the empirical distribution of 10,000 sample means and print a summary.

    Calls ten_thousand_sample_means(sample_size), draws a 20-bin histogram
    of the resulting means, and prints the sample size, the mean and SD of
    the 'Delay' column of `united`, and the mean and SD of the sample means.
    """
    means = ten_thousand_sample_means(sample_size)

    # Empirical histogram of the sample means, labelled with the sample size
    Table().with_column('Sample Means', means).hist(bins=20)
    plots.xlabel('Sample Means')
    plots.title(f'Sample Size {sample_size}')

    # Print the population figures next to those of the sample means
    population_delays = united.column('Delay')
    print("Sample size: ", sample_size)
    print("Population mean:", np.mean(population_delays))
    print("Average of sample means: ", np.mean(means))
    print("Population SD:", np.std(population_delays))
    print("SD of sample means:", np.std(means))

In [None]:
#TODO: call function above to visualize (and try different sample sizes)


## Prediction ##

In [None]:
# Load the data from galton.csv into a table.
galton = Table.read_table('galton.csv')

In [None]:
# Build a two-column table from galton: the midparent height and the child's
# height, relabelled 'MidParent' and 'Child'.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
    )

In [None]:
# Display the heights table.
heights

In [None]:
# Visualize the heights table with a scatter plot, using MidParent as the x-axis.
heights.scatter('MidParent')

#### Do we see an association? Is this association linear? 

In [None]:
def predict_child(h):
    """Predict the height of a child whose midparent height is h.

    The prediction is the average 'Child' height over the rows of
    `heights` whose 'MidParent' value lies within 0.5 inches of h,
    as selected by are.between(h - 0.5, h + 0.5).
    """
    lower, upper = h-0.5, h + 0.5
    nearby = heights.where("MidParent", are.between(lower, upper))
    return np.average(nearby.column('Child'))

predict_child(68)

In [None]:
# Add the prediction as a column in the table: apply calls predict_child
# on each value of the 'MidParent' column.
predictions = heights.apply(predict_child,'MidParent')
heights_with_predictions = heights.with_column("Predicted Height", predictions)
heights_with_predictions

In [None]:
# Scatter plot of the table that now includes the 'Predicted Height' column,
# with MidParent as the x-axis.
heights_with_predictions.scatter("MidParent")

## Association ##

In [None]:
# Load the hybrid car data from hybrid.csv into a table.
hybrid = Table.read_table('hybrid.csv')

In [None]:
# Display the hybrid table.
hybrid

In [None]:
# Sort by manufacturer's suggested retail price (msrp) in descending order,
# so the most expensive model on the list comes first.
hybrid.sort('msrp', descending = True)

In [None]:
# Is there an association between mpg (fuel efficiency) and msrp?
# Visualize the two columns with a scatter plot.
hybrid.scatter('mpg', 'msrp')

Is there an association? Is it positive or negative? Is it linear?


How about looking at acceleration versus msrp?

In [None]:
# Visualize acceleration and msrp with a scatter plot.
hybrid.scatter('acceleration', 'msrp')

In [None]:
# What if we only looked at a specific class of hybrid cars?
# Keep the rows of hybrid whose 'class' is 'SUV' and display them.
suv = hybrid.where('class', 'SUV')
suv
#TODO: How big is this table?

In [None]:
# Scatter plot for the SUVs only: mpg vs msrp.
suv.scatter('mpg','msrp')

In [None]:
# Scatter plot for the SUVs only: acceleration vs msrp.
suv.scatter('acceleration','msrp')

#### Let's review standard units
Can we visualize the same data but in standard units?

In [None]:
def standard_units(x):
    """Convert an array of numbers to standard units.

    Each value is replaced by its deviation from the array's average,
    divided by the array's standard deviation (np.std).
    """
    center = np.average(x)
    spread = np.std(x)
    return (x - center) / spread

In [None]:
# Scatter plot of the SUVs' mpg and msrp, both converted to standard units.
# scatter(0, 1) refers to the two columns by index; both axes are fixed to [-3, 3].
Table().with_columns(
    'mpg (standard units)',  standard_units(suv.column('mpg')), 
    'msrp (standard units)', standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

In [None]:
# Scatter plot of the SUVs' acceleration and msrp, both converted to standard units.
# scatter(0, 1) refers to the two columns by index; both axes are fixed to [-3, 3].
Table().with_columns(
    'acceleration (standard units)', standard_units(suv.column('acceleration')), 
    'msrp (standard units)',         standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

When we see a linear association, it would be nice to quantify its "strength",
i.e., to have a measure of this linear association.
-> The correlation coefficient ($r$, also written $R$) measures the linear association based on standard units ($-1 \le r \le 1$).

## Correlation ##
Let's visualize!!

In [None]:
# Try using r_scatter() with different values

## Calculating $r$ ##

In [None]:
# A small worked example: x is the integers 1 through 6 and y is six
# hand-picked values, stored side by side in table t.
x = np.arange(1, 7, 1)
y = make_array(2, 3, 1, 5, 2, 7)
t = Table().with_columns(
        'x', x,
        'y', y
    )
t

In [None]:
# Scatter plot of the example data, drawn with red markers of size 30.
t.scatter('x', 'y', s=30, color='red')

In [None]:
# Add x and y converted to standard units as two more columns of t.
t = t.with_columns(
        'x (standard units)', standard_units(x),
        'y (standard units)', standard_units(y)
    )
t

In [None]:
# Take the element-wise product of x and y in standard units.
# (This cell only displays the products; it does not add them to table t.)
standard_units(x) * standard_units(y)

In [None]:
# r is the average of the products of x and y in standard units.
r = np.mean(standard_units(x) * standard_units(y))
r

In [None]:
def correlation(aTable, xLabel, yLabel):
    """Return the correlation coefficient r of two columns of a table.

    aTable is a table; xLabel and yLabel are the labels of two of its
    columns. Both columns are converted to standard units, and r is the
    average of the products of the paired values.
    """
    x_su = standard_units(aTable.column(xLabel))
    y_su = standard_units(aTable.column(yLabel))
    return np.mean(x_su * y_su)


In [None]:
# Try the correlation function on the 'x' and 'y' columns of the example table t.
correlation(t,'x','y')

In [None]:
# Rebuild the SUV-only table (same filter as earlier in the notebook) and display it.
suv = hybrid.where('class', 'SUV')
suv

In [None]:
# Use correlation on the suv table to find r for mpg vs msrp.
correlation(suv,'mpg','msrp')

In [None]:
# Use correlation on the suv table to find r for acceleration vs msrp.
correlation(suv,'acceleration','msrp')

 ### What happens if we switch Axes? ###

In [None]:
# Display table t again (it now also holds the standard-units columns).
t

In [None]:
#TODO: look at correlation coefficient for table t

In [None]:
#TODO: look at scatter plot of table t (what do you see when you switch axes?)

In [None]:
# Scatter plot of the standard-units columns with the axes switched:
# 'y (standard units)' is passed first and 'x (standard units)' second.
t.scatter('y (standard units)','x (standard units)')

### Nonlinearity ###

In [None]:
# A nonlinear example: y is exactly x squared, for x from -4 to 4 in steps of 0.5.
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns(
        'x', new_x,
        'y', new_x**2
    )
nonlinear.scatter('x', 'y', s=30, color='r')

In [None]:
# Compute the correlation coefficient for the nonlinear table.
correlation(nonlinear,'x','y')

### Outliers ###

In [None]:
# Table without any outliers: the four points all have y equal to x.
line = Table().with_columns(
        'x', make_array(1, 2, 3, 4),
        'y', make_array(1, 2, 3, 4)
    )
line.scatter('x', 'y', s=30, color='r')

In [None]:
# What is the correlation coefficient for the `line` table above?
correlation(line,'x','y')

In [None]:
# The same four points as `line`, plus one extra point (5, 0) that does not follow y = x.
outlier = Table().with_columns(
        'x', make_array(1, 2, 3, 4, 5),
        'y', make_array(1, 2, 3, 4, 0)
    )
outlier.scatter('x', 'y', s=30, color='r')

In [None]:
# What is the correlation coefficient for the `outlier` table above?
correlation(outlier,'x','y')

### Ecological Correlations ###

In [None]:
# Load sat2014.csv, sort the rows by the 'State' column, and display the table.
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014

In [None]:
# Scatter plot of the 'Critical Reading' and 'Math' columns of sat2014.
sat2014.scatter('Critical Reading', 'Math')

In [None]:
# Correlation coefficient between the 'Critical Reading' and 'Math' columns of sat2014.
correlation(sat2014, 'Critical Reading', 'Math')

## How should we interpret the data/correlation coefficient above?
## Can we make a prediction? Why or why not?