In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Correlation

In [None]:
# Load hybrid.csv from the web into a Table and display it.
hybrid = Table.read_table('http://inferentialthinking.com/notebooks/hybrid.csv')
hybrid

In [None]:
# Plot msrp (vertical) against acceleration (horizontal) for every row of hybrid.
hybrid.scatter('acceleration', 'msrp')

In [None]:
# Plot msrp (vertical) against mpg (horizontal) for every row of hybrid.
hybrid.scatter('mpg', 'msrp')

In [None]:
# Keep only the rows whose class is 'SUV', then plot msrp against mpg for them.
suv = hybrid.where('class', 'SUV')
suv.scatter('mpg', 'msrp')

In [None]:
# Plot msrp against acceleration for the SUV rows only.
suv.scatter('acceleration', 'msrp')

In [None]:
def standard_units(any_numbers):
    """Return any_numbers rescaled to standard units.

    Each value is expressed as its deviation from the mean of any_numbers,
    divided by the standard deviation of any_numbers as computed by np.std.
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    deviations = any_numbers - center
    return deviations / spread

In [None]:
# Pull the msrp column of suv out as an array and show it in standard units.
msrp = suv.column('msrp')
standard_units(msrp)

In [None]:
# Plot each MSRP (column 0) against the same value in standard units (column 1).
Table().with_columns(
    'MSRP', msrp,
    'MSRP (su)', standard_units(msrp)
).scatter(0, 1)

In [None]:
# Proportion of SUV prices whose standard-unit value is positive,
# i.e. the proportion of prices above the mean.
np.count_nonzero(standard_units(msrp) > 0) / len(msrp)

In [None]:
def standardize(t):
    """Build a new table holding every column of t in standard units.

    Each output column is labeled with the source label followed by ' (su)'.
    """
    converted = Table()
    for name in t.labels:
        su_values = standard_units(t.column(name))
        converted = converted.with_column(name + ' (su)', su_values)
    return converted

# Show the mpg and msrp columns of suv converted to standard units.
standardize(suv.select('mpg', 'msrp'))

In [None]:
# Scatter mpg against msrp in standard units, with both axes fixed to [-3, 3].
standardize(suv.select('mpg', 'msrp')).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3)

In [None]:
# Scatter acceleration against msrp in standard units, with both axes fixed to [-3, 3].
standardize(suv.select('acceleration', 'msrp')).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3)

## Visualizing Correlations

In [None]:
def r_scatter(r, sample_size=1000):
    """Generate a scatter plot with a correlation approximately r.

    x and z are independent draws from a normal distribution with mean 0 and
    SD 1. y = r*x + sqrt(1 - r**2)*z mixes them, so the plotted (x, y) cloud
    has correlation close to r.

    Parameters
    ----------
    r : float
        Target correlation; must lie in [-1, 1].
    sample_size : int, optional
        Number of points to draw. Defaults to 1000.

    Raises
    ------
    ValueError
        If r is outside [-1, 1] (NaN included). Without this check
        sqrt(1 - r**2) is NaN and the plot comes out silently empty.
    """
    # Validate before any figure is created so a bad r leaves no stray plot.
    if not -1 <= r <= 1:
        raise ValueError("r must be between -1 and 1, got {!r}".format(r))
    plots.figure(figsize=(5, 5))
    x = np.random.normal(0, 1, sample_size)
    z = np.random.normal(0, 1, sample_size)
    # Weighting z by sqrt(1 - r**2) keeps the variance of y at 1.
    y = r*x + (np.sqrt(1-r**2))*z
    plots.scatter(x, y)
    # Fixed limits so plots for different r are directly comparable.
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)

In [None]:
# r = 0: y is built from z alone, independent of x.
r_scatter(0)

In [None]:
# Small positive r: the x term contributes only slightly to y.
r_scatter(0.2)

In [None]:
# Moderate positive r.
r_scatter(0.5)

In [None]:
# Large positive r: the x term dominates y.
r_scatter(0.8)

In [None]:
# r close to 1: y is almost entirely r*x, with only a small z component.
r_scatter(0.99)

In [None]:
# Negative r flips the sign of the x term in y.
r_scatter(-0.5)

## Calculating r

In [None]:
# Small six-point example: x runs from 1 through 6, y is hand-picked.
x = np.arange(1, 7, 1)
y = make_array(2, 3, 1, 5, 2, 7)
t = Table().with_columns(
        'x', x,
        'y', y
    )
t

In [None]:
# Plot y (column 1) against x (column 0); s=40 sets the dot size.
t.scatter(0, 1, s=40)

In [None]:
# The same points after converting both columns to standard units.
standardize(t).scatter(0, 1, s=40)

In [None]:
# Element-wise product of x and y, each in standard units.
su_products = standard_units(t.column('x')) * standard_units(t.column('y'))

In [None]:
# Show the standardized columns next to their products.
standardize(t).with_column('product', su_products)

In [None]:
# Average of the standard-unit products.
np.mean(su_products)

In [None]:
def correlation(t, x, y):
    """Return the mean of the products of columns x and y of t in standard units.

    x and y are column identifiers handed to t.column.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)

# Correlation of the six-point example table.
correlation(t, 'x', 'y')

In [None]:
# Correlation between acceleration and msrp among the SUV rows.
correlation(suv, 'acceleration', 'msrp')

In [None]:
# Scatter plot of the same two columns, to compare with the value of r.
suv.scatter('acceleration', 'msrp')

In [None]:
# Correlation between mpg and msrp among the SUV rows.
correlation(suv, 'mpg', 'msrp')

In [None]:
# Scatter plot of the same two columns, to compare with the value of r.
suv.scatter('mpg', 'msrp')

In [None]:
# Column arguments swapped: the product inside correlation() is commutative,
# so this equals correlation(suv, 'mpg', 'msrp').
correlation(suv, 'msrp', 'mpg')

In [None]:
# Re-express msrp in thousands (divide by 1000); with_column on an existing
# label replaces that column in the returned table. Then plot against mpg again.
suv_thousands = suv.with_column('msrp', suv.column('msrp') / 1000)
suv_thousands.scatter('mpg', 'msrp')

In [None]:
# Standard units cancel the division by 1000, so this should match
# correlation(suv, 'mpg', 'msrp') up to floating-point rounding.
correlation(suv_thousands, 'mpg', 'msrp')