# Lecture 23: Correlation

In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Prediction

In [None]:
galton = Table.read_table('galton.csv')
galton.show(3)

In [None]:
def predict_child(new_midparent_height, window=0.5):
    """Predict a child's height from a midparent height.

    The prediction is the mean 'childHeight' over the rows of the global
    ``galton`` table whose 'midparentHeight' lies within ``window`` of
    ``new_midparent_height``.

    Parameters
    ----------
    new_midparent_height : number
        Midparent height to predict for, in the same units as the
        'midparentHeight' column.
    window : number, optional
        Half-width of the neighbourhood around ``new_midparent_height``.
        Defaults to 0.5, the value that used to be hard-coded here.

    Returns
    -------
    The mean of the selected 'childHeight' values.

    Notes
    -----
    NOTE(review): ``are.between`` is documented as including the lower bound
    and excluding the upper bound -- confirm for the installed datascience
    version. If no row falls inside the window the mean is taken over an
    empty column (presumably NaN -- verify).
    """
    close_points = galton.where(
        'midparentHeight',
        are.between(new_midparent_height - window, new_midparent_height + window),
    )
    return close_points.column('childHeight').mean()

In [None]:
# Apply predict_child to every value in the 'midparentHeight' column and
# store the results alongside the original data in a new 'prediction' column.

height_pred = galton.with_column(
    'prediction', galton.apply(predict_child, 'midparentHeight')
)
height_pred

In [None]:
height_pred.select('childHeight', 'midparentHeight', 'prediction')\
.scatter('midparentHeight')

In [None]:
# Repeat the prediction and scatter plot on a 30-row sample of galton.
# NOTE(review): the sample is drawn without a fixed seed, so the rows (and
# therefore the plot) presumably differ from run to run.
small = galton.sample(30)
small = small.with_column(
    'prediction', small.apply(predict_child, 'midparentHeight')
)
small.select('childHeight', 'midparentHeight', 'prediction').scatter('midparentHeight')

In [None]:
small.select('childHeight', 'midparentHeight', 'prediction')\
.scatter('midparentHeight',fit_line=True)

## Visualize relation between two variables

In [None]:
# hybrid cars manufactured through 2013
# - msrp is adjusted to 2013 dollars
# - acceleration is in km per hour per second
hybrid = Table.read_table('hybrid.csv')
hybrid

In [None]:
# price vs. mileage
hybrid.scatter('mpg', 'msrp') 

**Q:** Is there a relation between msrp and mpg?  Is it linear?

* A. No relation  
* B. Yes, linear  
* C. Yes, non-linear  

In [None]:
# price vs. mileage
hybrid.scatter('mpg', 'msrp', fit_line=True)

In [None]:
# price vs. acceleration
hybrid.scatter('acceleration', 'msrp')

**Q:** Is there a relation between msrp and acceleration? Is it linear?

* A. No relation  
* B. Yes, linear  
* C. Yes, non-linear  


In [None]:
# price vs. acceleration
hybrid.scatter('acceleration', 'msrp', fit_line=True)

**Q:** What are people paying for (at least with hybrid cars)?  Do the data tell us why?

In [None]:
hybrid.scatter('mpg', 'acceleration', fit_line=True)

## Quantify linearity

In [None]:
# Plotting helper for the examples below -- you can ignore the details of this code
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points (x, y), where x and z are independent samples from
    np.random.normal(0, 1) and y = r*x + sqrt(1 - r**2)*z, then plots them
    on a new 5-by-5 figure with both axes limited to [-4, 4].

    Parameters
    ----------
    r : number
        Target correlation; must satisfy -1 <= r <= 1.

    Raises
    ------
    ValueError
        If r is outside [-1, 1]. Without this check sqrt(1 - r**2) is NaN,
        every y value becomes NaN and the plot is silently empty.
    """
    # Validate before creating the figure so a bad r leaves no empty plot.
    if not -1 <= r <= 1:
        raise ValueError(
            "r must be between -1 and 1 (inclusive), got {!r}".format(r)
        )
    plots.figure(figsize=(5, 5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Mixing x with independent noise z in these proportions keeps the
    # spread of y equal to that of x while tying y to x with strength r.
    y = r * x + np.sqrt(1 - r**2) * z
    plots.scatter(x, y)
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)

In [None]:
r_scatter(1)

In [None]:
r_scatter(0)

In [None]:
r_scatter(-1)

**Q:** What other r values would you like to see?

In [None]:
r_scatter(0.5)

In [None]:
r_scatter(0.9)

## Computation of *r*
### Standard units

In [None]:
def standard_units(numbers):
    """Return numbers converted to standard units.

    The mean of numbers (np.mean) is subtracted from every value and the
    result is divided by the standard deviation of numbers (np.std).
    """
    deviations = numbers - np.mean(numbers)
    return deviations / np.std(numbers)

In [None]:
msrp = hybrid.column('msrp')
standard_units(msrp)

In [None]:
# price vs. acceleration
hybrid.scatter('acceleration', 'msrp')

# price vs. acceleration in standard units
hybrid_msrp_su = hybrid.with_column(
    'msrp (su)', 
    standard_units(hybrid.column('msrp'))
)
hybrid_msrp_su.scatter('acceleration', 'msrp (su)')

**Q:** How do those two charts differ?

In [None]:
# price vs. acceleration
hybrid.scatter('acceleration', 'msrp')

# price in standard units vs. acceleration in standard units
hybrid_msrp_accel_su = hybrid_msrp_su.with_column(
    'acceleration (su)', 
    standard_units(hybrid.column('acceleration'))
)
hybrid_msrp_accel_su.scatter('acceleration (su)', 'msrp (su)')

In [None]:
def standardize(t):
    """Build a new table holding every column of t in standard units.

    Each output column is labelled with the source column's label followed
    by ' (su)' and contains standard_units() of that column's values.
    """
    result = Table()
    for name in t.labels:
        su_label = name + ' (su)'
        su_values = standard_units(t.column(name))
        result = result.with_column(su_label, su_values)
    return result

In [None]:
accel_msrp_su = standardize(hybrid.select('acceleration', 'msrp'))
accel_msrp_su

In [None]:
accel_msrp_su.scatter('acceleration (su)', 'msrp (su)')

In [None]:
# mileage vs. price, both in standard units
standardize(hybrid.select('mpg', 'msrp'))\
.scatter('mpg (su)', 'msrp (su)')

### Average of (product of (variables in standard units))

In [None]:
# Compute r value for acceleration and msrp
accel_msrp = hybrid.select('acceleration', 'msrp')
accel_msrp

In [None]:
# variables in standard units
accel_msrp_su = standardize(accel_msrp)
accel_msrp_su

In [None]:
# product of (variables in standard units)
product = accel_msrp_su.column('acceleration (su)') * accel_msrp_su.column('msrp (su)')
product

In [None]:
accel_msrp_su_product = accel_msrp_su.with_column(
    'product', product
)
accel_msrp_su_product

In [None]:
# average of (product of (variables in standard units))
np.mean(product)

**Q:** Does that seem right?  How would you know?

<br/><br/><br/><br/><br/><br/><br/><br/>

In [None]:
accel_msrp.scatter('acceleration', 'msrp', fit_line=True)

In [None]:
r_scatter(0.7)

### A function to compute *r*

In [None]:
def correlation(t, x, y):
    """Return r for columns x and y of table t.

    Both columns are converted with standard_units(); the result is the
    mean of the element-wise product of the two converted columns.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)

In [None]:
# r for acceleration vs. msrp. correlation() reads only the two named
# columns, so the full table can be passed without selecting them first.
correlation(hybrid, 'acceleration', 'msrp')

**Q:** What do you expect the correlation to be for the next chart?

* A. Below -1  
* B. Between -1 and 0  
* C. About 0  
* D. Between 0 and 1  
* E. Above 1  

In [None]:
hybrid.scatter('mpg', 'msrp')

In [None]:
# r for mpg vs. msrp. correlation() reads only the two named columns, so
# the full table can be passed without selecting them first.
correlation(hybrid, 'mpg', 'msrp')