# Correlation Coefficients

In [None]:
from datascience import *
from cs104 import *
import numpy as np
%matplotlib inline

## 1. Finch data

In [None]:
# Load the 1975 finch beak measurements and preview the first six rows.
# read_table is called on the Table class itself (as the SAT cells later in
# this notebook do) rather than on a throwaway empty Table() instance.
finch_1975 = Table.read_table("data/finch_beaks_1975.csv")
finch_1975.show(6)

In [None]:
# Keep only the rows whose 'species' value is 'fortis', then show the row count.
fortis = finch_1975.where('species', 'fortis')
fortis.num_rows

In [None]:
# Keep only the rows whose 'species' value is 'scandens', then show the row count.
scandens = finch_1975.where('species', 'scandens')
scandens.num_rows

In [None]:
# Scatter beak depth (y) against beak length (x) for the fortis rows and title the plot.
plot = fortis.scatter('Beak length, mm', 'Beak depth, mm')
plot.set_title('Fortis Finches, 1975')

In [None]:
# Same fortis scatter as before, this time with fit_line=True to overlay a fitted line.
plot = fortis.scatter('Beak length, mm', 'Beak depth, mm', fit_line=True)
plot.set_title('Fortis Finches, 1975') 

In [None]:
# Scatter every finch, grouped by species.  The result is bound to `plot` so
# that the title lands on THIS figure; previously set_title was applied to the
# stale `plot` left over from the fortis-only cell.  The title no longer says
# "Fortis" because both species are shown.
plot = finch_1975.scatter('Beak length, mm', 'Beak depth, mm', group='species')
plot.set_title('Finches by Species, 1975')

## 2. Correlation Intuition

Visualize different values of r:

In [None]:

def make_table(r, x_mean, y_mean, size, seed=10):
    """
    Build a two-column table ("x", "y") of `size` random points whose
    correlation coefficient is approximately r.

    Both columns are shifted so that their sample means are exactly
    x_mean and y_mean.  The draws come from a generator seeded with
    `seed`, so the same arguments always produce the same points.
    """
    generator = np.random.default_rng(seed)
    x_draws = generator.normal(x_mean, 1, size)
    noise = generator.normal(y_mean, 1, size)

    # Mix x with the second, independent draw: weight r on x and
    # sqrt(1 - r^2) on the noise.
    noise_weight = np.sqrt(1 - r**2)
    y_draws = r * x_draws + noise_weight * noise

    # Re-center each column on the requested mean.
    centered_x = x_draws - np.mean(x_draws) + x_mean
    centered_y = y_draws - np.mean(y_draws) + y_mean

    return Table().with_columns("x", centered_x, "y", centered_y)

def plot_table(table, color='C4', fit_line=True, **kwargs):
    """
    Scatter the table's "x" column against its "y" column and return the plot.

    The points are drawn half-transparent in `color`; `fit_line` and any
    extra keyword arguments are forwarded to table.scatter.  White lines are
    added at x = 0 and y = 0, and both axes are fixed to the range [-4, 4].
    """
    axes = table.scatter("x", "y", alpha=0.5, fit_line=fit_line, color=color, **kwargs)

    # Reference lines through the origin share one style.
    reference_style = dict(color='white', width=4, zorder=0.9)
    axes.line(x=0, **reference_style)
    axes.line(y=0, **reference_style)

    # Use the same window for every plot so different tables are comparable.
    axes.set_xlim(-4, 4)
    axes.set_ylim(-4, 4)
    return axes

def r_scatter(r):
    """
    Draw a scatter plot of 500 generated points, centered at (0, 0), whose
    correlation is approximately r, and put r in the plot title.
    """
    points = make_table(r, 0, 0, 500)
    figure = plot_table(points)
    figure.set_title(f'r = {r!s}')

In [None]:
# Redraw r_scatter as the slider for r moves.
# NOTE(review): the Slider arguments (-1, 1, 0.01) are presumably min, max, step — confirm against cs104.
interact(r_scatter, r = Slider(-1,1,0.01))

Here are several scatter plots with representative values of r.

## 3. Implementing Pearson's Correlation Coefficient

Here is our formula for $r$:

$$
r = \frac{
  \sum_i (x_i - \bar{x})(y_i - \bar{y})
}{
  \sqrt{\sum_i(x_i - \bar{x})^2}\sqrt{\sum_i(y_i - \bar{y})^2}
}
$$

And its implementation:

In [None]:
def pearson_correlation(table, x_label, y_label):
    """
    Return Pearson's correlation coefficient r between two columns of a table.

    r captures the sign and strength of the linear association between
    the columns:

        r = sum((x - x_mean) * (y - y_mean))
            / (sqrt(sum((x - x_mean)**2)) * sqrt(sum((y - y_mean)**2)))

    Parameters:
        table:   an object whose .column(label) returns a NumPy array.
        x_label: label of the first column.
        y_label: label of the second column.

    Returns:
        The coefficient r.  The formula is symmetric in x and y, so swapping
        the labels gives the same value.  r is undefined when either column
        is constant, because the denominator is then zero.
    """
    x = table.column(x_label)
    y = table.column(y_label)

    # Compute each column's deviations from its mean once and reuse them.
    x_dev = x - np.mean(x)
    y_dev = y - np.mean(y)

    # np.sum reduces the arrays inside NumPy; the builtin sum would walk them
    # one element at a time in Python.
    numerator = np.sum(x_dev * y_dev)
    denominator = np.sqrt(np.sum(x_dev**2)) * np.sqrt(np.sum(y_dev**2))
    return numerator / denominator

In [None]:
# Correlation between beak length and beak depth among the fortis rows.
fortis_r = pearson_correlation(fortis, 'Beak length, mm', 'Beak depth, mm')
fortis_r

In [None]:
# Correlation between beak length and beak depth among the scandens rows.
scandens_r = pearson_correlation(scandens, 'Beak length, mm', 'Beak depth, mm')
scandens_r

In [None]:
# Scatter all finches grouped by species, with fit_line=True.
finch_1975.scatter('Beak length, mm', 'Beak depth, mm', fit_line=True, group="species")

In [None]:
# NOTE(review): this cell repeats the earlier fortis_r cell verbatim and
# recomputes the same quantity.
fortis_r = pearson_correlation(fortis, 'Beak length, mm', 'Beak depth, mm')
fortis_r

In [None]:
# NOTE(review): this cell repeats the earlier scandens_r cell verbatim and
# recomputes the same quantity.
scandens_r = pearson_correlation(scandens, 'Beak length, mm', 'Beak depth, mm')
scandens_r

In [None]:
# NOTE(review): this cell repeats the earlier grouped scatter (fit_line=True) verbatim.
finch_1975.scatter('Beak length, mm', 'Beak depth, mm', fit_line=True, group="species")

Correlation is symmetric in $x$ and $y$, so switching the axes shouldn't change $r$.

In [None]:
# Same scandens computation with the two column labels swapped.
# Note: this rebinds scandens_r, replacing the value computed earlier.
scandens_r = pearson_correlation(scandens, 'Beak depth, mm', 'Beak length, mm')
scandens_r

## 4. Watch out for... 

### Nonlinearity

In [None]:
# Points on the parabola y = x**2, with x running from -4 to 4 in steps of 0.5
# (the stop value 4.1 is there so that 4 itself is included).
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns(
        'x', new_x,
        'y', new_x**2
    )
nonlinear.scatter('x', 'y', s=50, color='red')

In [None]:
# r for the parabola data: y is fully determined by x, but not linearly.
pearson_correlation(nonlinear, 'x', 'y')

### Outliers

What can cause outliers? What should you do when you encounter them?

In [None]:
# Five points lying exactly on the line y = x.
line = Table().with_columns(
        'x', make_array(1, 2, 3, 4,5),
        'y', make_array(1, 2, 3, 4,5)
    )
line.scatter('x', 'y', s=50, color='red')

In [None]:
# r for the five points on the line y = x.
pearson_correlation(line, 'x', 'y')

In [None]:
# Same x values as the `line` table, but the last y is 0 instead of 5.
outlier = Table().with_columns(
        'x', make_array(1, 2, 3, 4, 5),
        'y', make_array(1, 2, 3, 4, 0)
    )
outlier.scatter('x', 'y', s=50, color='red')

In [None]:
# r once the last point has been moved off the line.
pearson_correlation(outlier, 'x', 'y')

### False Correlations due to Data Aggregation

In [None]:
# Load the SAT table, sort it by 'State', and display it.
sat2014 = Table.read_table('data/sat2014.csv').sort('State')
sat2014

In [None]:
# Scatter 'Math' (y) against 'Critical Reading' (x), one point per row of sat2014.
sat2014.scatter('Critical Reading', 'Math')

In [None]:
# r between the 'Critical Reading' and 'Math' columns of sat2014.
pearson_correlation(sat2014, 'Critical Reading', 'Math')

In [None]:
# Load the made-up SAT data and sort it by 'State'.
sat_fake = Table.read_table('data/sat-bogus.csv').sort('State')  

Here is some fake data to illustrate how aggregating data can create a misleading correlation:

In [None]:
# Aggregate the fake data per 'State' with np.mean; the aggregated columns
# are the ' mean'-suffixed labels used below.
sat_fake_g = sat_fake.group('State', np.mean)

# Draw the aggregated and the raw data in one figure.
# NOTE(review): Figure(1,2, ...) is presumably a 1-row, 2-column layout — confirm against cs104.
with Figure(1,2, figsize=(6,4)):
    sat_fake_g.scatter('Critical Reading mean', 'Math mean', fit_line=True)
    sat_fake.scatter('Critical Reading', 'Math', group='State')

# Uncomment to print r for the aggregated table and for the raw table.
# print(pearson_correlation(sat_fake_g, 'Critical Reading mean', 'Math mean'))
# print(pearson_correlation(sat_fake, 'Critical Reading', 'Math'))

$r$ is 0.98 for the per-state means on the left, but -0.09 for the un-aggregated data on the right!