BerkeleyX: Data8.3x

Foundations of Data Science: Prediction and Machine Learning

In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Correlation

Lec 3.1 Visualization

In [None]:
# Load galton.csv (path is relative to this notebook) into a Table.
# The next cell reads its 'midparentHeight' and 'childHeight' columns.
galton = Table.read_table('../../data/galton.csv')

In [None]:
# Pull the two height columns out of the Galton table and pair them up
# in a fresh two-column table with shorter labels.
midparent_heights = galton.column('midparentHeight')
child_heights = galton.column('childHeight')
heights = Table().with_columns(
    'MidParent', midparent_heights,
    'Child', child_heights,
)
print(heights)

# Scatter plot with the mid-parent height on the horizontal axis.
heights.scatter('MidParent')
plt.title('(Sort of) linear correlation between average parent and child height')

In [None]:
# Load hybrid.csv (path is relative to this notebook) and print it as text.
hybrid = Table.read_table('../../data/hybrid.csv')
print(hybrid)
# First plot: 'mpg' against 'msrp'.
# NOTE(review): plt.title is assumed to act on the axes created by the
# scatter call directly above it, so keep each title right after its plot.
hybrid.scatter('mpg', 'msrp')
plt.title('Negative correlation consume/price (!)')
# Second plot: 'acceleration' against 'msrp'.
hybrid.scatter('acceleration', 'msrp')
plt.title('Positive correlation acceleration/price')

In [None]:
# Restrict the population: keep only the rows of `hybrid` whose 'class'
# column is 'SUV', then repeat the mpg/msrp scatter on that subset.

suv = hybrid.where('class', 'SUV')
# Number of rows left after the restriction.
print(suv.num_rows)
suv.scatter('mpg', 'msrp')
plt.title('More linear negative correlation consume/price')

In [None]:
def standard_units(x):
    """Return the values of x converted to standard units.

    Each value is replaced by its deviation from the average of x,
    divided by the standard deviation of x as computed by np.std
    with its default arguments.
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

In [None]:
# Same SUV scatter as before, but with both variables in standard units.
suv_standard_units = Table().with_columns(
    'mpg (standard units)', standard_units(suv.column('mpg')),
    'msrp (standard units)', standard_units(suv.column('msrp')),
)
# Columns are addressed by position: 0 on the x axis, 1 on the y axis.
suv_standard_units.scatter(0, 1)

# Use the same symmetric range on both axes.
plt.xlim(-3, 3)
plt.ylim(-3, 3)
plt.title('Same as above but in standard units')

Lec 3.2 Calculation

The correlation coefficient r, computed from standard units, measures the strength of linear association; its value always lies in the interval [-1, 1].

If r is zero, there is no linear association (the variables may still be related in a non-linear way).

In [None]:
def r_scatter(r):
    """Generate a scatter plot of 1000 points with correlation approximately r.

    x and z are drawn independently from a standard normal distribution and
    y is built as r*x + sqrt(1 - r**2)*z, so the sample correlation between
    x and y is close to r (it varies from call to call because the samples
    are random).

    Parameters
    ----------
    r : number
        Target correlation coefficient; must be in [-1, 1].

    Raises
    ------
    ValueError
        If r is outside [-1, 1] (this includes NaN); sqrt(1 - r**2) would
        otherwise be NaN and the plot would silently come out empty.
    """
    if not -1 <= r <= 1:
        raise ValueError('r must be a correlation coefficient in [-1, 1], got %r' % (r,))

    plt.figure(figsize=(5, 5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Mix x with independent noise z; the weights keep the variance of y at 1.
    y = r*x + (np.sqrt(1-r**2))*z
    plt.scatter(x, y, color='darkblue', s=20)
    # Fixed limits so plots for different r are directly comparable.
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)

In [None]:
# Draw one scatter diagram per target correlation:
# moderately positive, none, strongly negative.
for target_r in (0.6, 0, -0.9):
    r_scatter(target_r)

Calculating  r

In [None]:
# A small hand-made table used to walk through the calculation of r.
x = range(1, 7)
y = [2, 3, 1, 5, 2, 7]
t = Table().with_columns('x', x, 'y', y)

# We expect r to be positive, but less than one.
print(t)
print('---')
t.scatter('x', 'y', s=30, color='red')
plt.title('Original data')

# Step 1: append both variables converted to standard units.
t = t.with_columns(
    'x (standard units)', standard_units(x),
    'y (standard units)', standard_units(y),
)
print(t)
print('---')

# Step 2: append the row-by-row product of the two standard-unit columns.
su_product = t.column('x (standard units)') * t.column('y (standard units)')
t = t.with_column('product of standard units', su_product)
print(t)

In [None]:
# r is the average of the products of standard units,
# i.e. the mean of the last column added to t above.
r = np.mean(t.column('product of standard units'))
print(r)

In [None]:
def correlation(tbl, x, y):
    """Return the correlation coefficient r between two columns of a table.

    tbl is a table; x and y are the labels of the two columns to compare.
    Both columns are converted to standard units and r is the average of
    their element-wise products.
    """
    su_x = standard_units(tbl.column(x))
    su_y = standard_units(tbl.column(y))
    products = su_x * su_y
    return np.average(products)

In [None]:
# r does not depend on which variable is called x and which y.
r_xy = correlation(t, 'x', 'y')
r_yx = correlation(t, 'y', 'x')
print(r_xy, '=', r_yx)

# The same points with the axes swapped.
t.scatter('x', 'y', s=30, color='red')
t.scatter('y', 'x', s=30, color='red')

In [None]:
# Correlation between 'mpg' and 'msrp' for the SUV subset
# (displayed as the cell's output).
correlation(suv, 'mpg', 'msrp')

In [None]:
# Cross-check the hand-written correlation() against SciPy.
# pearsonr is imported from the public scipy.stats namespace;
# scipy.stats.stats is a deprecated private alias.
from scipy.stats import pearsonr

# Only element 0 of the pearsonr result (the coefficient r) is printed here.
print(pearsonr(t.column('x'), t.column('y'))[0], "=", pearsonr(t.column('y'), t.column('x'))[0])
print(pearsonr(suv.column('mpg'), suv.column('msrp'))[0])

Lec 3.3 Interpretation

Nonlinearity

In [None]:
# r is not a meaningful summary here: y = x**2 is completely determined by x,
# but the relationship is not linear, and the x values are symmetric about 0.

new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns('x', new_x, 'y', new_x**2)
nonlinear.scatter('x', 'y', s=30, color='r')

# Print our own r next to the full result returned by pearsonr.
r_own = correlation(nonlinear, 'x', 'y')
r_scipy = pearsonr(nonlinear.column('x'), nonlinear.column('y'))
print(r_own, r_scipy)

Outliers

In [None]:
# Four points lying exactly on the line y = x.
line_x = make_array(1, 2, 3, 4)
line_y = make_array(1, 2, 3, 4)
line = Table().with_columns('x', line_x, 'y', line_y)
line.scatter('x', 'y', s=30, color='r')

r_line = correlation(line, 'x', 'y')
print(r_line)

In [None]:
# A single outlier is enough to destroy the correlation:
# the same four points on y = x as before, plus the extra point (5, 0).

outlier_x = make_array(1, 2, 3, 4, 5)
outlier_y = make_array(1, 2, 3, 4, 0)
outlier = Table().with_columns('x', outlier_x, 'y', outlier_y)
outlier.scatter('x', 'y', s=30, color='r')

r_outlier = correlation(outlier, 'x', 'y')
print(r_outlier)

Ecological Correlation

In [None]:
# Keep in mind what each record represents: per the original author's note,
# every row is an aggregate for one state (plus DC), not an individual.
# The table is loaded from sat2014.csv and sorted by its 'State' column.
sat2014 = Table.read_table('../../data/sat2014.csv').sort('State')
# Display the sorted table as the cell's output.
sat2014

In [None]:
# Scatter the two score columns against each other, then print their r.
x_label = 'Critical Reading'
y_label = 'Math'
sat2014.scatter(x_label, y_label)
print(correlation(sat2014, x_label, y_label))