BerkeleyX: Data8.3x

Foundations of Data Science: Prediction and Machine Learning

In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Lab 2: Regression

1: How Faithful is Old Faithful? Revisited

In [None]:
# Load the eruption records from CSV; later cells read its "duration" and
# "wait" columns. The bare name on the last line displays the table.
faithful = Table.read_table("../../data/faithful.csv")
faithful

In [None]:
# Summary statistics (mean and SD) for each of the two variables.
duration_mean, duration_std = (
    np.mean(faithful.column("duration")),
    np.std(faithful.column("duration")),
)
wait_mean, wait_std = (
    np.mean(faithful.column("wait")),
    np.std(faithful.column("wait")),
)

# Standard units: subtract the mean, then divide by the SD.
faithful_standard = Table().with_columns(
    "duration (standard units)",
    (faithful.column("duration") - duration_mean) / duration_std,
    "wait (standard units)",
    (faithful.column("wait") - wait_mean) / wait_std,
)
print(faithful_standard)
print(duration_std, wait_std)

In [None]:
# r is the mean of the element-wise products of the two variables once both
# are expressed in standard units.
r = np.mean(
    faithful_standard.column("duration (standard units)")
    * faithful_standard.column("wait (standard units)")
)
r

2: The regression line

In [None]:
def plot_data_and_line(dataset, x, y, point_0, point_1):
    """Scatter-plot columns x and y of dataset and overlay a line segment.

    point_0 and point_1 are (x, y) pairs giving the two endpoints of the
    segment, which is labelled "regression line" in the legend.
    """
    dataset.scatter(x, y, label="data")
    # Regroup the endpoints into x-coordinates and y-coordinates for plt.plot.
    (x0, y0), (x1, y1) = point_0, point_1
    plt.plot((x0, x1), (y0, y1), label="regression line")
    plt.legend(bbox_to_anchor=(1.5, .8))

# Both endpoints satisfy y = r * x, i.e. the segment runs through the origin
# with slope r, drawn from x = -2 to x = 2 over the standard-units data.
plot_data_and_line(faithful_standard, 
                   "duration (standard units)", 
                   "wait (standard units)", 
                   [-2, -2*r], 
                   [2, 2*r])

In [None]:
# Question 2.1 What is the slope of the regression line in original units?
# We predict wait (y) from duration (x), so the slope is r * SD(y) / SD(x).
# The SD ratio must be wait over duration, not the other way round.
slope = r * wait_std / duration_std
print(slope)

# The regression line passes through the point of averages
# (duration_mean, wait_mean), which fixes the intercept.
intercept = wait_mean - slope * duration_mean
print(intercept)

3: Investigating the regression line

In [None]:
# Question 3.1 Compute the predicted waiting time for an eruption that lasts 2 minutes,
# and for an eruption that lasts 5 minutes
def prediction_su(x):
    """Predict the waiting time after an eruption of duration x.

    The duration is converted to standard units, multiplied by r to get the
    predicted wait in standard units, and then converted back to the
    original units of the wait column.
    """
    duration_in_su = (x - duration_mean) / duration_std
    predicted_wait_in_su = duration_in_su * r
    return predicted_wait_in_su * wait_std + wait_mean

two_minute_predicted_waiting_time, five_minute_predicted_waiting_time = (
    prediction_su(2),
    prediction_su(5),
)

# Helper that prints a prediction as a sentence
# (it does not need to be modified):
def print_prediction(duration, predicted_waiting_time):
    """Print one sentence reporting the predicted wait for the given duration."""
    sentence_parts = (
        "After an eruption lasting", duration,
        "minutes, we predict you'll wait", predicted_waiting_time,
        "minutes until the next eruption.",
    )
    # print() joins the parts with single spaces and converts each with str().
    print(*sentence_parts)

# Report the two predictions computed above, for durations of 2 and 5.
print_prediction(2, two_minute_predicted_waiting_time)
print_prediction(5, five_minute_predicted_waiting_time)

In [None]:
# Draw the regression line in original units through the two predicted
# points (2, predicted wait at 2) and (5, predicted wait at 5).
plot_data_and_line(faithful, "duration", "wait", 
                   [2, two_minute_predicted_waiting_time], 
                   [5, five_minute_predicted_waiting_time])

In [None]:
# Question 3.2 Make predictions for the waiting time after each eruption in the faithful table
# Expected layout of the result (first row shown):
# duration,wait,predicted wait
# 3.6,79,72.1011

# Slope (a) and intercept (b) of the regression line in original units.
a = r * (wait_std / duration_std)
b = wait_mean - a * duration_mean

# Apply the line to every value in column 0 and attach the result as a new column.
faithful_predictions = faithful.with_column(
    'predicted wait',
    b + a * faithful.column(0),
)

faithful_predictions

In [None]:
# Question 3.3 How close were we? We computed the residual for each eruption in the dataset

# Residual = column 1 minus column 2 of faithful_predictions.
# NOTE(review): presumably column 1 is the actual wait and column 2 the
# 'predicted wait' column added earlier; this relies on faithful having
# exactly two columns (duration, wait) -- confirm.
residual = faithful_predictions.column(1) - faithful_predictions.column(2)
faithful_residuals = faithful_predictions.with_column('residual', residual)

# Residual plot: residual against duration, drawn in red.
faithful_residuals.scatter("duration", "residual", color="r")

4: How accurate are different predictions?

In [None]:
# One figure overlaying three things against duration: the actual waits
# (blue), the residuals (red), and the regression segment between the
# predictions at durations 2 and 5.
faithful_residuals.scatter("duration", "wait", label="actual waiting time", color="blue")
plt.scatter(faithful_residuals.column("duration"), faithful_residuals.column("residual"), label="residual", color="r")
plt.plot([2, 5], [two_minute_predicted_waiting_time, five_minute_predicted_waiting_time], label="regression line")
# Trailing semicolon suppresses the textual repr of the returned object.
plt.legend(bbox_to_anchor=(1.7,.8));

In [None]:
# Question 4.1
# Predict the waiting time for eruptions lasting 0 minutes, 2.5 minutes, and an hour,
# using the regression line wait = a * duration + b.

# At duration 0 the line's value is just the intercept.
zero_minute_predicted_waiting_time = b
print_prediction(0, zero_minute_predicted_waiting_time)

two_point_five_minute_predicted_waiting_time = b + a * 2.5
print_prediction(2.5, two_point_five_minute_predicted_waiting_time)

# An hour is 60 minutes.
hour_predicted_waiting_time = b + a * 60
print_prediction(60, hour_predicted_waiting_time)

5: Divide and Conquer

In [None]:
# Scatter the data, then draw a vertical segment at duration = 3 (from
# wait = 40 to wait = 100) marking where the data will be split.
faithful.scatter("duration", "wait", label="actual waiting time", color="blue")
plt.plot([3, 3], [40, 100]);

In [None]:
def standard_units(any_numbers):
    """Return any_numbers in standard units: (value - mean) / SD.

    any_numbers is expected to be a NumPy array; np.std is used with its
    default arguments.
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def standardize(t):
    """Build a new table holding every column of t in standard units.

    Each output column keeps the position of its source column and is
    labelled with the source label followed by ' (su)'.
    """
    standardized = Table()
    for column_label in t.labels:
        su_label = column_label + ' (su)'
        su_values = standard_units(t.column(column_label))
        standardized = standardized.with_column(su_label, su_values)
    return standardized

In [None]:
# Question 5.1 - r for the points with a duration below 3 and then for all the points with a duration above 3
def reg_coeff(t):
    """Return r for columns 0 and 1 of t.

    r is computed as the mean of the element-wise products of the two
    columns after both have been converted to standard units.
    """
    standardized = standardize(t)
    x_su = standardized.column(0)
    y_su = standardized.column(1)
    return np.mean(x_su * y_su)

# Split the eruptions at duration 3 and compute r separately on each side.
# NOTE(review): are.below / are.above look like strict comparisons, so a row
# with duration exactly 3 would land in neither subset -- confirm whether any
# such rows exist and whether one side should include them.
below_3 = faithful.where('duration', are.below(3))
above_3 = faithful.where('duration', are.above(3))
below_3_r = reg_coeff(below_3)
above_3_r = reg_coeff(above_3)
print("For points below 3, r is", below_3_r, "; for points above 3, r is", above_3_r)

In [None]:
# Question 5.2 - Write functions slope_of() and intercept_of()
def slope_of(t, r):
    """Slope of the regression line for t, in original units.

    Column 0 of t holds the x values and column 1 the y values; r is the
    coefficient computed for those two columns. The slope is
    r * SD(y) / SD(x).
    """
    y_sd = np.std(t.column(1))
    x_sd = np.std(t.column(0))
    return r * y_sd / x_sd


def intercept_of(t, r):
    """Return the intercept of the regression line for t in original units.

    Assume that column 0 contains x values and column 1 contains y values.
    r is the coefficient for x and y that is passed on to slope_of.
    The line passes through the point of averages, so the intercept is
    mean(y) - slope * mean(x).
    """
    s = slope_of(t, r)
    return np.mean(t.column(1)) - s * np.mean(t.column(0))


# Slope (a) and intercept (b) of a separate regression line for each subset,
# each using that subset's own r.
below_3_a = slope_of(below_3, below_3_r)
below_3_b = intercept_of(below_3, below_3_r)
above_3_a = slope_of(above_3, above_3_r)
above_3_b = intercept_of(above_3, above_3_r)


def wait_below_3(duration):
    """Predicted wait for duration using the line fitted to the below_3 subset."""
    return below_3_b + below_3_a * duration


def wait_above_3(duration):
    """Predicted wait for duration using the line fitted to the above_3 subset."""
    return above_3_b + above_3_a * duration

In [None]:
# Scatter column 1 against column 0, then overlay the two fitted lines:
# the below-3 line drawn from duration 1 to 3, the above-3 line from 3 to 6.
faithful.scatter(0, 1)
plt.plot([1, 3], [wait_below_3(1), wait_below_3(3)])
plt.plot([3, 6], [wait_above_3(3), wait_above_3(6)]);

In [None]:
# Question 5.3 Write predict_wait()
def predict_wait(duration):
    """Predict the wait for a single duration using one of the two fitted lines.

    Durations strictly below 3 use the below-3 line; every other duration
    (including exactly 3) uses the above-3 line.
    """
    if duration < 3:
        return wait_below_3(duration)
    return wait_above_3(duration)

In [None]:
# Apply predict_wait to each duration, attach the result as a 'predicted'
# column, and scatter the table with column 0 on the horizontal axis.
faithful.with_column('predicted', faithful.apply(predict_wait, 'duration')).scatter(0)