# Lecture 5 Supplementary Notebook

## DSC 40A, Summer 2024

The following cell sets up the necessary imports – don't worry too much about it.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

from matplotlib_inline.backend_inline import set_matplotlib_formats
# Ask the inline matplotlib backend for SVG output.
set_matplotlib_formats("svg")

# Make DataFrame/Series `.plot` calls go through plotly.
pd.set_option("plotting.backend", "plotly")

# DSC 80 preferred styles, registered as a named plotly template.
pio.templates["dsc80"] = go.layout.Template(
    layout={
        "margin": {"l": 30, "r": 30, "t": 30, "b": 30},
        "autosize": True,
        "xaxis": {"showgrid": True},
        "yaxis": {"showgrid": True},
        "title": {"x": 0.5, "xanchor": "center"},
    }
)

# Combine plotly's built-in "simple_white" template with the one registered above.
pio.templates.default = "simple_white+dsc80"

from IPython.display import HTML

Let's load in the commute times dataset as a `pandas` DataFrame.

In [None]:
# Read the commute times CSV (the path is relative to the notebook's working
# directory) and preview the first few rows.
df = pd.read_csv('data/commute-times.csv')
df.head()

There are many columns in here, but the only ones we're interested in for now are `'departure_hour'` and `'minutes'`.

In [None]:
# Show only the two columns used in the rest of this notebook.
df[['departure_hour', 'minutes']]

In [None]:
# If the plots below don't load for you, run this cell first: it changes plotly's default renderer.
pio.renderers.default = 'plotly_mimetype+notebook'

In [None]:
# Scatter plot of commute time against departure hour.
# Every point gets the same `size` value so that `size_max` fixes one marker size for all of them.
fig = px.scatter(
    df,
    x='departure_hour',
    y='minutes',
    size=np.ones(len(df)) * 50,
    size_max=8,
)
fig.update_xaxes(title='Home Departure Time (AM)')
fig.update_yaxes(title='Minutes to School')
fig.update_layout(title='Commuting Time vs. Home Departure Time', width=700)

## Correlation

$$\begin{align*} r &= \text{the average of the product of $x$ and $y$, when both are in standard units} \\ &= \frac{1}{n} \sum_{i = 1}^n \left( \frac{x_i - \bar{x}}{\sigma_x} \right) \left( \frac{y_i - \bar{y}}{\sigma_y} \right)  \end{align*}$$

In [None]:
def correlation(x, y):
    """Return the correlation coefficient r between x and y.

    r is the mean of the element-wise product of x and y after each has been
    converted to standard units (subtract the mean, divide by the standard
    deviation).

    Parameters
    ----------
    x, y : array-like
        Numeric sequences; each is converted with ``np.array`` before use.

    Returns
    -------
    The mean of the products of the standardized values.
    """
    def standard_units(values):
        # ddof=0 makes np.std divide by n, not n-1.
        return (values - np.mean(values)) / np.std(values, ddof=0)

    x_arr, y_arr = np.array(x), np.array(y)
    return np.mean(standard_units(x_arr) * standard_units(y_arr))

In [None]:
# df['column_name'] is how we access a single column in pandas.
xs = df['departure_hour']  # x variable: departure hour
ys = df['minutes']  # y variable: commute time in minutes

In [None]:
# Correlation between departure hour (xs) and commute time in minutes (ys).
correlation(xs, ys)

In [None]:
# Symmetric! Swapping the arguments gives the same value, because r is the
# mean of a product and multiplication doesn't care about order.
correlation(ys, xs)

In [None]:
# Doesn't change if we multiply x or y by positive constants!
# (Converting to standard units cancels the scaling; a negative constant would flip the sign of r.)
correlation(xs * 1000, ys * 545)

In [None]:
# DataFrames in pandas have a built-in correlation method.
# numeric_only=True restricts it to the numeric columns. Since pandas 2.0 the
# default (numeric_only=False) raises an error as soon as the DataFrame
# contains a non-numeric column.
df.corr(numeric_only=True)

## Implementing $w_0^*$ and $w_1^*$

Recall that the formulas for the optimal slope, $w_1^*$, and the optimal intercept, $w_0^*$, are

$$w_1^* = r \frac{\sigma_y}{\sigma_x}$$

$$w_0^* = \bar{y} - w_1^* \bar{x}$$

In [None]:
def slope(x, y):
    """Return the optimal slope w1* = r * sigma_y / sigma_x for predicting y from x."""
    r = correlation(x, y)
    sigma_x, sigma_y = np.std(x), np.std(y)
    return r * sigma_y / sigma_x

In [None]:
def intercept(x, y):
    """Return the optimal intercept w0* = mean(y) - w1* * mean(x) for predicting y from x."""
    x_bar, y_bar = np.mean(x), np.mean(y)
    return y_bar - slope(x, y) * x_bar

In [None]:
# Optimal intercept and slope for predicting commute time from departure hour.
w0_star = intercept(xs, ys)
w1_star = slope(xs, ys)

# Just fancy printing – ignore the rest of this cell.
# Every LaTeX backslash is written as '\\' so that Python sees a literal
# backslash; a bare '\c' is an invalid escape sequence and triggers a warning
# on recent Python versions.
rule_string = ('$$\\text{Predicted Commute Time (in Minutes)} = ' + 
               f'{round(w0_star, 2)} + {round(w1_star, 2)}' + 
               '\\cdot \\left( \\text{Departure Hour} \\right)$$')
display(HTML(f'<h4>The best linear predictor for this dataset is</h4><br><center>{rule_string}</center>'))

In [None]:
# Overlay the fitted line on the scatter plot stored in `fig`.
# The endpoints are computed from w0_star and w1_star instead of being typed in
# by hand, so the line and the title always match the fitted coefficients.
line_x = np.array([5.5, 11.5])
line_y = w0_star + w1_star * line_x
hline = px.line(x=line_x, y=line_y).update_traces(line={'color': 'red', 'width': 4})
fline1 = go.Figure(fig.data + hline.data)
fline1.update_xaxes(title='Home Departure Time (AM)')
fline1.update_yaxes(title='Minutes to School')

# Print the slope with an explicit sign so the title reads "a - b * x" or "a + b * x".
slope_sign = '-' if w1_star < 0 else '+'
fline1.update_layout(title=('<span style="color:red">Predicted Commute Time</span> = '
                            f'{w0_star:.2f} {slope_sign} {abs(w1_star):.2f} * Departure Hour'))
fline1.update_layout(width=700, margin={'t': 60})

Now that we have $w_0^*$ and $w_1^*$, we can use them to make predictions.

In [None]:
def predict_commute(x_new):
    """Return the predicted commute time for departure hour(s) x_new.

    Applies the fitted line w0_star + w1_star * x_new, reading w0_star and
    w1_star from the notebook's global namespace.
    """
    predicted_minutes = w0_star + w1_star * x_new
    return predicted_minutes

In [None]:
# Predicted commute time, in minutes, for a departure hour of 11.
predict_commute(11)

In [None]:
# Departure hours can be fractional: 7.25 is a quarter past 7.
predict_commute(7.25)

In [None]:
# Does this make sense?
# NOTE(review): 4.5 is presumably earlier than any departure hour in the data
# (the fitted line above is only drawn from 5.5 to 11.5), so this would be an
# extrapolation – confirm against the dataset.
predict_commute(4.5)

## What does $R_{\text{sq}}(w_0, w_1)$ look like?

Let's draw a plot of $R_{\text{sq}}(w_0, w_1)$, the empirical risk that we're trying to minimize.
- When we only had a single parameter, $h$, $R(h)$ was in 2D.
    - One axis for $h$, one axis for $R(h)$.
- Now that we have two parameters, $w_0$ and $w_1$, $R(w_0, w_1)$ will be in 3D!
    - One axis for $w_0$, one axis for $w_1$, one axis for $R(w_0, w_1)$.
    - The bottom plane consists of all possible combinations of slope and intercept.
    - The height of the function above any point $(w_0, w_1)$ on the bottom plane represents the MSE for that combination of intercept and slope.

In [None]:
def mse(y_actual, y_pred):
    """Return the mean squared error between actual and predicted values.

    Both arguments must support element-wise subtraction (e.g. numpy arrays
    or pandas Series).
    """
    errors = y_actual - y_pred
    return np.mean(errors ** 2)

def mse_for_departure_model(w):
    """Return the MSE on the commute data of the line with parameters w.

    w is a length-2 sequence (w0, w1): intercept first, slope second.
    Predictions are w0 + w1 * df['departure_hour'] and are compared against
    df['minutes'].
    """
    intercept_value, slope_value = w
    predictions = intercept_value + slope_value * df['departure_hour']
    return mse(df['minutes'], predictions)

# Grid resolution along each axis. Increase for better resolution, but it will
# run more slowly: the MSE is evaluated num_points**2 times, and values above
# 100 can be really slow.
num_points = 50

# Candidate intercepts (u -> w0) and slopes (v -> w1), laid out on a 2D grid.
uvalues = np.linspace(90, 190, num_points)
vvalues = np.linspace(-13, -3, num_points)
(u, v) = np.meshgrid(uvalues, vvalues)

# One (w0, w1) pair per column; transposed below so each row is one pair.
thetas = np.vstack((u.flatten(), v.flatten()))

# MSE of every (w0, w1) pair on the grid.
MSE = np.array([mse_for_departure_model(t) for t in thetas.T])

loss_surface = go.Surface(x=u, y=v, z=np.reshape(MSE, u.shape))

# Gold marker at the optimal parameters found earlier.
minimizer = go.Scatter3d(x=[w0_star], y=[w1_star], z=[mse_for_departure_model([w0_star, w1_star])], 
                         mode='markers', name='optimal parameters',
                         marker=dict(size=10, color='gold'))

# Use a dedicated name instead of reusing `fig`: `fig` holds the scatter plot
# that the fitted-line cell reads, and overwriting it would break that cell
# if it were re-run after this one.
loss_fig = go.Figure(data=[loss_surface, minimizer])

loss_fig.update_layout(scene=dict(
    xaxis_title="w0",
    yaxis_title="w1",
    zaxis_title="R(w0, w1)"))

loss_fig.show()

## Aside: Pitfalls of correlation

In [None]:
# Load the Anscombe CSV; the cells below read its 'dataset', 'x' and 'y' columns.
anscombe = pd.read_csv('data/anscombe.csv')

In [None]:
# One scatter plot per dataset, laid out on a 2x2 grid.
plt.figure(figsize=(12, 10))

for panel, name in enumerate(['I', 'II', 'III', 'IV'], start=1):
    subset = anscombe[anscombe['dataset'] == name]

    plt.subplot(2, 2, panel)
    plt.scatter(subset['x'], subset['y'], label=f'Dataset {name}', alpha=0.65, s=65)
    plt.title(f'Dataset {name}');

What do all four of these datasets have in common?

In [None]:
# Display the mean, standard deviation and correlation of each dataset,
# each rounded to two decimal places.
for n in ['I', 'II', 'III', 'IV']:
    subset = anscombe[anscombe['dataset'] == n]
    x, y = subset['x'], subset['y']
    r = correlation(x, y)

    mean_x, mean_y = np.round(np.mean(x), 2), np.round(np.mean(y), 2)
    sd_x, sd_y = np.round(np.std(x), 2), np.round(np.std(y), 2)

    outstr = f'''
    <b>Dataset {n}</b><br>
    $\\bar x$: {mean_x}<br>
    $\\bar y$: {mean_y}<br>
    $\\sigma_x$: {sd_x}<br>
    $\\sigma_y$: {sd_y}<br>
    $r$: {np.round(r, 2)}
    '''
    display(HTML(outstr))

They all share (to two decimal places) the same mean and standard deviation of $x$ and $y$, and the same correlation coefficient $r$! This means they all have essentially the same best linear hypothesis function, in the sense of minimizing squared loss.

However, that linear hypothesis function **looks** better for some datasets than it does for others:

In [None]:
# Same 2x2 grid of scatter plots, now with each dataset's best-fit line in red.
plt.figure(figsize=(12, 10))

for panel, name in enumerate(['I', 'II', 'III', 'IV'], start=1):
    subset = anscombe[anscombe['dataset'] == name]
    x, y = subset['x'], subset['y']

    # Fit this dataset on its own and evaluate the line at its x values.
    w0_ans = intercept(x, y)
    w1_ans = slope(x, y)
    fitted = w0_ans + w1_ans * x

    plt.subplot(2, 2, panel)
    plt.scatter(x, y, label=f'Dataset {name}', alpha=0.65, s=65)
    plt.plot(x, fitted, color='red');
    plt.title(f'Dataset {name}');

Moral of the story – visualize your data before trying to fit a prediction rule!

Another example of this phenomenon is the [Datasaurus Dozen 🦕](https://www.autodesk.com/research/publications/same-stats-different-graphs).