# Lecture 4 Supplementary Notebook

## DSC 40A, Summer 2024

The following cell sets up the necessary imports – don't worry too much about it.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats("svg")

# Make pandas' `.plot` accessor produce plotly figures.
pd.options.plotting.backend = "plotly"

# Register the course plotting template under the name "dsc80":
# 30px margins on every side, gridlines on both axes, horizontally centered titles.
pio.templates["dsc80"] = go.layout.Template(
    layout={
        "margin": {"l": 30, "r": 30, "t": 30, "b": 30},
        "autosize": True,
        "xaxis": {"showgrid": True},
        "yaxis": {"showgrid": True},
        "title": {"x": 0.5, "xanchor": "center"},
    }
)
# Use "simple_white" combined with the "dsc80" template for every figure by default.
pio.templates.default = "simple_white+dsc80"

Let's load in the commute times dataset as a `pandas` DataFrame.

In [None]:
# Read the commute-times CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('commute-times.csv')
# Preview the first few rows to see which columns are available.
df.head()

There are many columns in here, but the only ones we're interested in for now are `'departure_hour'` and `'minutes'`.

In [None]:
df[['departure_hour', 'minutes']]

In [None]:
# Explicitly choose the plotly renderer. If the plots below don't load for you, run this cell first.
pio.renderers.default = 'plotly_mimetype+notebook'

In [None]:
# Scatter plot of commute duration (minutes) against home departure hour.
# Passing a constant `size` array together with `size_max` gives every marker
# the same fixed size.
fig = px.scatter(
    df,
    x='departure_hour',
    y='minutes',
    size=np.ones(len(df)) * 50,
    size_max=8,
)
fig.update_xaxes(title='Home Departure Time (AM)')
fig.update_yaxes(title='Minutes to Work')
# The last expression returns the figure, so the cell displays it.
fig.update_layout(title='Commuting Time vs. Home Departure Time', width=700)

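Let's implement the formulas we just found for the best slope, $w_1^*$, and the best intercept, $w_0^*$:

$$w_1^* = \frac{\sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^n (x_i - \bar{x})^2}, \qquad w_0^* = \bar{y} - w_1^* \bar{x}$$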

In [None]:
def find_best_slope(x, y):
    """Return the least-squares slope for predicting `y` from `x`.

    Computed as sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2).
    `x` and `y` are assumed to be two Series.
    """
    x_dev = x - np.mean(x)
    y_dev = y - np.mean(y)
    return (x_dev * y_dev).sum() / (x_dev ** 2).sum()

def find_best_intercept(x, y):
    """Return the least-squares intercept for predicting `y` from `x`.

    Computed as mean(y) - slope * mean(x), where the slope comes from
    `find_best_slope(x, y)`.
    """
    slope = find_best_slope(x, y)
    return y.mean() - slope * x.mean()

In [None]:
best_slope = find_best_slope(df['departure_hour'], df['minutes'])
best_slope

In [None]:
best_intercept = find_best_intercept(df['departure_hour'], df['minutes'])
best_intercept

The results above tell us that the linear hypothesis function with the lowest mean squared error on our dataset is:

$$\text{predicted commute time (minutes)} = 142.45 - 8.19 \cdot \text{departure hour}$$

We can use it to make predictions:

In [None]:
def predict_commute(x_new):
    """Evaluate the fitted line at `x_new`.

    Uses the notebook-level `best_intercept` and `best_slope` computed in the
    cells above, returning `best_intercept + best_slope * x_new`.
    """
    slope_term = best_slope * x_new
    return best_intercept + slope_term

What commute time do we predict if I leave at 8 AM? What about 10:45 AM?

In [None]:
predict_commute(8)

In [None]:
predict_commute(10 + 45 / 60)

What do all of our predictions look like?

In [None]:
# Overlay the fitted line on the scatter plot of the raw data.
# A straight line is determined by two points, so the prediction is only
# evaluated at x = 5.5 and x = 11.5.
hline = px.line(
    x=[5.5, 11.5],
    y=[predict_commute(5.5), predict_commute(11.5)],
).update_traces(line={'color': 'red', 'width': 4})
fline1 = go.Figure(fig.data + hline.data)
fline1.update_xaxes(title='Home Departure Time (AM)')
# Same y-axis label as the scatter plot of the raw data.
fline1.update_yaxes(title='Minutes to Work')
# Build the equation in the title from the fitted parameters rather than
# hard-coding them, so the title always matches the computed line.
slope_sign = '-' if best_slope < 0 else '+'
fline1.update_layout(
    title=(
        '<span style="color:red">Predicted Commute Time</span> = '
        f'{best_intercept:.2f} {slope_sign} {abs(best_slope):.2f} * Departure Hour'
    ),
    width=700,
    margin={'t': 60},  # extra top margin so the title is not clipped
)