# Lecture 7 – More Simple Linear Regression

## DSC 40A, Fall 2021

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML, Math
from utils import *

In [None]:
# Run this cell to load in our dataset. Don't worry about what it's doing.
# Seed NumPy's global random number generator first, so that the random values
# drawn inside extract_years (defined below) are the same on every fresh run.
np.random.seed(25)

# Read the raw CSV, keep only the four columns this notebook uses, and drop
# every row that has a missing value in any of those columns.
salaries_raw = pd.read_csv('data/data_scientist_salaries.csv')
salaries = salaries_raw.get(['YearsCodingProf', 'Age', 'FormalEducation', 'Salary']).dropna()

def extract_years(year_str):
    """Turn a bracketed 'years' answer into a single jittered number.

    Used on the 'Age' and 'YearsCodingProf' columns of the salaries data.

    Parameters
    ----------
    year_str : str or float
        A float is returned unchanged. A string is interpreted as follows:
        - contains 'older'  -> 65
        - contains 'more'   -> 30
        - contains 'Under'  -> 18
        - otherwise the first two integers found in the string are treated as
          an inclusive range, and one integer is drawn uniformly from it.

    Returns
    -------
    float
        The number of years plus Normal(0, 1) noise rounded to 2 decimals
        (or `year_str` itself when it is already a float).

    Raises
    ------
    ValueError
        If the string matches none of the keywords and does not contain at
        least two integers.

    Notes
    -----
    Draws from NumPy's global random state, so results depend on the seed.
    """
    if isinstance(year_str, float):
        return year_str
    if 'older' in year_str:
        years = 65
    elif 'more' in year_str:
        years = 30
    elif 'Under' in year_str:
        years = 18
    else:
        # Raw string: '\d' is not a valid escape sequence in a normal string literal.
        extracted = re.findall(r'\d+', year_str)
        if len(extracted) < 2:
            # Fail loudly with context instead of printing and then crashing
            # later on unbound `lower` / `upper`.
            raise ValueError(
                f'Expected a range with two integers in {year_str!r}, found {extracted}'
            )
        lower, upper = int(extracted[0]), int(extracted[1])
        # randint excludes its upper bound, hence the + 1.
        years = np.random.randint(lower, upper + 1)
    return years + np.round(np.random.normal(0, 1), 2)

# Convert the bracketed survey answers into numbers. 'Age' is converted before
# 'YearsCodingProf' so the sequence of random draws stays the same.
salaries['Age'] = salaries['Age'].apply(extract_years)
salaries['YearsExperience'] = salaries['YearsCodingProf'].apply(extract_years)
salaries = salaries[['YearsExperience', 'Age', 'FormalEducation', 'Salary']]

# Keep only rows with a salary strictly between 1,000 and 500,000 and a
# strictly positive number of years of experience.
salary_below_cap = salaries['Salary'] < 500000
salary_above_floor = salaries['Salary'] > 1000
has_positive_experience = salaries['YearsExperience'] > 0
salaries = salaries[salary_below_cap & salary_above_floor & has_positive_experience]

In [None]:
salaries

In [None]:
px.scatter(salaries, x='YearsExperience', y='Salary', title='Salary vs. Years of Experience')

In [None]:
np.mean(salaries.get('Salary'))

In [None]:
np.median(salaries.get('Salary'))

## Correlation

$$\begin{align*} r &= \text{the average of the product of $x$ and $y$, when both are in standard units} \\ &= \frac{1}{n} \sum_{i = 1}^n \left( \frac{x_i - \bar{x}}{\sigma_x} \right) \left( \frac{y_i - \bar{y}}{\sigma_y} \right)  \end{align*}$$

In [None]:
def correlation(x, y):
    """Return the correlation coefficient r of x and y.

    r is the mean of the element-wise product of x and y after each has been
    converted to standard units (subtract the mean, divide by the standard
    deviation, using np.std's default population form).
    """
    def standard_units(values):
        arr = np.array(values)
        return (arr - np.mean(arr)) / np.std(arr)

    return np.mean(standard_units(x) * standard_units(y))

In [None]:
xs = np.array(salaries.get('YearsExperience'))
ys = np.array(salaries.get('Salary')) / 1000 # Will measure salary in 1000s

In [None]:
correlation(xs, ys)

In [None]:
# Symmetric!
correlation(ys, xs)

In [None]:
# Doesn't change if we multiply x or y by constants!
correlation(xs * 1000, ys * 545)

In [None]:
# DataFrames in pandas have a built-in correlation method.
# Select the numeric columns first: pandas 2.0+ no longer drops non-numeric
# columns (presumably 'FormalEducation' here) automatically and raises instead.
salaries.select_dtypes(include='number').corr()

## Implementing $w_0^*$ and $w_1^*$

Recall, the formulas for the optimal intercept and slope are

$$w_1^* = r \frac{\sigma_y}{\sigma_x}$$

$$w_0^* = \bar{y} - w_1^* \bar{x}$$

In [None]:
def slope(x, y):
    """Return the optimal slope w1* = r * (std of y) / (std of x)."""
    r = correlation(x, y)
    return r * np.std(y) / np.std(x)

In [None]:
def intercept(x, y):
    """Return the optimal intercept w0* = mean(y) - w1* * mean(x)."""
    w1 = slope(x, y)
    return np.mean(y) - w1 * np.mean(x)

In [None]:
# Optimal intercept and slope for predicting salary (in $1000s) from years of experience.
w0_star = intercept(xs, ys)
w1_star = slope(xs, ys)

# Just fancy printing – ignore these next few lines.
# Every LaTeX backslash is doubled so that none of them is read as an (invalid)
# Python escape sequence; the resulting string is unchanged.
rule_string = (
    '$$\\text{Predicted Salary (in \\$1000s)} = '
    + f'{int(w0_star)} + {int(w1_star)}'
    + '\\cdot \\left( \\text{Years of Experience} \\right)$$'
)
display(HTML(f'<h4>The best linear predictor, under squared loss, for this dataset is</h4><br><center>{rule_string}</center>'))

In [None]:
# Scatter plot of the data (salary in $1000s) with the fitted line drawn on top.
# The title is set on this figure, since it is the one the cell displays.
fig = go.Figure()
fig.add_trace(go.Scatter(x = xs, y = ys, mode = 'markers', name = 'actual'))
fig.add_trace(go.Scatter(x = xs, y = w0_star + w1_star * xs, name = 'linear prediction rule', line=dict(color='red')))
fig.update_layout(title = 'Salary vs. Years of Experience', xaxis_title = 'Years of Experience', yaxis_title = 'Salary ($1000s)')

Now that we have $w_0^*$ and $w_1^*$, we can use them to make predictions.

In [None]:
def predict_salary(yoe):
    """Predict salary (in $1000s) for `yoe` years of experience.

    Uses the fitted intercept `w0_star` and slope `w1_star` defined earlier
    in the notebook.
    """
    predicted = w0_star + w1_star * yoe
    return predicted

In [None]:
predict_salary(4)

In [None]:
predict_salary(20)

In [None]:
predict_salary(1000)

## Peek into the future – how do we use linear regression in practice?

In practice, most of this stuff is already implemented by various packages. The goal of discussing loss functions and empirical risk is to show you how it all works.

One of the more common packages in Python for machine learning work is `scikit-learn`, also called `sklearn`.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()

In [None]:
model.fit(xs.reshape(-1, 1), ys)

In [None]:
model.intercept_

In [None]:
model.coef_

Note that these numbers match those from our manual calculations!

In [None]:
w0_star, w1_star

Don't worry at all about how `sklearn` works – that's for DSC 80.

## What does $R_{sq}(w_0, w_1)$ look like?

Let's draw a plot of $R_{sq}(w_0, w_1)$, the empirical risk that we're trying to minimize.
- When we only had a single parameter, $h$, $R(h)$ was in 2D.
    - One axis for $h$, one axis for $R(h)$.
- Now that we have two parameters, $w_0$ and $w_1$, $R_{sq}(w_0, w_1)$ will be in 3D!
    - One axis for $w_0$, one axis for $w_1$, one axis for $R_{sq}(w_0, w_1)$.
    - The x-y plane consists of all possible combinations of slope and intercept.

In [None]:
def mean_squared_error(w):
    """Return the mean squared error of the line w[0] + w[1] * x.

    `w` is indexable with the intercept at position 0 and the slope at
    position 1. The error is computed on the notebook-level arrays `xs` and `ys`.
    """
    w0, w1 = w[0], w[1]
    predictions = w0 + w1 * xs
    residuals = ys - predictions
    return np.mean(residuals**2)

In [None]:
show_mse(mean_squared_error, [w0_star, w1_star], show_min=True)

## Aside: pitfalls of correlation

In [None]:
anscombe = pd.read_csv('data/anscombe.csv')

In [None]:
anscombe.head()

In [None]:
plt.figure(figsize=(12, 10))

# Draw one scatter plot per dataset, laid out in a 2x2 grid.
for panel, name in enumerate(['I', 'II', 'III', 'IV'], start=1):
    subset = anscombe[anscombe.get('dataset') == name]
    
    plt.subplot(2, 2, panel)
    plt.scatter(subset['x'], subset['y'], label=f'Dataset {name}', alpha=0.65, s=65)
    plt.title(f'Dataset {name}')

What do all four of these datasets have in common?

In [None]:
# Show the mean and standard deviation of x and y, and the correlation r,
# for each dataset (all rounded to two decimal places).
for n in ['I', 'II', 'III', 'IV']:
    rows = anscombe[anscombe.get('dataset') == n]
    x = rows['x']
    y = rows['y']
    
    r = correlation(x, y)
    outstr = f'''
    <b>Dataset {n}</b><br>
    $\\bar x$: {np.round(np.mean(x), 2)}<br>
    $\\bar y$: {np.round(np.mean(y), 2)}<br>
    $\\sigma_x$: {np.round(np.std(x), 2)}<br>
    $\\sigma_y$: {np.round(np.std(y), 2)}<br>
    $r$: {np.round(r, 2)}
    '''
    display(HTML(outstr))

They all share the same mean and standard deviation of $x$ and $y$, and the same correlation coefficient $r$ (at least to two decimal places)! This means they all have essentially the same best linear prediction rule, under squared loss.

However, that linear prediction rule looks better for some datasets than it does for others:

In [None]:
plt.figure(figsize=(12, 10))

# Same 2x2 grid as before, now with each dataset's least-squares line in red.
for panel, name in enumerate(['I', 'II', 'III', 'IV'], start=1):
    subset = anscombe[anscombe.get('dataset') == name]
    x_vals = subset['x']
    y_vals = subset['y']
    
    # Fit the best line for this dataset alone.
    w0_ans = intercept(x_vals, y_vals)
    w1_ans = slope(x_vals, y_vals)
    
    plt.subplot(2, 2, panel)
    plt.scatter(x_vals, y_vals, label=f'Dataset {name}', alpha=0.65, s=65)
    plt.plot(x_vals, w0_ans + w1_ans * x_vals, color='red')
    plt.title(f'Dataset {name}')

Moral of the story – visualize your data before trying to fit a prediction rule!

If that was interesting, [check out this article](https://www.autodesk.com/research/publications/same-stats-different-graphs).