# The Whole Game

Some definitions:

| What | Formula |
| -- | -- |
| predictors | $$X=(X_1,X_2,\dots,X_p)$$ |
| response | $$Y$$ |
| error | $$\epsilon \sim \mathcal{N}(0,\sigma^2)$$ |
| truth | $$Y=f(X)+\epsilon$$ |
| estimate | $$\hat{Y}=\hat{f}(X)$$ |

Errors for the estimate:

$$\begin{eqnarray}
E[(Y-\hat{Y})^2] & = & E[(f(X)+\epsilon-\hat{f}(X))^2] \\
                 & = & \dots \\
                 & = & \underbrace{[f(X)-\hat{f}(X)]^2}_\text{reducible} + \underbrace{\text{Var}(\epsilon)}_\text{irreducible}
\end{eqnarray}$$

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old aliases, so prefer the new name when the
# installed Matplotlib provides it and fall back to the legacy one otherwise.
plt.style.use('seaborn-v0_8-white'
              if 'seaborn-v0_8-white' in plt.style.available
              else 'seaborn-white')

## Some Data

In [None]:
def some_data(n=100, noise_mean=1, noise_sd=.25):
    """Draw a random sample from the cubic toy model.

    Parameters
    ----------
    n : int
        Number of points to draw (default 100).
    noise_mean, noise_sd : float
        Mean and standard deviation of the Gaussian noise added to the
        noiseless curve.  NOTE: the default mean is 1, not 0 as in the
        definitions table at the top of the notebook; it is kept so that
        existing calls produce the same data as before.

    Returns
    -------
    x : ndarray
        Predictor values drawn uniformly from [0, 1).
    y : ndarray
        Noiseless response ``1 + x**3``.
    ys : ndarray
        Noisy response ``y + eps``.
    """
    x = np.random.rand(n)
    y = 1 + x**3
    eps = np.random.normal(noise_mean, noise_sd, len(y))
    ys = y + eps
    return x, y, ys


x, y, ys = some_data()

In [None]:
# Plot the noisy observations ys against x as translucent blue dots.
plt.plot(x,ys,'bo',alpha=0.3);

In [None]:
xx = np.arange(0, 1, 0.01)


def yp(m, x):
    """Straight-line guess with fixed intercept 1.8 and slope m, evaluated at x."""
    return 1.8 + m * x


# Noisy sample as translucent blue dots, with the slope-1 guess drawn in red.
plt.plot(x, ys, 'bo', alpha=0.3);
plt.plot(xx, yp(1, xx), 'r');

## Mean Squared Error

In [None]:
def MSE(y, yp):
    """Return the mean squared error between observations and predictions.

    Parameters
    ----------
    y : array_like
        Observed values (1-D).
    yp : array_like
        Predicted values; must be broadcastable against ``y``.

    Returns
    -------
    float
        Sum of squared residuals divided by ``len(y)``.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty.
    """
    # Convert up front so plain lists/tuples work as well as ndarrays.
    y = np.asarray(y, dtype=float)
    residual = y - np.asarray(yp, dtype=float)
    return 1 / len(y) * np.dot(residual, residual)

In [None]:
# MSE of the hand-picked line (slope 1) against the noisy sample.
print(MSE(ys, yp(1,x)))

In [None]:
# Scan a grid of candidate slopes and score each one on the sample.
g = np.arange(0.5, 1.5, 0.005)
e = []
for slope in g:
    e.append(MSE(ys, yp(slope, x)))
plt.plot(g, e);

# Keep the slope with the smallest error and report it with its MSE.
best = np.argmin(e)
m0 = g[best]
print(m0, e[best])

In [None]:
import scipy.signal as ss
import scipy.interpolate as si

In [None]:
n = 50
# Evenly spaced abscissae spanning the observed x range.
xss = np.linspace(x.min(), x.max(), n)
# Order the noisy responses by their x value.  A stable argsort keeps tied
# x values in their original order, exactly as sorted() with a key would.
yss0 = ys[np.argsort(x, kind='stable')]
# Reduce the ordered responses to n points.
# NOTE(review): scipy.signal.resample treats its input as evenly spaced
# samples, but x is drawn at random -- confirm this smoothing is intended.
yss = ss.resample(yss0, n)

In [None]:
# Fit a degree-4 smoothing spline to the resampled points and wrap the
# resulting knots/coefficients in a callable BSpline.
tck = si.splrep(xss, yss, s=0.2, k=4)
t, c, k = tck
spline = si.BSpline(*tck, extrapolate=False)

# Dense evaluation grid covering the sampled x range.
N = 100
xmin = x.min()
xmax = x.max()
xx = np.linspace(xmin, xmax, N)

# Noisy sample as translucent blue dots, spline curve in red.
plt.plot(x, ys, 'bo', alpha=0.3);
plt.plot(xx, spline(xx), 'r');

# In-sample MSE of the spline.
print(MSE(ys, spline(x)))

In [None]:
# Score the fixed line (slope m0) and the fixed spline on 1000 fresh samples.
ype = []
yse = []
for trial in range(1000):
    x2, y2, ys2 = some_data()
    linear_pred = yp(m0, x2)
    # Clip new x values into [xmin, xmax] before evaluating the spline.
    spline_pred = spline(x2.clip(xmin, xmax))
    ype.append(MSE(ys2, linear_pred))
    yse.append(MSE(ys2, spline_pred))

# Side-by-side histograms of the two error populations, then summary stats.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
sns.histplot(ype, ax=ax1);
sns.histplot(yse, ax=ax2);
pd.DataFrame({'ype': ype, 'yse': yse}).describe()

## Least Squares for reference on linear fit

In [None]:
import statsmodels.formula.api as smf
# Ordinary least squares of the noisy response on x, for comparison with
# the grid-searched slope.
df = pd.DataFrame(dict(x=x, y=ys))
mod = smf.ols('y ~ x', data=df)
res = mod.fit()
print(res.params)

## Just for fun, look at population of possible fits over a family of random samples

In [None]:
# For each of 1000 random training samples: fit a line and a spline to that
# sample, then score both on an independent test sample.
#   xb / xm   -- fitted intercepts / slopes of the linear model
#   ype / yse -- test-sample MSE of the linear model / of the spline
xb, xm, ype, yse = [], [], [], []
n = 50  # number of resampled points handed to the spline fit
for trial in range(1000):
    # Training sample.  Loop-local names are used so the notebook-level
    # x, y, ys, yp, spline and t, c, k from earlier cells are not overwritten
    # (rebinding yp to a one-argument function broke re-running those cells).
    x1, y1, ys1 = some_data()

    # Linear fit on this sample.
    fit = smf.ols(formula='y ~ x',
                  data=pd.DataFrame({'x': x1, 'y': ys1})).fit()
    xb.append(fit.params.Intercept)
    xm.append(fit.params.x)

    # Spline fit on this sample.  The spline has to be refit here: reusing
    # the knots/coefficients of the earlier cell would score the same curve
    # 1000 times instead of a population of fits.
    lo, hi = x1.min(), x1.max()
    grid = np.linspace(lo, hi, n)
    ordered = ys1[np.argsort(x1, kind='stable')]
    smoothed = ss.resample(ordered, n)
    knots, coeffs, degree = si.splrep(grid, smoothed, s=0.2, k=4)
    sample_spline = si.BSpline(knots, coeffs, degree, extrapolate=False)

    # Independent test sample; clip its x values into the fitted range
    # before evaluating the spline.
    x2, y2, ys2 = some_data()
    ype.append(MSE(ys2, xb[-1] + xm[-1] * x2))
    yse.append(MSE(ys2, sample_spline(x2.clip(lo, hi))))

In [None]:
fig, ax = plt.subplots(1, 4, figsize=(12, 3))
# One histogram per panel, in order: intercepts, slopes, linear-fit MSEs,
# spline MSEs.
for panel, values in zip(ax, (xb, xm, ype, yse)):
    sns.histplot(values, ax=panel);
pd.DataFrame({'ype': ype, 'yse': yse}).describe()

# Homework

Do problem 2.8. See how far you can get with finding a good linear model after doing some data cleaning and using the simple `smf.ols`.

# A couple of usage examples for `smf.ols`

## Least squares on other data

In [None]:
# Load the Income1 data from the local Data/ directory and show summary
# statistics for its columns.
ic = pd.read_csv('Data/Income1.csv')
ic.describe()

Figure 2.2

In [None]:
# Income against Education with seaborn's regression plot.
sns.regplot(data=ic, x='Education', y='Income');

In [None]:
# Simple linear regression of Income on Education; print the full fit report.
mod = smf.ols(formula='Income ~ Education', data=ic)
res = mod.fit()
print(res.summary())

In [None]:
ad = pd.read_csv('Data/Advertising.csv')
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4))
# One regression plot of Sales per advertising channel, left to right.
for channel, panel in zip(('TV', 'Radio', 'Newspaper'), (ax1, ax2, ax3)):
    sns.regplot(data=ad, x=channel, y='Sales', ax=panel);

In [None]:
# Multiple linear regression of Sales on all three advertising channels;
# print the full fit report.
mod = smf.ols(formula='Sales ~ TV + Radio + Newspaper', data=ad)
res = mod.fit()
print(res.summary())