In [None]:
import os
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

$$
\newcommand{\cor}{\text{Cor}}
\newcommand{\var}{\text{Var}}
\newcommand{\cov}{\text{Cov}}
\newcommand{\E}{\text{E}}
$$

In [None]:
# Load the diamond dataset from ./datasets (relative to the working directory).
filename = 'diamond.csv'
pathname = os.path.join('.', 'datasets', filename)
# Fail with a descriptive error instead of a bare `assert False`: the assert
# carries no message and is removed entirely when Python runs with -O, which
# would leave `df` undefined for the cells that follow.
if not os.path.isfile(pathname):
    raise FileNotFoundError(f'Dataset not found: {pathname}')
df = pd.read_csv(pathname)

# An alternative way to look at OLS

Before we proceed to the study of multiple regressions, let us cast the results of regression with one independent variable in a slightly different light. Recall that the regression model for a single independent variable is $Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$. If the fitted model gives $\hat{Y}_i = \hat{\beta}_0 + \hat{\beta}_1 X_i$ then the residual of $Y_i$ is $e_i = Y_i - \hat{Y}_i$. Further,
$$
\begin{eqnarray}
\hat{\beta}_1 &=& \cor(X, Y)\frac{S_Y}{S_X} \\
\hat{\beta}_0 &=& \bar{Y} - \hat{\beta}_1\bar{X}
\end{eqnarray}
$$
If there is no $\beta_1$, that is, if the model fits only the intercept then for all $Y_i$ we will get just $\bar{Y}$. If we rewrite the regression model as $Y_i = \beta_0 X_{0i} + \beta_1 X_{1i} + \epsilon_i$ where $X_{0i} = 1$ and $X_{1i} = X_i$ for all $i$ then we consider two possibilities:

- Without $\beta_1$, the model fits all $Y_i$ to $\bar{Y}$ leading to the residual $Y_i - \bar{Y}$.
- If, instead, we regress $X_{1i}$ on $X_{0i}$ then the estimated coefficient is $\bar{X}$ so that the residual is $X_{1i} - \bar{X}$.

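These residuals, obtained by regressing out the constant regressor (denoted $X_2$ in the subscripts and `X2` in the code below; it is the same as $X_{0i} = 1$), are written as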

- $e_{i, Y|X_2} = Y_i - \bar{Y}$ and
- $e_{i, X_1|X_2} = X_{1i} - \bar{X}$.

An equivalent estimate of $\hat{\beta}_1$ is
$$
\hat{\beta}_1 = \frac{\sum_{i=1}^n e_{i,Y|X_2} e_{i, X_1|X_2}}{\sum_{i=1}^n e_{i, X_1|X_2}^2}
$$

We show that this expression gives the correct estimate of slope by applying the ideas to the `diamond price` dataset.

In [None]:
# Response, predictor and constant regressor taken from the diamond data.
n = len(df)
Y = df['price']
X1 = df['carat']
X2 = np.ones(len(Y))

# Fit the response on the constant only (intercept-only model) and keep
# its residuals.
model_1 = sm.OLS(endog=Y, exog=X2)
result_1 = model_1.fit()
resid_1 = result_1.resid

# Fit the predictor on the constant only and keep its residuals.
model_2 = sm.OLS(endog=X1, exog=X2)
result_2 = model_2.fit()
resid_2 = result_2.resid

In [None]:
# Reference fit: regress Y on X1 with a constant column added, then show
# the estimated coefficients.
design = sm.add_constant(X1)
model = sm.OLS(Y, design)
result = model.fit()
result.params

In [None]:
# Slope estimate as the ratio of the sum of residual cross-products to the
# sum of squared predictor residuals, exactly as in the formula above.
# This avoids mixing np.cov (which normalises by n - 1 by default) with
# np.var (which normalises by n by default) and then correcting the
# mismatch with an extra (n - 1)/n factor.
beta_hat_1 = np.sum(resid_1 * resid_2) / np.sum(resid_2 ** 2)
print(np.round(beta_hat_1, 6))

# Multiple regression

In [None]:
# Load the seatbelts dataset from ./datasets (relative to the working directory).
# Note: this rebinds `df`, replacing the diamond data loaded earlier.
filename = 'seatbelts.csv'
pathname = os.path.join('.', 'datasets', filename)
# Fail with a descriptive error instead of a bare `assert False`: the assert
# carries no message and is removed entirely when Python runs with -O, which
# would leave `df` pointing at the previous dataset.
if not os.path.isfile(pathname):
    raise FileNotFoundError(f'Dataset not found: {pathname}')
df = pd.read_csv(pathname)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Full model: DriversKilled regressed on kms and PetrolPrice via the
# formula interface. `result` is reused by later cells.
model = smf.ols(formula='DriversKilled ~ kms + PetrolPrice', data=df)
result = model.fit()
result.summary()

In [None]:
# Evaluate the fitted model at the sample means of both regressors.
coef = result.params
mean_kms = df['kms'].mean()
mean_petrol = df['PetrolPrice'].mean()
n_acc = coef['Intercept'] + coef['kms'] * mean_kms + coef['PetrolPrice'] * mean_petrol
print(f'# accidents at average kms and petrol price as {n_acc}, rounded to {np.round(n_acc)}.')

In [None]:
# The outer f-string uses double quotes so that the single-quoted column
# name inside the replacement field parses on Python versions before 3.12,
# where reusing the enclosing quote character is a SyntaxError.
print(f"It should be no surprise that this is same as {df['DriversKilled'].mean()}.")

## An equivalent way to get the same results.

If we are fitting a model $Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i}$ then we can obtain $\hat{\beta}_1$ in three successive applications of OLS.

1. Fit $Y_i = \beta_{1, 0} + \beta_{1, 2} X_{2i}$. Get the residuals $e_{2i} = Y_i - \hat{Y}_i$.
2. Fit $X_{1i} = \beta_{12, 0} + \beta_{12, 1}X_{2i}$. Get the residual $e_{12, i} = X_{1i} - \hat{X}_{1i}$.
3. Regress $e_{2i}$ on $e_{12, i}$ **without intercept**, that is, fit $e_{2i} = \beta_1 e_{12, i}$. The slope of this line is the estimate $\hat{\beta}_1$.

Similarly, we can get $\hat{\beta}_2$ as
1. Fit $Y_i = \beta_{2, 0} + \beta_{2, 1} X_{1i}$. Get the residuals $e_{1i} = Y_i - \hat{Y}_i$.
2. Fit $X_{2i} = \beta_{21, 0} + \beta_{21, 1}X_{1i}$. Get the residual $e_{21, i} = X_{2i} - \hat{X}_{2i}$.
3. Regress $e_{1i}$ on $e_{21, i}$ **without intercept**, that is, fit $e_{1i} = \beta_2 e_{21, i}$. The slope of this line is the estimate $\hat{\beta}_2$.

We will illustrate it for the `seatbelts` data.

### Regression coefficient for `PetrolPrice`

Take the residual for `DriversKilled` having regressed out `kms` and an `intercept` and the residual for `PetrolPrice` having regressed out `kms` and an `intercept`. Fit a regression through the origin of the two residuals and show that it is the same coefficient obtained previously by the standard method.

In [None]:
# Regress DriversKilled on kms (with intercept) and keep the residuals.
model_dk = smf.ols(formula='DriversKilled ~ kms', data=df)
result_dk = model_dk.fit()
resid_dk = result_dk.resid
result_dk.summary()

In [None]:
# Regress PetrolPrice on kms (with intercept) and keep the residuals.
model_pk = smf.ols(formula='PetrolPrice ~ kms', data=df)
result_pk = model_pk.fit()
resid_pk = result_pk.resid
result_pk.summary()

In [None]:
# Scatter of the two kms-adjusted residual series: PetrolPrice residuals
# on the x-axis, DriversKilled residuals on the y-axis.
plt.scatter(x=resid_pk, y=resid_dk)
plt.xlabel('Residual of PetrolPrice ~ kms')
plt.ylabel('Residual of DriversKilled ~ kms')
_ = plt.title('Plot of two residuals')

In [None]:
# Regress the DriversKilled residuals on the PetrolPrice residuals.
# No constant column is added, so this fit has no intercept term.
model_pp = sm.OLS(exog=resid_pk, endog=resid_dk)
result_pp = model_pp.fit()
result_pp.summary()

In [None]:
# Compare the slope of the residual-on-residual fit with the PetrolPrice
# coefficient of the full model. The full-model coefficient is looked up
# by label rather than by position, so the check does not silently compare
# the wrong term if the order of terms in the formula changes.
slope_from_residuals = result_pp.params.iloc[0]
slope_from_full_model = result.params['PetrolPrice']
if np.abs(slope_from_residuals - slope_from_full_model) < 1e-8:
    print('The regression coefficients obtained by the two methods is the same.')
else:
    print('The regression coefficients obtained by the two methods is not the same.')

### Regression coefficient for `kms`

Take the residual for `DriversKilled` having regressed out `PetrolPrice` and an `intercept` and the residual for `kms` having regressed out `PetrolPrice` and an `intercept`. Fit a regression through the origin of the two residuals and show that it is the same coefficient obtained previously by the standard method.

In [None]:
# Regress DriversKilled on PetrolPrice (with intercept) and keep the residuals.
model_dp = smf.ols(formula='DriversKilled ~ PetrolPrice', data=df)
result_dp = model_dp.fit()
resid_dp = result_dp.resid
result_dp.summary()

In [None]:
# Regress kms on PetrolPrice (with intercept) and keep the residuals.
model_kp = smf.ols(formula='kms ~ PetrolPrice', data=df)
result_kp = model_kp.fit()
resid_kp = result_kp.resid
result_kp.summary()

In [None]:
# Scatter of the two PetrolPrice-adjusted residual series: kms residuals
# on the x-axis, DriversKilled residuals on the y-axis.
plt.scatter(x=resid_kp, y=resid_dp)
plt.xlabel('Residual of kms ~ PetrolPrice')
plt.ylabel('Residual of DriversKilled ~ PetrolPrice')
_ = plt.title('Plot of two residuals')

In [None]:
# Regress the DriversKilled residuals on the kms residuals.
# No constant column is added, so this fit has no intercept term.
model_kk = sm.OLS(exog=resid_kp, endog=resid_dp)
result_kk = model_kk.fit()
result_kk.summary()

In [None]:
# Compare the slope of the residual-on-residual fit with the kms
# coefficient of the full model. The full-model coefficient is looked up
# by label rather than by position, so the check does not silently compare
# the wrong term if the order of terms in the formula changes.
slope_from_residuals = result_kk.params.iloc[0]
slope_from_full_model = result.params['kms']
if np.abs(slope_from_residuals - slope_from_full_model) < 1e-8:
    print('The regression coefficients obtained by the two methods is the same.')
else:
    print('The regression coefficients obtained by the two methods is not the same.')