In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from scipy.stats import t

# Global plot theme: enlarged fonts, white background grid, dashed grid lines.
sns.set(font_scale=1.5)
sns.set_style(style="whitegrid", rc={"grid.linestyle": "--"})

## Automobile MPG data

In [None]:
# Download the automobile MPG data set and preview the first rows.
AUTO_MPG_URL = (
    "https://raw.githubusercontent.com/changyaochen/MECE4520/master/data/auto_mpg.csv"
)
data = pd.read_csv(AUTO_MPG_URL)
data.head()

In [None]:
# Response as a 2-D numpy column (selected with a list of one column name),
# so it lines up with the matrix products used below.
Y = data.loc[:, ["mpg"]].to_numpy()
print(Y.shape)
Y[:5]

### Simple linear regression

In [None]:
# Design matrix: a leading column of ones (the intercept term) followed by
# the `weight` column.
n_obs = data.shape[0]
intercept = np.ones(shape=(n_obs, 1))
weight = data[["weight"]].values
X = np.hstack((intercept, weight))
print(X.shape)
X[:5]

In [None]:
# Calculates coefficients
# Ordinary least squares through the normal equations (X'X) betas = X'Y.
# The linear system is solved directly rather than via an explicit matrix
# inverse: the estimates are the same, with less round-off error and less work.
betas = np.linalg.solve(X.T @ X, X.T @ Y)
betas

In [None]:
# Calculates the standard errors of the coefficients
Y_hat = X @ betas
residual = Y - Y_hat

# Residual variance; ddof is the number of fitted coefficients, so the
# divisor is n - p.
var = np.var(residual, ddof=X.shape[1])

# The standard errors are the square roots of the DIAGONAL of the coefficient
# covariance matrix var * (X'X)^-1. The off-diagonal entries are covariances
# and can be negative, so an element-wise sqrt of the full matrix produces NaN.
cov_betas = var * np.linalg.inv(X.T @ X)
se = np.sqrt(np.diag(cov_betas))
se

In [None]:
# R2 as the ratio of the explained sum of squares to the total sum of squares.
y_bar = np.mean(Y)
ss_explained = np.power(Y_hat - y_bar, 2).sum()
ss_total = np.power(Y - y_bar, 2).sum()
r2 = ss_explained / ss_total
r2

In [None]:
# Confidence interval for the mean response at a new predictor value
x_new = 4000

# Point prediction from the fitted line; betas is a column, so take the scalar.
y_hat = (betas[0] + betas[1] * x_new)[0]
print(y_hat)

# Half-width scale of the interval: residual standard deviation times the
# leverage-type term 1/n + (x_new - x_bar)^2 / Sxx.
x_obs = X[:, 1]
x_bar = np.mean(x_obs)
delta = np.sqrt(var) * np.sqrt(1 / X.shape[0] + (x_new - x_bar)**2 / np.sum((x_obs - x_bar)**2))

# The residual variance is estimated from the data, so the interval uses the
# Student-t quantile with n - p degrees of freedom rather than the normal
# approximation 1.96.
multiplier = t.ppf(q=0.975, df=X.shape[0] - X.shape[1])

print(f"The lower bound of the 95% CI is: {y_hat - multiplier * delta:5.3f}")
print(f"The upper bound of the 95% CI is: {y_hat + multiplier * delta:5.3f}")

In [None]:
# Prediction interval: same form as the confidence interval, with an extra
# "1 +" under the square root.
x_obs = X[:, 1]
x_bar = np.mean(x_obs)
leverage = (x_new - x_bar)**2 / np.sum((x_obs - x_bar)**2)
delta = np.sqrt(var) * np.sqrt(1 + 1 / X.shape[0] + leverage)

pi_lower = y_hat - multiplier * delta
pi_upper = y_hat + multiplier * delta
print(f"The lower bound of the 95% PI is: {pi_lower:5.3f}")
print(f"The upper bound of the 95% PI is: {pi_upper:5.3f}")

### Multiple linear regression

In [None]:
# Design matrix for two predictors: intercept column of ones, then the
# `weight` and `acceleration` columns.
predictors = data[["weight", "acceleration"]].values
intercept = np.ones((predictors.shape[0], 1))
X = np.hstack((intercept, predictors))
print(X.shape)
X[:5]

In [None]:
# Calculates coefficients
# Ordinary least squares through the normal equations (X'X) betas = X'Y.
# The linear system is solved directly rather than via an explicit matrix
# inverse: the estimates are the same, with less round-off error and less work.
betas = np.linalg.solve(X.T @ X, X.T @ Y)
betas

In [None]:
# Calculates the standard errors of the coefficients
Y_hat = X @ betas
residual = Y - Y_hat

# Residual variance; ddof is the number of fitted coefficients, so the
# divisor is n - p.
var = np.var(residual, ddof=X.shape[1])

# The standard errors are the square roots of the DIAGONAL of the coefficient
# covariance matrix var * (X'X)^-1. The off-diagonal entries are covariances
# and can be negative, so an element-wise sqrt of the full matrix produces NaN.
cov_betas = var * np.linalg.inv(X.T @ X)
se = np.sqrt(np.diag(cov_betas))
se

In [None]:
# R2 as the ratio of the explained sum of squares to the total sum of squares.
y_bar = np.mean(Y)
ss_explained = np.power(Y_hat - y_bar, 2).sum()
ss_total = np.power(Y - y_bar, 2).sum()
r2 = ss_explained / ss_total
r2

In [None]:
# Same two-predictor regression, fitted through the `statsmodels` formula API.
formula_1 = "mpg ~ weight + acceleration"
model_1 = smf.ols(formula=formula_1, data=data)
result_1 = model_1.fit()
summary_1 = result_1.summary()
print(summary_1)

In [None]:
# Categorical predictor: C(...) marks `origin` as categorical in the formula,
# so it enters the model as indicator columns rather than as a number.
formula_2 = "mpg ~ weight + C(origin)"
model_2 = smf.ols(formula=formula_2, data=data)
result_2 = model_2.fit()
summary_2 = result_2.summary()
print(summary_2)

In [None]:
# Multicollinearity demonstration: regress mpg on four predictors at once.
formula_3 = "mpg ~ weight + displacement + horsepower + acceleration"
model_3 = smf.ols(formula=formula_3, data=data)
result_3 = model_3.fit()
summary_3 = result_3.summary()
print(summary_3)

## Ads spend data

In [None]:
# Download the advertising-spend data set and preview the first rows.
ADS_SPEND_URL = (
    "https://raw.githubusercontent.com/changyaochen/MECE4520/master/data/ads_spend.csv"
)
ads_data = pd.read_csv(ADS_SPEND_URL)
ads_data.head()

In [None]:
# Regress revenue on the spend in each of the three advertising channels.
ads_formula = "revenue ~ Facebook + Google + TV"
ads_model = smf.ols(formula=ads_formula, data=ads_data)
ads_result = ads_model.fit()
ads_summary = ads_result.summary()
print(ads_summary)

In [None]:
# Line plot of the three spend columns against the row index.
plot_columns = ["TV", "Facebook", "Google"]
ads_data[plot_columns].plot()

In [None]:
# Pairwise correlation matrix of the three spend columns.
corr_columns = ["TV", "Facebook", "Google"]
ads_data[corr_columns].corr()