In [None]:
from typing import List
from typing import Tuple
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf

from tqdm import tqdm

# Global plot appearance: enlarge fonts, then switch to a white grid with dashed grid lines.
sns.set_theme(font_scale=1.5)
sns.set_style(style="whitegrid", rc={"grid.linestyle": "--"})

## Reading data

In [None]:
# Location of the auto-mpg CSV that every later cell works from.
DATA_URL = "https://raw.githubusercontent.com/changyaochen/MECE4520/master/data/auto_mpg.csv"

data = pd.read_csv(DATA_URL)
data.head()

## Exploratory Data Analysis (EDA)

In [None]:
# Histogram of the response variable (mpg).
sns.histplot(data=data, x="mpg")
plt.tight_layout()

In [None]:
# Scatter each continuous predictor against mpg, one figure per predictor.
continuous_variables = ["displacement", "horsepower", "weight", "acceleration"]

for column in continuous_variables:
    plt.figure()
    sns.scatterplot(data=data, x=column, y="mpg")
    plt.tight_layout()

## Simple linear regression

In [None]:
# Predictor (weight) and response (mpg) used by the simple regression below.
x, y = data["weight"], data["mpg"]

def simple_linear_regression(
    x: Union[List, np.ndarray, pd.Series], 
    y: Union[List, np.ndarray, pd.Series]) -> Tuple[float, float]:
    """Fit y = beta_0 + beta_1 * x by ordinary least squares.

    The slope is the sample covariance of x and y divided by the sample
    variance of x; the intercept follows from the means of x and y.

    Args:
        x: Values of the single predictor.
        y: Values of the response, paired element-wise with ``x``.

    Returns:
        A tuple ``(intercept, slope)``.
    """
    covariance_xy = np.cov(x, y)[0, 1]
    # The off-diagonal entry of cov(x, x) is the sample variance of x.
    variance_x = np.cov(x, x)[0, 1]

    slope = covariance_xy / variance_x
    intercept = np.mean(y) - slope * np.mean(x)
    return intercept, slope

# Fit mpg against weight on the full data set.
beta_0, beta_1 = simple_linear_regression(x=x, y=y)

# R^2 = 1 - SSE / SST, where SSE is the sum of squared residuals and
# SST is the sum of squared deviations of y from its mean.
y_pred = beta_0 + beta_1 * x
residual = y - y_pred
SSE = np.sum(np.square(residual))
SST = np.sum(np.square(y - np.mean(y)))
r2 = 1 - SSE / SST

print(f"beta_0 is: {beta_0:5.4f}")
print(f"beta_1 is: {beta_1:5.4f}")
print(f"R-square is: {r2:5.4f}")

# Overlay the fitted line (red) on the raw scatter, over the observed range of x.
x_range = np.linspace(start=np.min(x), stop=np.max(x), num=100)
fitted_line = beta_0 + beta_1 * x_range

plt.figure()
sns.scatterplot(data=data, x="weight", y="mpg")
sns.lineplot(x=x_range, y=fitted_line, color="red")
plt.tight_layout()

In [None]:
# Residual analysis: histogram of the residuals from the fitted line.
plt.figure()
sns.histplot(data=residual)
plt.tight_layout()
plt.show()

In [None]:
# Accuracy of the coefficients: refit the line on many random subsamples
# (each row kept with probability `sampling_proba`) and look at the spread
# of the resulting intercepts and slopes.
np.random.seed(42)
n_trials = 100
sampling_proba = 0.2  # loop-invariant, so set once instead of on every iteration

beta_0s, beta_1s = [], []
for _ in tqdm(range(n_trials)):
    mask = np.random.choice([True, False], size=len(x), p=[sampling_proba, 1 - sampling_proba])
    x_sampled, y_sampled = x[mask], y[mask]
    # Use trial-specific names: assigning to `beta_0` / `beta_1` here would
    # silently replace the full-sample estimates with those of the last subsample.
    beta_0_trial, beta_1_trial = simple_linear_regression(x=x_sampled, y=y_sampled)
    beta_0s.append(beta_0_trial)
    beta_1s.append(beta_1_trial)

# plot the histograms (left: intercepts, right: slopes)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 5))
sns.histplot(beta_0s, ax=axes[0])
sns.histplot(beta_1s, ax=axes[1])
plt.tight_layout()

# plot the fitted lines, one faint red line per subsample, under the raw scatter
plt.figure()
x_range = np.linspace(start=np.min(x), stop=np.max(x), num=100)
for intercept_trial, slope_trial in tqdm(zip(beta_0s, beta_1s), total=len(beta_0s)):
    sns.lineplot(x=x_range, y=(intercept_trial + slope_trial * x_range), color="red", alpha=0.1)
sns.scatterplot(x="weight", y="mpg", data=data)
plt.tight_layout()

In [None]:
# confidence intervals: standard errors of the intercept and the slope
#
#   SE(beta_0)^2 = sigma^2 * (1 / n + mean(x)^2 / Sxx)
#   SE(beta_1)^2 = sigma^2 / Sxx
#
# Both must use the SAME residual-variance estimate. Two coefficients were
# estimated, so the divisor is n - 2 (ddof=2); the default ddof=0 would
# understate the standard error of the slope.
n_obs = len(x)
sxx = np.sum((x - np.mean(x))**2)  # sum of squared deviations of x from its mean
sigma2_hat = np.var(residual, ddof=2)

SE_beta_0 = (sigma2_hat * (1. / n_obs + (np.mean(x))**2 / sxx))**0.5
SE_beta_1 = (sigma2_hat / sxx)**0.5

print(f"The standard error for beta_0 is: {SE_beta_0:5.4f}")
print(f"The standard error for beta_1 is: {SE_beta_1:5.4f}")

In [None]:
# Same simple regression (mpg on weight), this time fitted with `statsmodels`.
model_1 = smf.ols(data=data, formula="mpg ~ weight")
result_1 = model_1.fit()
print(result_1.summary())

In [None]:
# linear regression in matrix form
# Design matrix: a column of ones (intercept) next to the predictor column.
X = np.hstack(
    (np.ones(shape=(len(x), 1)), 
     x.to_numpy().reshape(-1, 1)))

# (X^T X)^{-1} is needed for both the estimate and its covariance, so invert once.
XtX_inv = np.linalg.inv(X.T @ X)

# point estimate: beta = (X^T X)^{-1} X^T y
beta_matrix = XtX_inv @ X.T @ y
print("The estimates for beta are:")
print(beta_matrix)

# variance: Cov(beta) = sigma^2 (X^T X)^{-1}, with sigma^2 estimated using n - 2
# degrees of freedom. The standard errors are the square roots of the DIAGONAL
# only; the off-diagonal covariances can be negative, so an element-wise sqrt of
# the whole matrix would produce NaN.
cov_beta = np.var(residual, ddof=2) * XtX_inv
se_matrix = np.sqrt(np.diag(cov_beta))
print("\nThe standard error for beta are:")
print(se_matrix)

## Multiple linear regression

In [None]:
# Regress mpg on all four continuous predictors at once.
model_2 = smf.ols(
    data=data,
    formula="mpg ~ weight + displacement + horsepower + acceleration",
)
result_2 = model_2.fit()
print(result_2.summary())

In [None]:
# Pairwise correlation matrix of the continuous predictors.
data.loc[:, continuous_variables].corr()

In [None]:
# `C(origin)` tells the formula interface to treat `origin` as categorical,
# so it enters the model as indicator columns rather than as a number.
model_3 = smf.ols(data=data, formula="mpg ~ weight + C(origin)")
result_3 = model_3.fit()
print(result_3.summary())