In [None]:
%matplotlib notebook

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf

# Global plot styling: apply the seaborn theme with enlarged fonts, then
# switch to the white-grid style with dashed grid lines.
sns.set_theme(font_scale=1.5)
sns.set_style(style="whitegrid", rc={"grid.linestyle": "--"})

In [None]:
# Read the CSV that sits next to this notebook into a DataFrame.
data = pd.read_csv(filepath_or_buffer="./auto_mpg.csv")

# Preview the first rows to check that the file was parsed as expected.
data.head()

## Exploratory Data Analysis (EDA)

In [None]:
# distribution of the dependent variable
# Open a fresh figure first, as the other plotting cells in this notebook do.
# Without it the histogram is drawn into whichever figure is currently active,
# so re-running this cell after a later plot would overlay that plot.
plt.figure()
sns.histplot(x="mpg", data=data)
plt.tight_layout()

In [None]:
# Columns plotted against mpg in the scatter plots below.
continuous_variables = ["displacement", "horsepower", "weight", "acceleration"]

# One new figure per column so the scatter plots do not share axes.
for variable in continuous_variables:
    plt.figure()
    sns.scatterplot(data=data, x=variable, y="mpg")
    plt.tight_layout()

## Simple linear regression

In [None]:
# Fit mpg ~ weight by hand with the closed-form least-squares estimates,
# report the coefficients and R^2, and plot the fitted line over the data.
y = data["mpg"]
x = data["weight"]

# calculating the coefficients
# np.cov(x, y) returns the 2x2 sample covariance matrix
# [[var(x), cov(x, y)], [cov(x, y), var(y)]], so one call supplies both the
# numerator and the denominator of the slope estimate.
cov_xy = np.cov(x, y)
beta_1 = cov_xy[0, 1] / cov_xy[0, 0]
beta_0 = np.mean(y) - beta_1 * np.mean(x)

# calculate R^2
y_pred = beta_0 + beta_1 * x
SST = np.sum(np.square(y - np.mean(y)))  # total sum of squares around the mean
SSE = np.sum(np.square(y - y_pred))      # sum of squared residuals of the fit
r2 = 1 - SSE / SST

print(f"beta_0 is: {beta_0:5.3f}")
print(f"beta_1 is: {beta_1:5.3f}")
print(f"R-square is: {r2:5.3f}")

plt.figure()
# Evaluate the fitted line on an evenly spaced grid spanning the observed weights.
x_range = np.linspace(start=np.min(x), stop=np.max(x), num=100)
sns.scatterplot(x="weight", y="mpg", data=data)
# estimator=None draws the points as given; the default mean/confidence-interval
# aggregation is meaningless for a deterministic fitted line.
sns.lineplot(
    x=x_range,
    y=(beta_0 + beta_1 * x_range),
    color="red",
    estimator=None,
)
plt.tight_layout()

In [None]:
# Fit mpg ~ weight through the statsmodels formula API and print the
# regression summary table.
model = smf.ols("mpg ~ weight", data=data)
result = model.fit()
summary_table = result.summary()
print(summary_table)