In [None]:
import os
import scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [None]:
# Location of the Galton dataset, relative to the notebook's working directory.
dataset: str = 'galton.csv'
pathname: str = os.path.join('.', 'datasets', dataset)

# Fail loudly with the offending path. A bare `assert False` gives no hint of
# what went wrong and is removed entirely when Python runs with `-O`, which
# would leave `df` undefined for every later cell.
if not os.path.isfile(pathname):
    raise FileNotFoundError(f'Dataset not found: {pathname}')

df: pd.DataFrame = pd.read_csv(pathname)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# One row, two panels: histogram of 'child' on the left, 'parent' on the right.
fig, ax = plt.subplots(nrows=1, ncols=2)
df.hist(column='child', ax=ax[0])
# Binding the last call's return value to `_` keeps it out of the cell output.
_ = df.hist(column='parent', ax=ax[1])

# Minimal square error

Given a data set $x_1, \ldots, x_n$, find a $\mu$ such that 
$$
\sum_{i=1}^n (x_i - \mu)^2
$$
is minimal.

This is a simple problem in calculus. Alternatively, we can confirm numerically that the minimum is attained at
$$
\mu = \frac{1}{n}\sum_{i=1}^n x_i = \bar{x}.
$$

In [None]:
def sqerr(mu: float, X: np.ndarray) -> float:
    """Return the sum of squared deviations of the data from ``mu``.

    Parameters
    ----------
    mu : float
        Candidate centre. A length-1 array also works, because it broadcasts
        against ``X``.
    X : np.ndarray
        The observations.

    Returns
    -------
    float
        ``sum((X - mu)**2)``.
    """
    return np.sum((X - mu)**2)

# Minimise the squared error numerically, starting the search at the first
# observation. The annotation uses the public `scipy.optimize.OptimizeResult`
# name rather than the private `scipy.optimize._optimize` module path, and the
# variable is bound directly to the result instead of a placeholder `None`.
res: scipy.optimize.OptimizeResult = scipy.optimize.minimize(
    fun=sqerr,
    x0=df['child'].iloc[0],
    args=(df['child'].to_numpy(),),  # extra arguments for `fun`, as a tuple
    method='nelder-mead',
)
if res.success:
    minimum_at = res.x
    print(f'Minimum found by optimiser: {np.round(res.x[0], 6)}')
    # Double quotes inside the single-quoted f-string: reusing the enclosing
    # quote character is a SyntaxError before Python 3.12.
    print(f'Expected minimum: {np.round(df["child"].mean(), 6)}')
else:
    print('The optimiser did not converge.')

In [None]:
# Scatter of child against parent; binding to `_` keeps the return value out
# of the cell output.
_ = df.plot(
    kind='scatter',
    x='parent',
    y='child',
    title="Galton's data",
)

# Subtract mean from both variables
Let $\tilde{x}_i = x_i - \bar{x}, \tilde{y}_i = y_i - \bar{y}$. This is no more than a translation of the origin. We will now find the best slope $\beta$ that minimises
$$
\sum_{i=1}^n (\tilde{y}_i - \beta \tilde{x}_i)^2.
$$
It turns out that the minimising $\beta$ is the slope of the regression line of $y$ on $x$. This also suggests that the point $(\bar{x}, \bar{y})$ will lie on the regression line.

The variables $\tilde{x}_i$ and $\tilde{y}_i$ are called the _centred_ versions of $x_i$ and $y_i$.

In [None]:
# Centre each variable on its own sample mean (parent -> x, child -> y).
x_prime, y_prime = (
    df[column].to_numpy() - df[column].mean()
    for column in ('parent', 'child')
)

def slope(beta: float, X: np.ndarray, Y: np.ndarray) -> float:
    """Return the squared error of a line through the origin with slope ``beta``.

    Parameters
    ----------
    beta : float
        Candidate slope.
    X, Y : np.ndarray
        Predictor and response values.

    Returns
    -------
    float
        ``sum((Y - beta*X)**2)``.
    """
    residuals = Y - beta * X
    return np.sum(residuals ** 2)

# Search for the slope that minimises the squared error on the centred data,
# starting from a slope of 1.
res = scipy.optimize.minimize(
    slope,
    1,
    args=(x_prime, y_prime),
    method='nelder-mead',
)
if not res.success:
    print('The optimiser did not converge.')
else:
    minimum_at = res.x
    print(f'Minimum found by optimiser: {res.x[0]}')

In [None]:
# Least-squares fit of the centred child values on the centred parent values.
# No constant column is added, so the fitted line has no intercept term.
model = sm.OLS(y_prime, x_prime)
result = model.fit()
result.summary()

In [None]:
# Put the optimiser's slope next to the OLS estimate, both rounded to 6 places.
print(
    f'Minimum found by optimiser: {np.round(res.x[0], 6)}',
    f'Expected minimum: {np.round(result.params[0], 6)}',
    sep='\n',
)

# Basic stats about the variables

In [None]:
# Mean, variance and standard deviation of the parent (x) column.
x_bar, x_var, s_x = df['parent'].mean(), df['parent'].var(), df['parent'].std()

# The same three summaries for the child (y) column.
y_bar, y_var, s_y = df['child'].mean(), df['child'].var(), df['child'].std()

# Both results are 2x2 matrices; the off-diagonal entry [0][1] holds the
# parent/child cross term.
cov_xy = np.cov(df['parent'].to_numpy(), y=df['child'].to_numpy())
cor_xy = np.corrcoef(df['parent'].to_numpy(), y=df['child'].to_numpy())
                

In [None]:
# Labels and values are listed in the same order: x summaries, y summaries,
# then the two joint measures.
metrics = [
    'Empirical mean of x',
    'Empirical variance of x',
    'Empirical standard deviation of x',
    'Empirical mean of y',
    'Empirical variance of y',
    'Empirical standard deviation of y',
    'Empirical covariance',
    'Empirical correlation',
]
values = [
    x_bar, x_var, s_x,
    y_bar, y_var, s_y,
    cov_xy[0, 1],  # off-diagonal entry of the covariance matrix
    cor_xy[0, 1],  # off-diagonal entry of the correlation matrix
]
basic_stats = pd.DataFrame(dict(metric=metrics, value=values))
basic_stats.head(n=8)

# Theoretical estimates of regression parameters

It can be shown that if $\hat{y}_i = \hat{\beta}_0 + \hat{\beta}_1 x_i$ is the least-squares regression line, then
$$
\hat{\beta}_1 = \frac{s_y}{s_x} \rho_{xy}
$$
and $\hat{\beta}_0 = \bar{y} - \hat{\beta}_1\bar{x}$, where $\rho_{xy}$ denotes the empirical correlation between $x$ and $y$.

In [None]:
# `cor_xy` is the full 2x2 correlation matrix returned by np.corrcoef; only the
# off-diagonal entry is the parent/child correlation. Multiplying the whole
# matrix would turn both estimates into 2x2 arrays instead of scalars.
beta_1_hat_th = cor_xy[0, 1] * s_y / s_x
beta_0_hat_th = y_bar - beta_1_hat_th * x_bar

# Least-squares fit of child on parent; add_constant supplies the intercept
# column.
model = sm.OLS(df['child'], sm.add_constant(df['parent']))
result = model.fit()
result.summary()

In [None]:
# Closed-form estimates next to the fitted parameters, intercept before slope.
reg_metrics = [
    'Theoretical intercept', 'Theoretical slope',
    'Fitted intercept', 'Fitted slope',
]
reg_values = [
    beta_0_hat_th, beta_1_hat_th,
    result.params.iloc[0], result.params.iloc[1],
]
reg_stats = pd.DataFrame(dict(Metric=reg_metrics, Value=reg_values))
reg_stats.head(n=5)

As expected, the theoretical and fitted values agree.

# Scaled variables 
If we standardise the variables $x$ and $y$, then the slope of the regression line will be their correlation coefficient and its intercept will be zero. We will check that below.

In [None]:
# Standardise each column: subtract its mean, then divide by its standard
# deviation.
x_std, y_std = (
    (df[column] - centre) / scale
    for column, centre, scale in (('parent', x_bar, s_x), ('child', y_bar, s_y))
)
# Regress the standardised child on the standardised parent, with no intercept.
model = sm.OLS(y_std, x_std)
result = model.fit()
result.summary()

In [None]:
# The message is split over two adjacent f-strings; the printed text is one line.
print(
    f'Correlation coeff is {np.round(cor_xy[0][1], 4)}, '
    f'slope of regression line of standardised variables is {np.round(result.params.iloc[0], 4)}.'
)