In [1]:
from scratch.multiple_regression import inputs
from scratch.linear_algebra import dot, Vector

def predict(x: Vector, beta: Vector) -> float:
    """
    Model prediction for a single observation: dot(x, beta).

    Assumes that the first element of x is 1, so that beta[0]
    plays the role of the constant term.
    """
    prediction = dot(x, beta)
    return prediction

def error(x: Vector, y: float, beta: Vector) -> float:
    """
    Signed residual for one observation: the prediction minus the actual y.
    """
    predicted = predict(x, beta)
    return predicted - y

def squared_error(x: Vector, y: float, beta: Vector) -> float:
    """
    Square of the residual error(x, y, beta) for one observation.
    """
    residual = error(x, y, beta)
    return residual ** 2

# Worked example: one observation (x, y) and a candidate beta.
x, y, beta = [1, 2, 3], 30, [4, 4, 4]

# 1*4 + 2*4 + 3*4 = 24, so the residual is 24 - 30 = -6 and its square is 36.
assert predict(x=x, beta=beta) == 24
assert error(x=x, y=y, beta=beta) == -6
assert squared_error(x=x, y=y, beta=beta) == 36

def sqerror_gradient(x: Vector, y: float, beta: Vector) -> Vector:
    """
    Gradient of the squared error with respect to beta for one observation.

    Component i of the result is 2 * error(x, y, beta) * x[i].
    """
    residual = error(x, y, beta)
    gradient = []
    for component in x:
        gradient.append(2 * residual * component)
    return gradient

# With x = [1, 2, 3] and a residual of -6, each component is 2 * -6 * x_i.
assert sqerror_gradient(x=x, y=y, beta=beta) == [-12, -24, -36]

import random
import tqdm
from scratch.linear_algebra import vector_mean
from scratch.gradient_descent import gradient_step


def least_squares_fit(xs: list[Vector],
                      ys: list[float],
                      learning_rate: float = 0.001,
                      num_steps: int = 1000,
                      batch_size: int = 1) -> Vector:
    """
    Finds the beta that minimises the sum of squared errors,
    assuming the model y = dot(x, beta).

    Runs minibatch gradient descent: num_steps passes over the data,
    each pass walking through (xs, ys) in consecutive slices of
    batch_size observations and taking one step per slice.
    Returns the final beta.
    """
    # random starting point, one coefficient per element of an observation
    beta_estimate = [random.random() for _ in xs[0]]

    for _ in tqdm.trange(num_steps, desc='least squares fit'):
        for lo in range(0, len(xs), batch_size):
            hi = lo + batch_size

            # average the per-observation gradients over this batch
            per_point_gradients = [
                sqerror_gradient(x_i, y_i, beta_estimate)
                for x_i, y_i in zip(xs[lo:hi], ys[lo:hi])
            ]
            mean_gradient = vector_mean(per_point_gradients)

            # negated step size: presumably gradient_step moves along
            # step_size * gradient -- confirm in scratch.gradient_descent
            beta_estimate = gradient_step(beta_estimate, mean_gradient, -learning_rate)

    return beta_estimate

from scratch.simple_linear_regression import total_sum_of_squares

def multiple_r_squared(xs: list[Vector], ys: Vector, beta: Vector) -> float:
    """
    R-squared of the model beta on (xs, ys): one minus the ratio of the
    sum of squared errors to total_sum_of_squares(ys).
    """
    squared_residuals = (error(x_i, y_i, beta) ** 2
                         for x_i, y_i in zip(xs, ys))
    unexplained = sum(squared_residuals)
    total = total_sum_of_squares(ys)
    return 1.0 - unexplained / total

from typing import TypeVar, Callable

# generic type of a single data point in the data set being bootstrapped
X = TypeVar('X')
# generic type of the statistic computed from one bootstrap sample
Stat = TypeVar('Stat')


# Using the bootstrap to estimate the coefficients

def bootstrap_sample(data: list[X]) -> list[X]:
    """
    Draws len(data) elements from data at random, with replacement,
    and returns them as a new list.
    """
    resampled = []
    for _ in range(len(data)):
        resampled.append(random.choice(data))
    return resampled

def bootstrap_statistic(data: list[X],
                        stats_fn: Callable[[list[X]], Stat],
                        num_samples: int) -> list[Stat]:
    """
    Evaluates stats_fn on num_samples bootstrap samples from data.

    data:        the observations to resample from
    stats_fn:    computes a statistic from one resampled data set
    num_samples: how many bootstrap samples to draw

    Returns a list with one stats_fn result per bootstrap sample.
    """
    return [stats_fn(bootstrap_sample(data))
            for _ in range(num_samples)]


# example: 101 points, all close to 100
close_to_100 = [99.5 + random.random() for _ in range(101)]

# 101 points: 1 close to 100, 50 close to 0, 50 close to 200
far_from_100 = ([99.5 + random.random()] +
                [random.random() for _ in range(50)] +
                [200 + random.random() for _ in range(50)]
               )



from scratch.statistics import standard_deviation, median

# bootstrap the median of each data set 100 times
medians_close = bootstrap_statistic(data=close_to_100,
                                    stats_fn=median,
                                    num_samples=100)
medians_far = bootstrap_statistic(data=far_from_100,
                                  stats_fn=median,
                                  num_samples=100)

# the bootstrapped median is stable for the tightly clustered data
# and varies widely for the data split between 0 and 200
assert standard_deviation(medians_close) < 1
assert standard_deviation(medians_far) > 90

# estimate the betas with bootstrap
# then estimate their standard errors

def estimate_sample_beta(pairs: list[tuple[Vector, float]]):
    """
    Fits beta by least squares on one sample of (x, y) pairs.

    Prints the fitted beta and returns it.
    """
    x_sample, y_sample = [], []
    for x_i, y_i in pairs:
        x_sample.append(x_i)
        y_sample.append(y_i)

    fitted_beta = least_squares_fit(xs=x_sample,
                                    ys=y_sample,
                                    num_steps=5000,
                                    batch_size=25)
    print("bootstrap sample", fitted_beta)
    return fitted_beta

# seed the RNG so that subsequent random draws are reproducible
random.seed(a=0)

from scratch.statistics import daily_minutes_good

# bootstrap_betas = bootstrap_statistic(list(zip(inputs, daily_minutes_good)),
#                                       estimate_sample_beta,
#                                       100)

# bootstrap_standard_errors = [
#     standard_deviation([beta[i] for beta in bootstrap_betas])
#     for i in range(4)
# ]
# print(bootstrap_standard_errors)

<Figure size 432x288 with 0 Axes>

In [2]:
from scratch.probability import normal_cdf


# To test whether beta_j is equal to 0,
# we can compute a p-value from beta_hat_j and sigma_hat_j:
def p_value(beta_hat_j: float, sigma_hat_j: float) -> float:
    """
    Two-sided p-value for the ratio beta_hat_j / sigma_hat_j,
    computed with normal_cdf.
    """
    z_score = beta_hat_j / sigma_hat_j

    if not beta_hat_j > 0:
        # non-positive coefficient: twice the probability of
        # seeing a *smaller* value
        return 2 * normal_cdf(z_score)

    # positive coefficient: twice the probability of
    # seeing an even *larger* value
    return 2 * (1 - normal_cdf(z_score))

# p-values for (coefficient, sigma) pairs; only the phd term
# fails to come out below 0.001
assert p_value(beta_hat_j=30.58, sigma_hat_j=1.27) < 0.001    # constant term
assert p_value(beta_hat_j=0.972, sigma_hat_j=0.103) < 0.001   # num_friends
assert p_value(beta_hat_j=-1.865, sigma_hat_j=0.155) < 0.001  # work_hours
assert p_value(beta_hat_j=0.923, sigma_hat_j=1.249) > 0.4     # phd


# step size used for the fit in the next cell
learning_rate = 0.001
    

In [4]:
# reproducible fit of the full data set
random.seed(0)
beta = least_squares_fit(xs=inputs,
                         ys=daily_minutes_good,
                         learning_rate=learning_rate,
                         num_steps=5000,
                         batch_size=25)

# each coefficient should land in a narrow band
assert 30.50 < beta[0] < 30.70  # constant
assert 0.96 < beta[1] < 1.00    # num friends
assert -1.89 < beta[2] < -1.85  # work hours per day
assert 0.91 < beta[3] < 0.94    # has PhD

least squares fit: 100%|█| 5000/5000 [00:06<00:00, 801.70i


### Regularization

In [5]:
# Regularization is an approach in which we add to the error term a penalty
# that gets larger as beta gets larger. We then minimize the combined error and penalty.

In [9]:
def ridge_penalty(beta: Vector, alpha: float) -> float:
    """
    Ridge penalty: alpha times the dot product of beta[1:] with itself.

    The first coefficient, beta[0], is left out of the penalty.
    Alpha plays the role usually written as lambda.
    """
    penalized = beta[1:]
    return alpha * dot(penalized, penalized)


def squared_error_ridge(x: Vector,
                        y: float,
                        beta: Vector,
                        alpha: float) -> float:
    """
    Squared error for one observation plus the ridge penalty on beta.

    x, y:  a single observation and its target value
    beta:  the coefficient vector being evaluated
    alpha: the penalty weight passed to ridge_penalty
    """
    # The original called the undefined name `ridge_penalty_penalty`,
    # which raised NameError on every call.
    return error(x, y, beta) ** 2 + ridge_penalty(beta, alpha)

# we can then plug this into gradient descent in the usual way

from scratch.linear_algebra import add

def ridge_penalty_gradient(beta: Vector, alpha: float) -> Vector:
    """
    Gradient of just the ridge penalty.

    The first component is always 0. because beta[0] is not penalized;
    every other component is 2 * alpha * beta_j.
    """
    gradient = [0.]
    for coefficient in beta[1:]:
        gradient.append(2 * alpha * coefficient)
    return gradient

def sqerror_ridge_gradient(x: Vector,
                           y: float,
                           beta: Vector,
                           alpha: float) -> Vector:
    """
    Gradient of one observation's squared error term
    with the ridge penalty gradient added to it.
    """
    data_term = sqerror_gradient(x, y, beta)
    penalty_term = ridge_penalty_gradient(beta, alpha)
    return add(data_term, penalty_term)

# Note: data is usually rescaled prior to using a regularization technique
    
def least_squares_fit_ridge(xs: list[Vector],
                      ys: list[float],
                      learning_rate: float = 0.001,
                      alpha: float = 0,
                      num_steps: int = 1000,
                      batch_size: int = 1) -> Vector:
    """
    Finds beta by minibatch gradient descent on the squared errors
    plus a ridge penalty weighted by alpha, assuming the model
    y = dot(x, beta).

    Makes num_steps passes over the data, each pass walking through
    (xs, ys) in consecutive slices of batch_size observations and
    taking one step per slice. Returns the final beta.
    """
    # random starting point, one coefficient per element of an observation
    beta_estimate = [random.random() for _ in xs[0]]

    for _ in tqdm.trange(num_steps, desc='least squares ridge fit'):
        for lo in range(0, len(xs), batch_size):
            hi = lo + batch_size

            # average the penalized per-observation gradients over this batch
            per_point_gradients = [
                sqerror_ridge_gradient(x_i, y_i, beta_estimate, alpha)
                for x_i, y_i in zip(xs[lo:hi], ys[lo:hi])
            ]
            mean_gradient = vector_mean(per_point_gradients)

            # negated step size: presumably gradient_step moves along
            # step_size * gradient -- confirm in scratch.gradient_descent
            beta_estimate = gradient_step(beta_estimate, mean_gradient, -learning_rate)

    return beta_estimate

random.seed(0)

# alpha = 0: no penalty
beta_0 = least_squares_fit_ridge(xs=inputs,
                                 ys=daily_minutes_good,
                                 alpha=0,
                                 num_steps=5000,
                                 batch_size=25)

assert 5 < dot(beta_0[1:], beta_0[1:]) < 6
assert 0.67 < multiple_r_squared(xs=inputs, ys=daily_minutes_good, beta=beta_0) < 0.69

# As we increase alpha the goodness of fit gets worse,
# but the size of beta gets smaller.

beta_0_1 = least_squares_fit_ridge(xs=inputs,
                                   ys=daily_minutes_good,
                                   alpha=0.1,
                                   num_steps=5000,
                                   batch_size=25)

assert 4 < dot(beta_0_1[1:], beta_0_1[1:]) < 5
assert 0.67 < multiple_r_squared(xs=inputs, ys=daily_minutes_good, beta=beta_0_1) < 0.69

beta_10 = least_squares_fit_ridge(xs=inputs,
                                  ys=daily_minutes_good,
                                  alpha=10,
                                  num_steps=5000,
                                  batch_size=25)
# The phd coefficient was the least significant one;
# with alpha=10 it shrinks to nearly 0.
assert 1 < dot(beta_10[1:], beta_10[1:]) < 2
assert 0.5 < multiple_r_squared(xs=inputs, ys=daily_minutes_good, beta=beta_10) < 0.6

least squares ridge fit: 100%|█| 5000/5000 [00:07<00:00, 6
least squares ridge fit: 100%|█| 5000/5000 [00:07<00:00, 6
least squares ridge fit: 100%|█| 5000/5000 [00:07<00:00, 6


In [10]:
# show the coefficients fitted with alpha=0 (no ridge penalty)
beta_0

[30.514795945185586, 0.9748274277323267, -1.8506912934343662, 0.91407780744768]

In [11]:
# show the coefficients fitted with alpha=0.1
beta_0_1

[30.80152599845916, 0.9507225777158704, -1.833142990416332, 0.5384447644638315]

In [12]:
# show the coefficients fitted with alpha=10
beta_10

[28.30708308025664,
 0.6726275942984854,
 -0.9045499907700505,
 -0.0052131931011540865]