# Learning Regressions with Maximum Likelihood Estimation (MLE)

In [1]:
import time

import numpy as np
import pandas as pd

## Make fake data

In [2]:
N = 100
beta = np.array([3.0, -5.0, 1.0])
p = len(beta)
beta = beta.reshape((p, 1))
sigma_squared = 5.0
theta_truth = tuple(float(b) for b in beta) + (sigma_squared,)

In [3]:
class FakeData(object):
    def __init__(self, N, p, beta, sigma_squared, rng=None):
        if not rng: 
            rng = np.random.RandomState()
        X = rng.uniform(size=N * p).reshape((N, p))
        error = rng.normal(loc=0, scale=sigma_squared ** 0.5, size=N).reshape((N, 1))
        X_times_beta = X @ beta
        assert X_times_beta.shape == (N, 1), X_times_beta.shape
        y = X_times_beta + error
        assert y.shape == (N, 1), y.shape
        self.X = X
        self.y = y
        return


In [4]:
# make y, X
rng = np.random.RandomState(seed=0)
fake_data = FakeData(N, p, beta, sigma_squared, rng=rng)
y, X = fake_data.y, fake_data.X
y.shape, X.shape

((100, 1), (100, 3))

# Run optimizations

In [5]:
columns = tuple(f"β[{p_}]" for p_ in range(p)) + ("σ²",)
results = pd.DataFrame(columns=columns)
results.loc['Truth'] = theta_truth
results

Unnamed: 0,β[0],β[1],β[2],σ²
Truth,3.0,-5.0,1.0,5.0


## Least Squares

In [6]:
beta_hat_ols = np.linalg.inv(X.T @ X) @ X.T @ y
beta_hat_ols = np.linalg.inv(X.T @ X) @ X.T @ y
residual_ols = y - X @ beta_hat_ols
rss_ols = residual_ols.T @ residual_ols
sigma_squared_hat_ols = rss_ols[0, 0] / (N - p)
# sigma_hat_ols = sigma_squared_hat_ols ** 0.5
theta_ols = tuple(float(b) for b in beta_hat_ols) + (sigma_squared_hat_ols,)

In [7]:
results.loc[f'least squares (N={N})'] = theta_ols

In [8]:
print("====Results so far====")
results

====Results so far====


Unnamed: 0,β[0],β[1],β[2],σ²
Truth,3.0,-5.0,1.0,5.0
least squares (N=100),2.032267,-5.115539,1.492514,4.459838


In [9]:
# double check with scikit-learn
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression(fit_intercept=False)
linear_regression.fit(X, y)
linear_regression.coef_

array([[ 2.03226693, -5.11553864,  1.49251394]])

## MLE: likelihood + scipy.optimize.fmin

### define likelihood

In [10]:
from scipy import optimize
from scipy.stats import norm
from scipy.stats import multivariate_normal

In [11]:
class Likelihood(object):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __call__(self, theta):
        N, p = self.X.shape
        if N > 1000:
            return 0
        sigma_squared = theta[-1]
        beta = np.array(theta[0:p]).reshape((p, 1))
        mu = self.X @ beta
        joint_probability = multivariate_normal.pdf(
            y[:, 0],
            mu[:, 0],
            sigma_squared)
        return joint_probability


In [12]:
class LoggedFunction(object):
    def __init__(self, f, every_s=1):
        self.f = f
        self.every_s = every_s
        self.n_calls = 0
        self.time_of_last_log = time.time()

    def __call__(self, *args, **kwargs):
        self.n_calls += 1
        start = time.time()
        if (start - self.time_of_last_log) > 5:
            return "giving up!"
        its_been_a_while = (start - self.time_of_last_log) > self.every_s
        lets_log = its_been_a_while or self.n_calls == 1
        if lets_log:
            print(f"Call #{self.n_calls}:")
            print(*args, **kwargs)
        result = self.f(*args, **kwargs)
        if lets_log:
            print(result)
            end = time.time()
            print("wall time (s):", end - start)
            self.time_of_last_log = start
        return result


In [13]:
likelihood = Likelihood(X, y)

### optimize likelihood

In [14]:
initial_guess = tuple(0.0 for _ in range(p)) + (1.0,)

In [15]:
# test out the function
likelihood(theta_truth)

4.0290232730964096e-95

In [16]:
negative_likelihood = lambda x: -1 * likelihood(x)
# f = negative_likelihood
f = LoggedFunction(negative_likelihood, every_s=1)
theta_mle_likelihood = optimize.fmin(f, initial_guess, xtol=1e-6, ftol=1e-6, maxiter=1000)

Call #1:
[0. 0. 0. 1.]
-8.638496939073674e-209
wall time (s): 0.006022930145263672
Optimization terminated successfully.
         Current function value: -0.000000
         Iterations: 487
         Function evaluations: 818


In [17]:
results.loc[f'MLE: likelihood (N={N})'] = theta_mle_likelihood

In [18]:
print("====Results so far====")
results

====Results so far====


Unnamed: 0,β[0],β[1],β[2],σ²
Truth,3.0,-5.0,1.0,5.0
least squares (N=100),2.032267,-5.115539,1.492514,4.459838
MLE: likelihood (N=100),2.032267,-5.115538,1.492514,4.326043


## MLE: negative log likelihood + scipy.optimize.fmin

### define negative log likelihood

In [19]:
class NegativeLogLikelihood(object):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __call__(self, theta):
        N, p = self.X.shape
        sigma_squared = theta[-1]
        beta = np.array(theta[0:p]).reshape((p, 1))
        log_probabilities = norm.logpdf(
            x=self.y[:, 0],
            loc=(self.X @ beta)[:, 0],
            scale=sigma_squared ** 0.5
        )
        negative_log_likelihood = -1 * sum(log_probabilities)
        return negative_log_likelihood

negative_log_likelihood = NegativeLogLikelihood(X, y)

In [20]:
initial_guess = tuple(0.0 for _ in range(p)) + (1.0,)

In [21]:
negative_log_likelihood(initial_guess)

479.0840558334582

In [22]:
negative_log_likelihood(theta_truth)

217.35205985184663

In [23]:
negative_log_likelihood(theta_ols)

215.14947087529944

In [24]:
negative_log_likelihood(theta_mle_likelihood)

215.1265105010642

### optimize negative log likelihood

In [25]:
negative_log_likelihood(theta_ols)

215.14947087529944

In [26]:
f = LoggedFunction(negative_log_likelihood, every_s=1)
theta_mle_negloglikelihood = optimize.fmin(f, initial_guess, ftol=1e-6, xtol=1e-6, maxiter=3000)
print("")
print("solution:")
print(theta_mle_negloglikelihood)
print(f(theta_mle_negloglikelihood))

Call #1:
[0. 0. 0. 1.]
479.0840558334582
wall time (s): 0.0025451183319091797
Optimization terminated successfully.
         Current function value: 215.126511
         Iterations: 488
         Function evaluations: 820

solution:
[ 2.03226669 -5.11553874  1.49251416  4.32604225]
215.12651050106416


In [27]:
results.loc[f'MLE: -log(likelihood) (N={N})'] = theta_mle_negloglikelihood

In [28]:
print("====Results so far====")
print(results)
results

====Results so far====
                                   β[0]      β[1]      β[2]        σ²
Truth                          3.000000 -5.000000  1.000000  5.000000
least squares (N=100)          2.032267 -5.115539  1.492514  4.459838
MLE: likelihood (N=100)        2.032267 -5.115538  1.492514  4.326043
MLE: -log(likelihood) (N=100)  2.032267 -5.115539  1.492514  4.326042


Unnamed: 0,β[0],β[1],β[2],σ²
Truth,3.0,-5.0,1.0,5.0
least squares (N=100),2.032267,-5.115539,1.492514,4.459838
MLE: likelihood (N=100),2.032267,-5.115538,1.492514,4.326043
MLE: -log(likelihood) (N=100),2.032267,-5.115539,1.492514,4.326042


### With more data

In [29]:
N = 100000
rng = np.random.RandomState(seed=0)
fake_data = FakeData(N, p, beta, sigma_squared, rng=rng)
y, X = fake_data.y, fake_data.X
y.shape, X.shape

((100000, 1), (100000, 3))

#### Least Squares

In [30]:
beta_hat_ols = np.linalg.inv(X.T @ X) @ X.T @ y
beta_hat_ols = np.linalg.inv(X.T @ X) @ X.T @ y
residual_ols = y - X @ beta_hat_ols
rss_ols = residual_ols.T @ residual_ols
sigma_squared_hat_ols = rss_ols[0, 0] / (N - p)
# sigma_hat_ols = sigma_squared_hat_ols ** 0.5
theta_ols = tuple(float(b) for b in beta_hat_ols) + (sigma_squared_hat_ols,)

In [31]:
results.loc[f'least squares (N={N})'] = theta_ols

In [32]:
print("====Results so far====")
results

====Results so far====


Unnamed: 0,β[0],β[1],β[2],σ²
Truth,3.0,-5.0,1.0,5.0
least squares (N=100),2.032267,-5.115539,1.492514,4.459838
MLE: likelihood (N=100),2.032267,-5.115538,1.492514,4.326043
MLE: -log(likelihood) (N=100),2.032267,-5.115539,1.492514,4.326042
least squares (N=100000),3.025649,-5.028245,0.999202,5.011678


In [33]:
# double check with scikit-learn
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression(fit_intercept=False)
linear_regression.fit(X, y)
linear_regression.coef_

array([[ 3.0256486 , -5.02824483,  0.99920249]])

In [34]:
negative_log_likelihood = NegativeLogLikelihood(X, y)

In [35]:
negative_log_likelihood(theta_ols)

222480.89592429655

In [36]:
f = LoggedFunction(negative_log_likelihood, every_s=1)
theta_mle_negloglikelihood = optimize.fmin(f, initial_guess, ftol=1e-6, xtol=1e-6, maxiter=3000)
print("")
print("solution:")
print(theta_mle_negloglikelihood)
print(f(theta_mle_negloglikelihood))

Call #1:
[0. 0. 0. 1.]
502682.670681692
wall time (s): 0.03987598419189453
Call #40:
[ 1.00744944e-02 -3.73373785e-02  7.62493682e-03  1.26882833e+01]
251117.28759407744
wall time (s): 0.02255988121032715
Call #82:
[ 5.94986288e-03 -2.34328837e-02  4.26246321e-03  8.19247676e+00]
247014.7727401743
wall time (s): 0.023183822631835938
Call #124:
[-4.89896482e-02 -9.05182983e-02  4.51315823e-03  8.48663618e+00]
246553.4127067833
wall time (s): 0.023405075073242188
Call #168:
[-5.72186498e-01 -7.12604139e-01  3.62041211e-03  7.70550741e+00]
244267.1806379409
wall time (s): 0.024611711502075195
Call #211:
[-5.53024964e-01 -6.90016371e-01  3.68573162e-03  7.77943030e+00]
244262.64736546265
wall time (s): 0.022819042205810547
Call #254:
[-0.4881476  -0.62881507  0.01926741  7.72570981]
244208.07644335687
wall time (s): 0.022964954376220703
Call #297:
[-0.24113794 -1.30764069  0.9337691   6.7693763 ]
239924.04645759435
wall time (s): 0.022605180740356445
Call #338:
[-0.30058981 -3.34492148  2.

In [37]:
results.loc[f'MLE: -log(likelihood) (N={N})'] = theta_mle_negloglikelihood

In [38]:
print("====Results so far====")
print(results)
results

====Results so far====
                                      β[0]      β[1]      β[2]        σ²
Truth                             3.000000 -5.000000  1.000000  5.000000
least squares (N=100)             2.032267 -5.115539  1.492514  4.459838
MLE: likelihood (N=100)           2.032267 -5.115538  1.492514  4.326043
MLE: -log(likelihood) (N=100)     2.032267 -5.115539  1.492514  4.326042
least squares (N=100000)          3.025649 -5.028245  0.999202  5.011678
MLE: -log(likelihood) (N=100000)  3.025649 -5.028244  0.999202  5.011528


Unnamed: 0,β[0],β[1],β[2],σ²
Truth,3.0,-5.0,1.0,5.0
least squares (N=100),2.032267,-5.115539,1.492514,4.459838
MLE: likelihood (N=100),2.032267,-5.115538,1.492514,4.326043
MLE: -log(likelihood) (N=100),2.032267,-5.115539,1.492514,4.326042
least squares (N=100000),3.025649,-5.028245,0.999202,5.011678
MLE: -log(likelihood) (N=100000),3.025649,-5.028244,0.999202,5.011528



## MLE: negative log likelihood + scipy.optimize.newton

In [39]:
class NegativeLogLikelihood(object):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __call__(self, theta):
        N, p = self.X.shape
        sigma_squared = theta[-1]
        beta = np.array(theta[0:p]).reshape((p, 1))
        log_probabilities = norm.logpdf(
            x=self.y[:, 0],
            loc=(self.X @ beta)[:, 0],
            scale=sigma_squared ** 0.5
        )
        negative_log_likelihood = -1 * sum(log_probabilities)
        return negative_log_likelihood
    
    def gradient_wrt_theta(self, theta):
        N, p = self.X.shape
        sigma_squared = theta[-1]
        beta = np.array(theta[0:p]).reshape((p, 1))
        beta_portion = self.X.T @ self.y - self.X.T @ self.X @ beta
        residuals = y - X @ beta
        sigma_squared_portion = -0.5 / sigma_squared * (1 - ((residuals.T @ residuals)[0, 0] / sigma_squared))
        sigma_squared_portion = np.array(sigma_squared_portion).reshape((1, 1))
        gradient_log_likelihood = np.append(beta_portion, sigma_squared_portion)
        gradient_neg_log_likelihood = -1 * gradient_log_likelihood
        return gradient_neg_log_likelihood


negative_log_likelihood = NegativeLogLikelihood(X, y)

In [40]:
print(theta_ols)
negative_log_likelihood(theta_ols)

(3.0256485975893357, -5.028244834872511, 0.9992024914920415, 5.011678315318209)


222480.89592429655

In [42]:
negative_log_likelihood.gradient_wrt_theta(theta_truth)

array([  -129.57115266,    320.31104121,     92.00924712, -10023.20480652])

# Appendix

### Past results

```
ftol=1e-11, xtol=1e-11, maxiter=1000
[ 3.01147067 -5.01263143  0.99964318  1.00115215]
142009.0002800877

ftol=1e-14, xtol=1e-14, maxiter=1000

Optimization terminated successfully.
         Current function value: 142009.000280
         Iterations: 661
         Function evaluations: 1191

solution:
[ 3.01147067 -5.01263143  0.99964318  1.00115215]
142009.0002800877


N = 1000000
ftol=1e-14, xtol=1e-14, maxiter=1000
... took too long!
ftol=1e-6, xtol=1e-6
Call #796:
[ 2.99618089 -5.00124526  1.00226668  0.99884177]
1417772.6734368044
wall time (s): 0.24301505088806152
Warning: Maximum number of function evaluations has been exceeded.

solution:
[ 2.99622187 -5.0012157   1.00224932  0.99883615]
```

- Issue: This fails for `N = 100000` and without `xtol`. 
  - Solution: go for more iterations
  - Solution: specify `xtol=1e-7`, everything is fine. Not sure why.
  - Source: "when you set ftol smaller than xtol during the search, xtol is exceeded before ftol is reached so it terminates prematurely" (https://stackoverflow.com/questions/9667514/what-is-the-difference-between-xtol-and-ftol-to-use-fmin-of-scipy-optimize#comment12285824_9669373)
- Issue: Fails when sigma_squared is large, e.g. 2500 (50 squared)

```
N = 100000
beta = np.array([3.0, -5.0, 1.0])
p = len(beta)
beta = beta.reshape((p, 1))
sigma = 10.0
                                    β[0]      β[1]      β[2]          σ
Truth                           3.000000 -5.000000  1.000000  10.000000
least squares                   3.114704 -5.126315  0.996433  10.011672
MLE w/ likelihood               0.000000  0.000000  0.000000   1.000000
MLE w/ negative log likelihood  3.114709 -5.126314  0.996428  10.011522

N = 100000
beta = np.array([3.0, -5.0, 1.0])
p = len(beta)
beta = beta.reshape((p, 1))
sigma = 50.0
                                    β[0]      β[1]      β[2]          σ
Truth                           3.000000 -5.000000  1.000000  50.000000
least squares                   3.573520 -5.631574  0.982167  50.058358
MLE w/ likelihood               0.000000  0.000000  0.000000   1.000000
MLE w/ negative log likelihood  3.753647 -3.448924 -1.755966  50.068533
```
- Issue: fails with a large coefficient

```
N = 100000
beta = np.array([3.0, -5.0, 1.0])
p = len(beta)
beta = beta.reshape((p, 1))
sigma = 10.0
                                    β[0]      β[1]      β[2]          σ
Truth                           3.000000 -5.000000  1.000000  10.000000
least squares                   3.114704 -5.126315  0.996433  10.011672
MLE w/ likelihood               0.000000  0.000000  0.000000   1.000000
MLE w/ negative log likelihood  3.114709 -5.126314  0.996428  10.011522
```

```
N = 100000
beta = np.array([3.0, -5.0, 1.0])
p = len(beta)
beta = beta.reshape((p, 1))
sigma_squared = 100.0

                                    β[0]      β[1]      β[2]          σ²
Truth                           3.000000 -5.000000  1.000000  100.000000
least squares                   3.114704 -5.126315  0.996433  100.233566
MLE w/ likelihood               0.000000  0.000000  0.000000    1.000000
MLE w/ negative log likelihood  3.114705 -5.126320  0.996436  100.230576

```

- another run...

```
N = 1000000
beta = np.array([3.0, -5.0, 1.0])
p = len(beta)
beta = beta.reshape((p, 1))
sigma_squared = 1.0

Optimization terminated successfully.
         Current function value: 1417772.672752
         Iterations: 449
         Function evaluations: 746

solution:
[ 2.99614446 -5.00118483  1.00229821  0.9976711 ]
1417772.6727520851

                                    β[0]      β[1]      β[2]        σ²
Truth                           3.000000 -5.000000  1.000000  1.000000
least squares                   2.996144 -5.001184  1.002298  0.997674
MLE w/ likelihood               0.000000  0.000000  0.000000  1.000000
MLE w/ negative log likelihood  2.996144 -5.001185  1.002298  0.997671
```