In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import chi2, poisson

In [None]:
# Observed frequency table: n_k[i] is the number of observations recorded for
# the value k[i]. Presumably k is the number of deaths per year (see the plot
# title further down) -- confirm against the data source.
n_k = np.array([109, 65, 22, 3, 1, 0])
k = np.arange(n_k.size)

To fit a Poisson model to the data, note that the log-likelihood, up to an additive constant that does not depend on $\theta$, is given by the expression at the bottom of page 83 in the book (the last category does not contribute because its count is zero):

$$
\log L(\theta) = - \theta \sum_k n_k + \sum_k k n_k \log \theta
$$

and the MLE was shown to be:

$$
\hat{\theta} = \sum_k k n_k \big/ \sum_k n_k
$$

In [None]:
def log_likelihood_poisson(
    k: np.ndarray, n_k: np.ndarray, theta: np.ndarray) -> np.ndarray:
    """Evaluate the Poisson log-likelihood of a frequency table at ``theta``.

    Computes ``-theta * sum(n_k) + sum(k * n_k) * log(theta)``; no other
    terms are included.

    Args:
        k: Observed values.
        n_k: Frequencies, with ``n_k[i]`` paired with ``k[i]``.
        theta: Parameter value(s) at which to evaluate the log-likelihood.

    Returns:
        The log-likelihood for each element of ``theta``.
    """
    # Entries with zero frequency add nothing to either total, so the
    # table does not need to be trimmed first.
    n_total = np.sum(n_k)
    weighted_total = np.sum(k * n_k)
    return weighted_total * np.log(theta) - theta * n_total


def mle_poisson(k: np.ndarray, n_k: np.ndarray) -> float:
    """Return the frequency-weighted mean of ``k``: ``sum(k * n_k) / sum(n_k)``.

    Args:
        k: Observed values.
        n_k: Frequencies, with ``n_k[i]`` paired with ``k[i]``.

    Returns:
        The ratio of the weighted total to the total frequency.
    """
    weighted_total = np.sum(k * n_k)
    n_total = np.sum(n_k)
    return weighted_total / n_total

In [None]:
# Evaluate the log-likelihood on a grid of candidate values for theta.
theta = np.linspace(0.01, 1, num=100)
log_like = log_likelihood_poisson(k, n_k, theta)
# Scale in log space before exponentiating so the peak is exactly 1.
# Exponentiating the raw log-likelihood first can underflow to 0 when the
# values are very negative, and the rescaling would then produce 0/0 = NaN.
like = np.exp(log_like - np.max(log_like))
mle = mle_poisson(k, n_k)
print('MLE = ', mle)

In [None]:
def plot_likelihood(theta: np.ndarray, likelihood: np.ndarray, mle: float) -> None:
    """Draw a likelihood curve on the current axes and mark the MLE.

    Args:
        theta: Values for the horizontal axis.
        likelihood: Values plotted against ``theta``.
        mle: Position of the dashed vertical line.
    """
    axes = plt.gca()
    axes.plot(theta, likelihood)
    axes.axvline(mle, linestyle='--')
    axes.set_xlabel(r'$\theta$')
    axes.set_ylabel('Likelihood')
    axes.set_title('Likelihood of soldiers killed by horse kicks per year')
    # The legend labels are given positionally: the curve first, then the
    # vertical line.
    axes.legend(['likelihood', 'MLE'])

In [None]:
# Plot the likelihood over the theta grid, with the MLE marked.
plot_likelihood(theta=theta, likelihood=like, mle=mle)

Let's evaluate the goodness of fit of the Poisson model by looking at the difference between the observed frequencies ($n_k$) and the expected frequencies:

$$
e_k = N \hat{p}_k
$$

and using the chi square statistic:

$$
\chi^2 = \sum_k r_k^2
$$

where the residual:

$$
r_k = \frac{n_k - e_k}{\sqrt{e_k}}
$$

In [None]:
def expected_freq(k: np.ndarray, n_k: np.ndarray, mle: float) -> np.ndarray:
    """Return the frequencies expected under a Poisson model with mean ``mle``.

    Each expected frequency is the total of ``n_k`` multiplied by the Poisson
    probability mass at the corresponding ``k``.

    Args:
        k: Values at which the Poisson pmf is evaluated.
        n_k: Observed frequencies; only their sum is used.
        mle: Mean parameter passed to ``poisson.pmf``.

    Returns:
        Expected frequencies, one per element of ``k``.
    """
    total = np.sum(n_k)
    return total * poisson.pmf(k, mle)


def chi_sqare(k: np.ndarray, n_k: np.ndarray, mle: float) -> float:
    """Return the chi-square statistic of observed vs. expected frequencies.

    The statistic is the sum of squared residuals, where each residual is
    ``(n_k - e_k) / sqrt(e_k)`` and ``e_k`` comes from ``expected_freq``.

    Args:
        k: Observed values.
        n_k: Observed frequencies, paired with ``k``.
        mle: Poisson mean used to compute the expected frequencies.

    Returns:
        The sum of squared residuals.
    """
    expected = expected_freq(k, n_k, mle)
    residuals = (n_k - expected) / np.sqrt(expected)
    return np.sum(np.square(residuals))

In [None]:
# Round the expected frequencies to the nearest whole count for display.
# A plain integer cast truncates toward zero, which understates every
# non-integer expected count in the side-by-side comparison.
print(f'e_k = {np.rint(expected_freq(k, n_k, mle)).astype(int)}')
print(f'n_k = {n_k}')

It is clear from this comparison that the Poisson model is an excellent fit to the data, since the expected and observed frequencies are very similar. Or, more rigorously:

In [None]:
chi_sq = np.round(chi_sqare(k, n_k, mle), 1)
# Degrees of freedom for the goodness-of-fit test: number of categories,
# minus one, minus the number of parameters estimated from the same data.
# Here theta was estimated by its MLE, so one further degree is lost.
n_estimated_params = 1
df = len(k) - 1 - n_estimated_params
print(f'observed chi-square = {chi_sq} at N = {df} degrees of freedom')
alpha = 0.05
# The value printed as "expected" is the (1 - alpha) quantile of the
# chi-square distribution with df degrees of freedom, i.e. the threshold the
# observed statistic is compared against.
exp_ch_sq = np.round(chi2.ppf(1 - alpha, df), 1)
print(f'expected chi-square = {exp_ch_sq} at N = {df} degrees of freedom')

The observed $\chi^2$ value is far below the critical $\chi^2$ value (printed above as the expected chi-square) at a significance level of $\alpha = 0.05$. As a result, there is not sufficient evidence in the data to reject the null hypothesis that the data are drawn from a Poisson distribution.