In [2]:
import numpy as np
import matplotlib.pyplot as plt

from bayesian_network import BayesNet
from utils import sample_forward, get_default_bayes_net

# Parameter Learning

In this problem, we will assume that a fixed dependency graph structure between variables is given and learn the parameters (the complete Conditional Probability Distribution Table (CPDT)) from a set of events. Furthermore, we will use log-likelihood to find a model structure which also generalizes to future data.
    
## ML Estimates for Conditional Distributions

<div class="alert alert-warning">
    To implement: <i>maximum_likelihood_estimate</i> function, which computes the Maximum Likelihood Estimate for the parameters of a discrete (conditional) probability distribution $ P(X \mid \mathit{pa}(X) )$, given a data set.
</div>

`maximum_likelihood_estimate` takes three parameters:
- `data` is a NumPy array of shape `(num_samples, num_variables)`.
- `variable_id` is the column index of the variable to estimate the distribution for.
- `parent_ids` is a tuple, containing the column indices of parent variables.

`maximum_likelihood_estimate` must return one object:
- A Maximum Likelihood Estimate (MLE) of the parameters in the form of a `np.ndarray`. The first dimension (index `0`) of the returned array must correspond to variable `variable_id`; the remaining dimensions must be sorted according to `parent_ids`. Altogether, the tuple `(variable_id, ) + parent_ids` gives the mapping of dimensions to variables.


In [34]:
from itertools import combinations

def maximum_likelihood_estimate(data: np.ndarray, variable_id: int, parent_ids: tuple=tuple()):
    """
    Estimates the conditional probability distribution of a (discrete) variable from data.

    The estimate is the relative frequency of each value of the variable within
    every configuration of its parents. A parent configuration that never occurs
    in `data` has no defined maximum likelihood estimate; such configurations are
    assigned a uniform distribution so that the table never contains NaN and
    every distribution along axis 0 still sums to one.

    :param data:    integer data of shape (num_samples, num_variables) to estimate the
                    distribution from; all variables are assumed to be binary (0 or 1)
    :param variable_id:  column index corresponding to the variable we estimate the distribution for
    :param parent_ids: column indices of the variables the distribution is conditioned on
    :returns: estimated conditional probability distribution table; axis 0 corresponds to
              `variable_id`, the remaining axes follow the order of `parent_ids`
    """

    assert isinstance(variable_id, (int, np.integer))
    assert isinstance(parent_ids, tuple)

    # mapping of axis to variable_id,
    # e.g. the variable with id variable_ids[i] is on axis i of the CPDT
    variable_ids = (variable_id,) + parent_ids

    # one axis of size 2 per (binary) variable
    counts = np.zeros((2,) * len(variable_ids))

    # count occurrences of every joint configuration; np.add.at accumulates
    # correctly when the same index appears more than once
    samples = np.asarray(data)
    if samples.size:
        np.add.at(counts, tuple(samples[:, v] for v in variable_ids), 1)

    # normalise over the estimated variable (axis 0) for each parent configuration
    totals = counts.sum(axis=0, keepdims=True)
    # start from the uniform distribution and overwrite only those columns that
    # were actually observed; this avoids dividing by zero for unseen parents
    cpdt = np.full_like(counts, 1.0 / counts.shape[0])
    np.divide(counts, totals, out=cpdt, where=totals > 0)
    return cpdt


In [35]:
# sanity checks
_A_, _B_, _C_, _D_, _E_ = 0, 1, 2, 3, 4
# reuse the Bayesian network from the previous problem
bayes_net = get_default_bayes_net()
np.random.seed(0)
# draw 100 samples
data = sample_forward(bayes_net, 100)

# A: exact table taken from the network vs. the estimate from the samples
expected = bayes_net[_A_].pdt[:,0,0,0,0]
actual = maximum_likelihood_estimate(data, _A_)
# the estimate should not be far off
assert np.allclose(expected, actual, atol=0.05)

# B given A: exact table taken from the network (transposed) vs. the estimate from the samples
expected = bayes_net[_B_].pdt[:,:,0,0,0].T
actual = maximum_likelihood_estimate(data, _B_, (_A_,))
# the estimate should not be far off
assert np.allclose(expected, actual, atol=0.05)

[1 1 0 0 1]
(1,)
[0. 1.]
####
[1 1 0 0 0]
(1,)
[0. 2.]
####
[1 1 0 0 0]
(1,)
[0. 3.]
####
[1 1 0 0 0]
(1,)
[0. 4.]
####
[1 1 1 0 1]
(1,)
[0. 5.]
####
[1 1 0 0 0]
(1,)
[0. 6.]
####
[1 1 0 0 1]
(1,)
[0. 7.]
####
[1 1 0 0 0]
(1,)
[0. 8.]
####
[1 0 0 1 0]
(1,)
[0. 9.]
####
[0 0 0 1 0]
(0,)
[1. 9.]
####
[1 0 0 1 0]
(1,)
[ 1. 10.]
####
[1 1 0 0 0]
(1,)
[ 1. 11.]
####
[1 0 0 1 0]
(1,)
[ 1. 12.]
####
[0 0 0 0 1]
(0,)
[ 2. 12.]
####
[1 1 1 0 1]
(1,)
[ 2. 13.]
####
[0 0 0 1 0]
(0,)
[ 3. 13.]
####
[0 0 0 1 0]
(0,)
[ 4. 13.]
####
[0 0 0 1 0]
(0,)
[ 5. 13.]
####
[0 0 0 1 0]
(0,)
[ 6. 13.]
####
[0 0 0 0 0]
(0,)
[ 7. 13.]
####
[1 1 0 0 0]
(1,)
[ 7. 14.]
####
[1 1 0 0 0]
(1,)
[ 7. 15.]
####
[1 1 0 0 1]
(1,)
[ 7. 16.]
####
[1 1 0 0 1]
(1,)
[ 7. 17.]
####
[1 1 0 0 0]
(1,)
[ 7. 18.]
####
[1 1 0 0 0]
(1,)
[ 7. 19.]
####
[0 0 0 1 0]
(0,)
[ 8. 19.]
####
[1 1 0 0 0]
(1,)
[ 8. 20.]
####
[1 1 0 0 0]
(1,)
[ 8. 21.]
####
[1 1 0 1 0]
(1,)
[ 8. 22.]
####
[0 0 0 1 1]
(0,)
[ 9. 22.]
####
[1 1 0 0 1]
(1,)
[ 9. 23.]
#

### The Log-Likelihood Function
<div class="alert alert-warning">
    To implement: <i>log_likelihood</i> function, which computes the log-likelihood $\mathcal{L}(\mathcal{M} : \mathcal{D})$ of a model (BayesNet) relative to a data set.
</div>

`log_likelihood` takes two parameters:
- `data` is a NumPy array of shape `(num_samples, num_variables)`.
- `bayes_net` is a BayesNet object representing the model $\mathcal{M}$ (containing already estimated CPDTs).

`log_likelihood` must return one object:
- The log-likelihood of the model given the data (i.e., a floating point number (<= 0)).


In [19]:
def log_likelihood(data: np.ndarray, bayes_net: BayesNet):
    """
    Computes the log-likelihood of a given Bayesian network relative to the data.

    For every sample, each variable of the network is called with the sample to
    obtain its distribution; the log of the entry selected by the sample's own
    value for that variable is added to the total.

    :param data: data to compute the log-likelihood relative to.
    :param bayes_net: Bayesian network model.
    :returns: the log-likelihood of the Bayesian network relative to the data.
    """

    # samples in the outer loop, variables in the inner loop, accumulated from 0
    return sum(
        np.sum(np.log(node(sample)[sample[node.id]]))
        for sample in data
        for node in bayes_net
    )


In [20]:
# sanity checks
# reuse the Bayesian network from the previous problem
bayes_net = get_default_bayes_net()
np.random.seed(0)
# draw 100 samples
data = sample_forward(bayes_net, 100)

# reference log-likelihood vs. the value computed on the samples
expected = -215.9
actual = log_likelihood(data, bayes_net)

# must be close
assert np.allclose(expected, actual, atol=0.1)


# remove unused variables
del data, bayes_net

## Finding a Model for Strokes   

After watching hours of medical dramas on television, a medicine freshman tries to figure out the perfect prediction model for strokes. Some of her computer science colleagues told her about how Bayesian networks can be used for symptom diagnosis, so she decides to model her ideas using this technique. In order to find out the (conditional) probability distributions, she needs to find a lot of training examples. One of her computer science colleagues cracks the computer system of the university clinic and copies a lot of medical patient data. 

All variables in this example are boolean (false=0 or true=1). The data for this assignment is stored in two text files, "train.txt" and "test.txt". They contain 500 (train) and 5000 (test) samples of the following 5 random variables:

 - Column 0: A ... Alcoholism
 - Column 1: H ... High Blood Pressure
 - Column 2: S ... Stroke
 - Column 3: C ... Confusion
 - Column 4: V ... Vertigo
 
First, she decides to try out the following, very simple, model:
    
<img style='width:100%;  max-width:400px;' src="img/bn_mod1.svg">

<br>

<div class="alert alert-warning">
    Estimate the (conditional) probability tables of Model 1 and compute the log-likelihood of Model 1 given the training data.
</div>

Store the CPDTs into the provided variables.

In [21]:
_A_, _H_, _S_, _C_, _V_ = 0, 1, 2, 3, 4
train = np.loadtxt('data/train.txt', dtype=int)

# In Model 1 every variable is estimated on its own, without any parents.
A, H, S, C, V = (
    maximum_likelihood_estimate(train, column_id)
    for column_id in (_A_, _H_, _S_, _C_, _V_)
)

# begin sanity check
# every estimated table must sum to one along axis 0
assert all(
    np.all(np.isclose(cpdt.sum(axis=0), 1))
    for cpdt in (A, H, S, C, V)
)
# end sanity check

# each entry pairs a table with the variable IDs of its axes
bayes_net_1 = BayesNet(
    (A, (_A_,)),
    (H, (_H_,)),
    (S, (_S_,)),
    (C, (_C_,)),
    (V, (_V_,)),
)

tr_log_likelihood_1 = log_likelihood(train, bayes_net_1)

In [22]:
# sanity check: the training log-likelihood must lie strictly between the two bounds
assert -1300 < tr_log_likelihood_1 < -1100

Not satisfied with the results, our freshman decides that Model 1 is probably too simple to represent the real world. She adds some edges to the model, and tries again:

<img  style='width:100%;  max-width:400px;' src="img/bn_mod2.svg">

<div class="alert alert-warning">
    Estimate the (conditional) probability tables of Model 2 and compute the log-likelihood of Model 2 given the training data. 
</div>

Store the CPDTs into the provided variables. The dimensions of the CPDT must be sorted according to the naming of the variable, e.g., in C_AS, dimension 0 corresponds to C, dimension 1 to A, and dimension 2 to S.


In [23]:
_A_, _H_, _S_, _C_, _V_ = 0, 1, 2, 3, 4
train = np.loadtxt('data/train.txt', dtype=int)

A, H_A, S_H, C_AS, V_S = None, None, None, None, None

# Tables are named <variable>_<parents>; the axes follow that same order.
A = maximum_likelihood_estimate(train, _A_)
H_A = maximum_likelihood_estimate(train, _H_, parent_ids=(_A_,))
S_H = maximum_likelihood_estimate(train, _S_, parent_ids=(_H_,))
C_AS = maximum_likelihood_estimate(train, _C_, parent_ids=(_A_, _S_))
V_S = maximum_likelihood_estimate(train, _V_, parent_ids=(_S_,))

# begin sanity check
# every estimated table must sum to one along axis 0
assert all(
    np.all(np.isclose(cpdt.sum(axis=0), 1))
    for cpdt in (A, H_A, S_H, C_AS, V_S)
)
# end sanity check

# each entry pairs a table with the variable IDs of its axes
bayes_net_2 = BayesNet(
    (A, (_A_,)),
    (H_A, (_H_, _A_)),
    (S_H, (_S_, _H_)),
    (C_AS, (_C_, _A_, _S_)),
    (V_S, (_V_, _S_)),
)

tr_log_likelihood_2 = log_likelihood(train, bayes_net_2)

In [24]:
# sanity check: the training log-likelihood must lie strictly between the two bounds
assert -1300 < tr_log_likelihood_2 < -1100


Finally, she decides to try out an even more complex model:

<img  style='width:100%;  max-width:400px;' src="img/bn_mod3.svg">

<div class="alert alert-warning">
    Estimate the (conditional) probability tables of Model 3 and compute the log-likelihood of Model 3 given the training data. 
</div>

Store the CPDTs into the provided variables. The dimensions of the CPDT must be sorted according to the naming of the variable, e.g., in C_AS, dimension 0 corresponds to C, dimension 1 to A, and dimension 2 to S.

**Hint**:
- Use the two functions you implemented above (`maximum_likelihood_estimate` and `log_likelihood`)!
- The training data is stored in variable `train`. 
- `_A_, _H_, _S_, _C_, _V_` hold the column indices (= IDs) of the variables. 

In [25]:
_A_, _H_, _S_, _C_, _V_ = 0, 1, 2, 3, 4
train = np.loadtxt('data/train.txt', dtype=int)


# Tables are named <variable>_<parents>; the axes follow that same order.
A = maximum_likelihood_estimate(train, _A_)
H_A = maximum_likelihood_estimate(train, _H_, parent_ids=(_A_,))
S_AH = maximum_likelihood_estimate(train, _S_, parent_ids=(_A_, _H_))
C_AS = maximum_likelihood_estimate(train, _C_, parent_ids=(_A_, _S_))
V_CS = maximum_likelihood_estimate(train, _V_, parent_ids=(_C_, _S_))

# begin sanity check
# every estimated table must sum to one along axis 0
assert all(
    np.all(np.isclose(cpdt.sum(axis=0), 1))
    for cpdt in (A, H_A, S_AH, C_AS, V_CS)
)
# end sanity check

# each entry pairs a table with the variable IDs of its axes
bayes_net_3 = BayesNet(
    (A, (_A_,)),
    (H_A, (_H_, _A_)),
    (S_AH, (_S_, _A_, _H_)),
    (C_AS, (_C_, _A_, _S_)),
    (V_CS, (_V_, _C_, _S_)),
)

tr_log_likelihood_3 = log_likelihood(train, bayes_net_3)

In [26]:
# sanity check: the training log-likelihood must lie strictly between the two bounds
assert -1300 < tr_log_likelihood_3 < -1100


### Compare Train Log-Likelihoods

Compare the log-likelihoods of the training data given the three models.

In [27]:
# report the training log-likelihood of the three models, one per line
print(
    'logP(train|M1) = {}'.format(tr_log_likelihood_1),
    'logP(train|M2) = {}'.format(tr_log_likelihood_2),
    'logP(train|M3) = {}'.format(tr_log_likelihood_3),
    sep='\n',
)

logP(train|M1) = -1214.6425612469914
logP(train|M2) = -1175.123630631206
logP(train|M3) = -1174.6821826610374


### Compare Test Log-Likelihoods

The computer science colleague manages to obtain more data from the clinic's network and stores it in the file `test.txt`. Our medicine freshman is eager to try the models on the new data.

In [28]:
# Read the test samples from the text file as an integer array.
test = np.loadtxt('data/test.txt', dtype=int)

She computes the log-likelihood of the test data for each of the three models:

In [29]:
# evaluate the three models (in order) on the test data
te_log_likelihood_1, te_log_likelihood_2, te_log_likelihood_3 = (
    log_likelihood(test, model)
    for model in (bayes_net_1, bayes_net_2, bayes_net_3)
)

# report the test log-likelihood of the three models, one per line
print(
    'logP(test|M1) = {}'.format(te_log_likelihood_1),
    'logP(test|M2) = {}'.format(te_log_likelihood_2),
    'logP(test|M3) = {}'.format(te_log_likelihood_3),
    sep='\n',
)

logP(test|M1) = -11911.976345359737
logP(test|M2) = -11565.57072258751
logP(test|M3) = -11571.024593414042
