# Homework of Week 1

In [67]:
import numpy as np
import matplotlib.pyplot as plt

from typing import Callable
from scipy.integrate import solve_ivp

# Helper Functions

In [68]:
def entropy(pp: np.ndarray, b: int = 2) -> float:
    """
    Calculates the Shannon entropy of a discrete random variable (or the joint
    entropy of several variables when given their joint probability table).

    Args:
        pp: Probability vector or joint probability table of any shape. Any
            array-like (list, tuple, ndarray) is accepted. The entries are used
            as given; no normalisation is performed.
        b: Log base (2 gives the result in bits).

    Returns:
        float: Entropy.
    """

    # Accept plain lists/tuples as well as arrays
    pp = np.asarray(pp, dtype=float)

    # Probability 0 events contribute zero to the entropy
    pp = pp[pp > 0]

    # Entropy; adding 0.0 turns the -0.0 obtained for a deterministic
    # distribution into a plain 0.0 and leaves every other value unchanged
    return -np.sum(pp * np.log(pp) / np.log(b)) + 0.0


def conditional_entropy(pp: np.ndarray, b: int = 2) -> float:
    """
    Calculates the conditional entropy H(X | Y) for two discrete random variables X and Y.

    Args:
        pp: Joint probability table indexed as pp[index_x, index_y], i.e. X
            varies along rows and Y along columns.
        b: Log base (2 gives the result in bits).

    Returns:
        float: Conditional entropy H(X | Y).
    """

    pp = np.asarray(pp, dtype=float)

    # Marginal probability P(Y): sum the joint table over X (axis 0)
    ppy = np.sum(pp, axis=0, keepdims=True)

    # Conditional probability P(X | Y). The division is only carried out in
    # columns where P(Y) > 0, so no 0/0 is ever evaluated; the remaining
    # entries keep the 0 they were initialised with.
    ppx_given_y = np.divide(pp, ppy, out=np.zeros_like(pp), where=ppy > 0)

    # Zero-probability entries contribute zero: substitute 1 so that log gives 0
    safe_conditional = np.where(ppx_given_y > 0, ppx_given_y, 1.0)

    # Conditional entropy; adding 0.0 avoids returning -0.0
    return -np.sum(pp * np.log(safe_conditional) / np.log(b)) + 0.0



def mutual_information(pp: np.ndarray, b: int = 2) -> float:
    """
    Calculates mutual information I(X; Y) for two discrete random variables X and Y.

    Computed as the sum over all joint outcomes of
    P(x, y) * log(P(x, y) / (P(x) P(y))).

    Args:
        pp: Joint probability table (2-D), with one variable along each axis.
        b: Log base (2 gives the result in bits).

    Returns:
        float: Mutual Information.
    """

    pp = np.asarray(pp, dtype=float)

    # Marginals along each axis; their broadcast product is the joint
    # distribution the two variables would have if they were independent
    pp_axis0 = np.sum(pp, axis=0, keepdims=True)
    pp_axis1 = np.sum(pp, axis=1, keepdims=True)
    pp_independent = pp_axis1 * pp_axis0

    # Joint outcomes with probability 0 contribute zero, so evaluate the
    # logarithm only where pp > 0 (for a non-negative table both marginals are
    # then positive too, so the ratio is finite)
    positive = pp > 0

    return np.sum(
        pp[positive] * np.log(pp[positive] / pp_independent[positive]) / np.log(b)
    ) + 0.0

In [69]:
def normalise(pp: np.ndarray) -> np.ndarray:
    """
    Rescales an array of weights so that all of its entries sum to one.

    Args:
        pp: Input array (any shape) of unnormalised probabilities.

    Returns:
        np.ndarray: Array of the same shape, divided by the sum of all entries.
    """

    total = np.sum(pp)
    return pp / total

# Exercise 1

In [70]:
# Input: unnormalised joint table, normalised on construction
# (rows index X and columns index Y, matching the marginals below)
pp = normalise(np.array([
    [0.1, 0.2, 0.1],
    [0.1, 0.3, 0.2],
    [0.2, 0.1, 0.3],
]))


# Marginal distributions: sum out the other variable
ppx = pp.sum(axis=1)
ppy = pp.sum(axis=0)


# Determine desired properties
# NOTE(review): entropy(pp) over the full joint table is the joint entropy
# H(X, Y), although the last label below calls it "Mutual entropy".
print(f"Entropy of X: {entropy(ppx)}")
print(f"Entropy of Y: {entropy(ppy)}")
print(f"Mutual entropy of (X, Y): {entropy(pp)}")

Entropy of X: 1.561278124459133
Entropy of Y: 1.561278124459133
Mutual entropy of (X, Y): 3.0306390622295662


# Exercise 2

We can see that:
- Y is the distribution with the highest entropy for two possible outcomes (total lack of knowledge, hence we assign 50% to each outcome).
- H(X | Y) is equal to H(X) because knowing Y tells us nothing about X, as they are independent.

In [71]:
# Input: marginal distributions of X and Y
ppx = np.array([1/3, 2/3])
ppy = np.array([1/2, 1/2])

# Under independence the joint factorises: P(X = x, Y = y) = P(X = x) * P(Y = y)
pp = ppx[:, np.newaxis] * ppy[np.newaxis, :]
print(f"Joint probability distribution assuming independence: \n{pp}\n")

# Determine desired properties
# NOTE(review): entropy(pp) over the full joint table is the joint entropy
# H(X, Y), although the label below calls it "Mutual entropy".
print(f"Entropy of X: {entropy(ppx)}")
print(f"Entropy of Y: {entropy(ppy)}")
print(f"Mutual entropy of (X, Y): {entropy(pp)}")
print(f"Conditional entropy H(X | Y): {conditional_entropy(pp)}")

Joint probability distribution assuming independence: 
[[0.16666667 0.16666667]
 [0.33333333 0.33333333]]

Entropy of X: 0.9182958340544896
Entropy of Y: 1.0
Mutual entropy of (X, Y): 1.9182958340544896
Conditional entropy H(X | Y): 0.9182958340544896


# Exercise 4

In [72]:
# Input: unnormalised joint table, normalised on construction
# (here X varies along columns and Y along rows, matching the marginals below)
pp = normalise(np.array([
    [0.2, 0.3, 0.15],
    [0.1, 0.15, 0.05],
]))

# Marginals, kept 2-D so they broadcast against the joint table
ppx = pp.sum(axis=0, keepdims=True)
ppy = pp.sum(axis=1, keepdims=True)

print(f"Marginal probability distribution for X: {ppx.flatten()}")
print(f"Marginal probability distribution for Y: {ppy.flatten()}")
print("")


# Conditional probability distributions: joint divided by the marginal of the
# variable being conditioned on
ppx_given_y = np.divide(pp, ppy)
ppy_given_x = np.divide(pp, ppx)

print(f"Probability distribution for P(X | Y): \n{ppx_given_y}\n")
print(f"Probability distribution for P(Y | X): \n{ppy_given_x}\n")

Marginal probability distribution for X: [0.31578947 0.47368421 0.21052632]
Marginal probability distribution for Y: [0.68421053 0.31578947]

Probability distribution for P(X | Y): 
[[0.30769231 0.46153846 0.23076923]
 [0.33333333 0.5        0.16666667]]

Probability distribution for P(Y | X): 
[[0.66666667 0.66666667 0.75      ]
 [0.33333333 0.33333333 0.25      ]]



# Exercise 5

In [73]:
# Input: unnormalised joint table, rescaled so that its entries sum to one
pp = normalise(np.array([
    [0.1, 0.05],
    [0.2, 0.1],
    [0.05, 0.15],
]))

print(f"Joint probability distribution: \n{pp}\n")

Joint probability distribution: 
[[0.15384615 0.07692308]
 [0.30769231 0.15384615]
 [0.07692308 0.23076923]]



# Exercise 6

In [74]:
# Initialize a 3D array filled with zeros
# Indexed as pp[x - 1, y - 1, z] (see the per-line comments below): X takes 3
# values, Y takes 4 values and Z takes 2 values. Entries that are not assigned
# below keep probability zero.
pp = np.zeros((3, 4, 2))

# Assign given probabilities
pp[0, 0, 0] = 0.05  # P(X=1, Y=1, Z=0)
pp[0, 1, 0] = 0.1   # P(X=1, Y=2, Z=0)
pp[0, 1, 1] = 0.1   # P(X=1, Y=2, Z=1)
pp[0, 2, 0] = 0.05  # P(X=1, Y=3, Z=0)
pp[1, 0, 1] = 0.1   # P(X=2, Y=1, Z=1)
pp[1, 1, 0] = 0.05  # P(X=2, Y=2, Z=0)
pp[1, 2, 0] = 0.1   # P(X=2, Y=3, Z=0)
pp[1, 3, 1] = 0.05  # P(X=2, Y=4, Z=1)
pp[2, 0, 0] = 0.1   # P(X=3, Y=1, Z=0)
pp[2, 2, 1] = 0.05  # P(X=3, Y=3, Z=1)
pp[2, 3, 0] = 0.05  # P(X=3, Y=4, Z=0)

# Normalize
# The values listed above add up to 0.8, so they are rescaled to sum to 1
pp = normalise(pp)

# Desired probability
# NOTE(review): inside an f-string "{{" prints one literal "{", so the
# quadruple braces below print doubled braces "{{ ... }}" in the output.
print(f"The hypothesis H = {{{{(X, Y, Z) : X = 1, Y = 2, Z = 1}}}} has probability: {pp[0, 1, 1] * 100}%")

The hypothesis H = {{(X, Y, Z) : X = 1, Y = 2, Z = 1}} has probability: 12.5%


# Exercise 7

We have that:
- A total of 1% of the population has the disease.
- Hence, the probability that the person has the disease given that they have tested positive is given by Bayes's rule as (where $D$ is having the disease and $T$ is testing positive):
    $$
        P(D \mid T) = \frac{P(D \cap T)}{P(T)}
    $$

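- We can now calculate the desired probabilities, assuming the test is 90% accurate for both sick and healthy people, i.e. $P(T \mid D) = 0.9$ and $P(T \mid \bar D) = 0.1$:
    - $P(D \cap T) = P(D) \cdot P(T \mid D) = 0.01 \cdot 0.9 = 0.009 = 0.9\%$. Note that $D$ and $T$ are not independent (the outcome of the test depends on whether the person has the disease), so the conditional probability $P(T \mid D)$ must be used rather than $P(T)$.
    - $P(T) = P(D \cap T) + P(\bar D \cap T) = 0.009 + 0.99 \cdot 0.1 = 0.108 = 10.8\%$

Therefore, the individual has a probability of only $0.009 / 0.108 \approx 8.3\%$ of being infected, despite the positive test!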

# Exercise 8

We have that:
- Using Bayes's rule:
    $$
        P(X = 0 \mid Y = 1) = \frac{P(X = 0 \cap Y = 1)}{P(Y = 1)}
    $$

- Using the BSC model we have that:
    - $P(X = 0 \cap Y = 1) = p \cdot e$ because it only happens if we send a zero and there is a bit-flip.
    - $P(Y = 1) = P(X = 0 \cap Y = 1) + P(X = 1 \cap Y = 1) = p \cdot e + (1-p)\cdot(1-e)$, as we can receive a one either if a zero is sent and there is a bit flip, or if a one is sent with no bit flip.

Hence the desired probability is:
$$
    P(X = 0 \mid Y = 1) = \frac{p \cdot e}{p \cdot e + (1-p)\cdot(1-e)}
$$

# Exercise 9 and Exercise 10

Already solved in the statement.

# Exercise 11

It's impossible that both balls are blue as there is only one blue ball in the box and the first ball is taken without replacement.

# Exercise 12*

Will do later!

# Exercise 13

We have that:
- The maximum likelihood estimate for X given that we have only observed X = 2 is that X = 2 with probability 1.
- The Bayesian estimate is obtained using Bayes's theorem:
    $$
        P(X = x \mid X_\text{observed} = 2) = \frac{P(X_\text{observed} = 2\mid X = x)P(X = x)}{P(X_\text{observed} = 2)}
    $$

    We have that:
    - $P(X_\text{observed} = 2\mid X = x) = \delta_{x, 2}$ ignoring measurement errors
    - $P(X = x)$ is given by the priors
    

    Hence the Bayesian estimate is the same, that is, X = 2 with probability 1 (because $P(X_\text{observed} = 2) = P(X = 2)$, as our knowledge of the system is embedded in our prior probability):
    $$
    \begin{align*}
        P(X = x \mid X_\text{observed} = 2) &= \frac{\delta_{x, 2}P(X = x)}{P(X_\text{observed} = 2)}
        \\\\

        &= \frac{\delta_{x, 2}P(X = 2)}{P(X_\text{observed} = 2)}
        \\\\

        &= \delta_{x, 2}
    \end{align*}
    $$

These methods are only informative if we have more data: with a single observation of the random variable, the estimator just collapses to that value.

# Exercise 14 and Exercise 15

Will do later!

&