# Chapter 8 exercises

In [1]:
import pandas as pd
import numpy as np
import itertools as it

In [84]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def color_print(string, bcolor=bcolors.WARNING):
    print(bcolor + string + bcolors.ENDC)

## 8.3

Consider three binary variables $a, b, c \in \{0, 1\}$ having the joint distribution given by the table below. Show by direct evaluation that this distribution has the property that $a$ and $b$s are marginally dependent, so that $p(a,b) \neq p(a)p(b)$, but that they become independent when conditioned on $c$ so that $p(a,b|c) = p(a|c)p(b|c)$ fo both $c=0$ and $c=1$

In [2]:
p = np.array([
    [0, 0, 0, 0.192],
    [0, 0, 1, 0.144],
    [0, 1, 0, 0.048],
    [0, 1, 1, 0.216],
    [1, 0, 0, 0.192],
    [1, 0, 1, 0.064],
    [1, 1, 0, 0.048],
    [1, 1, 1, 0.096],
])

p = pd.DataFrame(p, columns=["a", "b", "c", "p_abc"])
p = p.set_index(["a", "b", "c"])

#### Factorization without conditioning

In [3]:
# p(a, b)
p.xs(0, level=2) + p.xs(1, level=2)

Unnamed: 0_level_0,Unnamed: 1_level_0,p_abc
a,b,Unnamed: 2_level_1
0.0,0.0,0.336
0.0,1.0,0.264
1.0,0.0,0.256
1.0,1.0,0.144


In [4]:
# p(a)p(b)
p_marg = p.xs(0, level=2) + p.xs(1, level=2)
p_a = p_marg.xs(0, level=-1) + p_marg.xs(1, level=-1)
p_b = p_marg.xs(0, level=0) + p_marg.xs(1, level=0)

p_a["key"] = 1
p_b["key"] = 1

p_marg = pd.merge(p_a.reset_index(), p_b.reset_index(), on="key",
         suffixes=("_a", "_b")).drop("key", axis=1)

p_marg.set_index(["a", "b"]).prod(axis=1)

a    b  
0.0  0.0    0.3552
     1.0    0.2448
1.0  0.0    0.2368
     1.0    0.1632
dtype: float64

#### Factorization with conditioning

In [5]:
# p(a,b|c=0)
p_marg = p.xs(0, level=-1)
p_marg = p_marg / p_marg.sum()
p_marg

Unnamed: 0_level_0,Unnamed: 1_level_0,p_abc
a,b,Unnamed: 2_level_1
0.0,0.0,0.4
0.0,1.0,0.1
1.0,0.0,0.4
1.0,1.0,0.1


In [6]:
# p(a|c=0)p(b|c=0)
p_a = p_marg.sum(level=0)
p_b = p_marg.sum(level=1)

p_a = p_a.reset_index().assign(key=1)
p_b = p_b.reset_index().assign(key=1)

p_fact = (pd.merge(p_a, p_b, on="key", suffixes=("_a", "_b"))
            .drop("key", axis=1)
            .set_index(["a", "b"]))

p_fact.prod(axis=1)

a    b  
0.0  0.0    0.4
     1.0    0.1
1.0  0.0    0.4
     1.0    0.1
dtype: float64

## 8.4

Evaluate the distributions $p(a)$, $p(b | c)$, and $p(c | a)$ corresponding to the joint distribution given in Table 8.2. Hence show by direct evaluation that $p(a, b, c) = p(a)p(c | a)p(b | c)$. Draw the corresponding directed graph.

In [7]:
p_a = p.sum(level=0)

In [8]:
a = 1
p_a["key"] = [1,2]
p_a

Unnamed: 0_level_0,p_abc,key
a,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.6,1
1.0,0.4,2


In [9]:
# p(c|a)
p_c_giv_a = p.sum(level=["a", "c"]) / p.sum(level=["a", "c"]).sum(level=0)
p_c_giv_a["key"] = [1, 1, 2, 2]
p_c_giv_a

Unnamed: 0_level_0,Unnamed: 1_level_0,p_abc,key
a,c,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.0,0.4,1
0.0,1.0,0.6,1
1.0,0.0,0.6,2
1.0,1.0,0.4,2


In [10]:
# p(b|c)
p_b_giv_c = p.sum(level=["b", "c"]) / p.sum(level=["b", "c"]).sum(level=1)
p_b_giv_c["key"] = [1, 2, 1, 2]
p_b_giv_c

Unnamed: 0_level_0,Unnamed: 1_level_0,p_abc,key
b,c,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.0,0.8,1
0.0,1.0,0.4,2
1.0,0.0,0.2,1
1.0,1.0,0.6,2


In [11]:
fact = p_a.merge(p_c_giv_a.reset_index(), on="key")
fact = fact.drop(["key"], axis=1).set_index(["a", "c"])
fact = fact.prod(axis=1)
fact = pd.DataFrame(fact, columns=["p"])
fact["key"] = [1, 2, 1, 2]

fact = fact.reset_index().merge(p_b_giv_c.reset_index(), on="key")
fact = fact.rename({"c_x": "c"}, axis=1).drop(["key", "c_y"], axis=1)
fact = fact.groupby(["a", "c", "b"]).sum().prod(axis=1)

fact.name="p_factorized"

Final Answer:

Contains a bug: c-level is swapped

In [338]:
p.join(fact, on=["a", "b", "c"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,p_abc,p_factorized
a,b,c,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.0,0.0,0.192,0.192
0.0,0.0,1.0,0.144,0.048
0.0,1.0,0.0,0.048,0.144
0.0,1.0,1.0,0.216,0.216
1.0,0.0,0.0,0.192,0.192
1.0,0.0,1.0,0.064,0.048
1.0,1.0,0.0,0.048,0.064
1.0,1.0,1.0,0.096,0.096


## 8.8
Consider the graphical model represented by the *noist*-OR
$$
    p(y=1|x_1, \ldots, x_M) = 1 - (1 - \mu_0)\prod_{n=1}^N(1 - \mu_i)^{x_i}
$$

Show that $p$ can be interpreted as a "soft" (probabilisitc) form of the logical OR function (i.e, the function that gives $y=1$ whenever at least on the $x_i=1$). Discuss the interpretation of $\mu_0$

In [62]:
# ** Considering two parameters plus the null term: mu_0, mu_1, mu_2 **
mu_0, mu_1, mu_2 = 0, 0, 0
# We consider x1 and x2 as variables to consider inside the OR
for mu_1, mu_2 in it.product([0, 1], repeat=2):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2
    print(f"p(y|mu_1={mu_1}, mu_2={mu_2})={p_y}")

p(y|mu_1=0, mu_2=0)=0
p(y|mu_1=0, mu_2=1)=1
p(y|mu_1=1, mu_2=0)=1
p(y|mu_1=1, mu_2=1)=1


In [70]:
# ** Considering four parameters plus the null term: mu_0, mu_1, mu_2, mu_3, mu_4 **

x1, x2, x3, x4 = 1, 1, 1, 1
# We consider x1 and x2 as variables to consider inside the OR
for mu_1, mu_2, mu_3, mu_4 in it.product([0, 1], repeat=4):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2 * (1 - mu_3) ** x3 * (1 - mu_4) ** x4
    print(f"p(y|mu_1={mu_1}, mu_2={mu_2}, mu_3={mu_3}, mu_4={mu_4})={p_y}")

p(y|mu_1=0, mu_2=0, mu_3=0, mu_4=0)=0
p(y|mu_1=0, mu_2=0, mu_3=0, mu_4=1)=1
p(y|mu_1=0, mu_2=0, mu_3=1, mu_4=0)=1
p(y|mu_1=0, mu_2=0, mu_3=1, mu_4=1)=1
p(y|mu_1=0, mu_2=1, mu_3=0, mu_4=0)=1
p(y|mu_1=0, mu_2=1, mu_3=0, mu_4=1)=1
p(y|mu_1=0, mu_2=1, mu_3=1, mu_4=0)=1
p(y|mu_1=0, mu_2=1, mu_3=1, mu_4=1)=1
p(y|mu_1=1, mu_2=0, mu_3=0, mu_4=0)=1
p(y|mu_1=1, mu_2=0, mu_3=0, mu_4=1)=1
p(y|mu_1=1, mu_2=0, mu_3=1, mu_4=0)=1
p(y|mu_1=1, mu_2=0, mu_3=1, mu_4=1)=1
p(y|mu_1=1, mu_2=1, mu_3=0, mu_4=0)=1
p(y|mu_1=1, mu_2=1, mu_3=0, mu_4=1)=1
p(y|mu_1=1, mu_2=1, mu_3=1, mu_4=0)=1
p(y|mu_1=1, mu_2=1, mu_3=1, mu_4=1)=1


First remarks: 

* The $\{x_n\}_{n=1}^N$ elements control for which variables are we interested in computing the *soft*-OR fuction
* If the model considers $\forall n\geq 1. \mu_n\in\{0,1\}$ and $\mu_0 = 0$, then the model corresponds to the *hard* OR function

In [93]:
# In this example, we "turn off" x1 and x3, meaning that the OR
# function is computing using only x2 and x4 

x1, x2, x3, x4 = 0, 1, 0, 1
# We consider x1 and x2 as variables to consider inside the OR
for mu_1, mu_2, mu_3, mu_4 in it.product([0, 1], repeat=4):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2 * (1 - mu_3) ** x3 * (1 - mu_4) ** x4
    p_y_str = f"p(y|mu_1={mu_1}, mu_2={mu_2}, mu_3={mu_3}, mu_4={mu_4})={p_y}"
    if (mu_2 == 0 or mu_3 == 0) and p_y == 0:
        color_print(p_y_str)
    else:
        print(p_y_str)

[93mp(y|mu_1=0, mu_2=0, mu_3=0, mu_4=0)=0[0m
p(y|mu_1=0, mu_2=0, mu_3=0, mu_4=1)=1
[93mp(y|mu_1=0, mu_2=0, mu_3=1, mu_4=0)=0[0m
p(y|mu_1=0, mu_2=0, mu_3=1, mu_4=1)=1
p(y|mu_1=0, mu_2=1, mu_3=0, mu_4=0)=1
p(y|mu_1=0, mu_2=1, mu_3=0, mu_4=1)=1
p(y|mu_1=0, mu_2=1, mu_3=1, mu_4=0)=1
p(y|mu_1=0, mu_2=1, mu_3=1, mu_4=1)=1
[93mp(y|mu_1=1, mu_2=0, mu_3=0, mu_4=0)=0[0m
p(y|mu_1=1, mu_2=0, mu_3=0, mu_4=1)=1
[93mp(y|mu_1=1, mu_2=0, mu_3=1, mu_4=0)=0[0m
p(y|mu_1=1, mu_2=0, mu_3=1, mu_4=1)=1
p(y|mu_1=1, mu_2=1, mu_3=0, mu_4=0)=1
p(y|mu_1=1, mu_2=1, mu_3=0, mu_4=1)=1
p(y|mu_1=1, mu_2=1, mu_3=1, mu_4=0)=1
p(y|mu_1=1, mu_2=1, mu_3=1, mu_4=1)=1


**The more general case**

$\mu_0$ controls the level of the *null* term: the bigger it is, the more the null terms (0) converge to 1

In [146]:
x1, x2, x3 = 0, 1, 0

mu_0 = 0.45
# We consider x1 and x2 as variables to consider inside the OR
for mu_1, mu_2, mu_3 in it.product([0, 1], repeat=3):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2 * (1 - mu_3) ** x3
    p_y_str = f"p(y|mu_1={mu_1}, mu_2={mu_2}, mu_3={mu_3}, mu_4={mu_4})={p_y:.2}"
    print(p_y_str)

p(y|mu_1=0, mu_2=0, mu_3=0, mu_4=1)=0.45
p(y|mu_1=0, mu_2=0, mu_3=1, mu_4=1)=0.45
p(y|mu_1=0, mu_2=1, mu_3=0, mu_4=1)=1.0
p(y|mu_1=0, mu_2=1, mu_3=1, mu_4=1)=1.0
p(y|mu_1=1, mu_2=0, mu_3=0, mu_4=1)=0.45
p(y|mu_1=1, mu_2=0, mu_3=1, mu_4=1)=0.45
p(y|mu_1=1, mu_2=1, mu_3=0, mu_4=1)=1.0
p(y|mu_1=1, mu_2=1, mu_3=1, mu_4=1)=1.0


$\forall n\geq 1.\mu_n$ controls the value of the positive terms

In [147]:

mu_1, mu_2, mu_3 = 0.3, 0.2, 0.5
mu_0 = 0
# We consider x1 and x2 as variables to consider inside the OR
for x1, x2, x3 in it.product([0, 1], repeat=3):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2 * (1 - mu_3) ** x3
    p_y_str = f"p(y|x1={x1}, x2={x2}, x3={x3})={p_y:.2f}"
    print(p_y_str)

p(y|x1=0, x2=0, x3=0)=0.00
p(y|x1=0, x2=0, x3=1)=0.50
p(y|x1=0, x2=1, x3=0)=0.20
p(y|x1=0, x2=1, x3=1)=0.60
p(y|x1=1, x2=0, x3=0)=0.30
p(y|x1=1, x2=0, x3=1)=0.65
p(y|x1=1, x2=1, x3=0)=0.44
p(y|x1=1, x2=1, x3=1)=0.72
