In [1]:
from __future__ import division
from __future__ import print_function
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
import numpy as N, networkx as nx
from collections import defaultdict
import pandas as pd
import numpy  as np

In [None]:
# BELOW CODE WAS IMPLEMENTED BY John Reid in pybool package
# https://github.com/JohnReid/pybool/blob/master/python/pybool/chow_liu_trees.py
# The code has been pasted here because csgrads1 was having issues installing pybool
# Add one laplace smoothing has also been included.

def marginal_distribution(X, u):
    """
    Return the marginal distribution for the u'th features of the data points, X.
    """
    values = defaultdict(float)
    s = 1. / len(X)
    for x in X:
        values[x[u]] += s
    return values



def marginal_pair_distribution(X, u, v):
    """
    Return the marginal distribution for the u'th and v'th features of the data points, X.
    """
    if u > v:
        u, v = v, u
    values = defaultdict(float)
    s = 1. / len(X)
    for x in X:
        values[(x[u], x[v])] += s
    return values



def calculate_mutual_information(X, u, v):
    """
    X are the data points.
    u and v are the indices of the features to calculate the mutual information for.
    """
    if u > v:
        u, v = v, u
    marginal_u = marginal_distribution(X, u)
    marginal_v = marginal_distribution(X, v)
    marginal_uv = marginal_pair_distribution(X, u, v)
    I = 0.
    for x_u, p_x_u in marginal_u.iteritems():
        for x_v, p_x_v in marginal_v.iteritems():
            if (x_u, x_v) in marginal_uv:
                p_x_uv = marginal_uv[(x_u, x_v)]
                I += p_x_uv * (N.log(p_x_uv) - N.log(p_x_u) - N.log(p_x_v))
    return I


def build_chow_liu_tree(X, n):
    """
    Build a Chow-Liu tree from the data, X. n is the number of features. The weight on each edge is
    the negative of the mutual information between those features. The tree is returned as a networkx
    object.
    """
    G = nx.Graph()
    for v in xrange(n):
        G.add_node(v)
        for u in xrange(v):
            G.add_edge(u, v, weight=-calculate_mutual_information(X, u, v))
    T = nx.minimum_spanning_tree(G)
    return T

In [71]:
data_dir = "./small-10-datasets/"
data_titles = ['accidents', 'baudio', 'bnetflix', 'dna', 'jester', 'kdd', 'msnbc',
              'nltcs', 'plants', 'r52']
test  = dict()
train = dict()
valid = dict()

for title in data_titles:
    test[title]  = np.loadtxt(data_dir + title + '.test.data', delimiter=',')
    train[title] = np.loadtxt(data_dir + title + '.ts.data', delimiter=',')
    valid[title] = np.loadtxt(data_dir + title + '.valid.data', delimiter=',')

In [73]:
# Bayesian Network, No edges Algorithm
# Takes in a dataset of binary variables and takes a test set of binary variables.
def BN_NE(T, test):
    cols = np.transpose(T)
    n    = len(cols)
    p_1 = np.array([(sum(cols[i] == 1) + 1) / (len(cols[i]) + 2) for i in range(n)])
    return np.array([[p_1[+i] if ti == 0 else 1 - p_1[i] for i, ti in enumerate(t)] for t in test])
    
# Log Likelihood
def LL(p):
    return np.sum(np.log(p))

def AVG_LL(P):
    return sum([LL(p) for p in P]) / len(P)

# Assume node 0 is root
def Create_Network(MST):
    n = len(MST)
    network = [[] for i in range(n)]
    for i in range(1,n):
        addedNode = False
        for j in range(1,n):
            if (MST[j][i] != 0):
                network[i].append(j)
                addedNode = True
        if (not addedNode):
            network[i].append(0)
    return network;

def BN_CL(T, t):
    d = np.transpose(T)
    n = len(d)
    CLT = Create_Network(np.array(nx.adjacency_matrix(build_chow_liu_tree(T, n)).todense()))
    def Compute_Probability(i, parents, test):
        return np.product([(sum(d[i][d[p] == test] == test) + 1) / (sum(d[p] == test) + 2) for p in parents])
    return [[Compute_Probability(i, CLT[i], test[i]) for i in range(n)] for test in t]

In [76]:
AVG_LL(BN_NE(train['accidents'], test['accidents']))

-210.09122926066948

In [None]:
AVG_LL(BN_CL(train['accidents'], test['accidents']))