In [100]:
from __future__ import division
from __future__ import print_function
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
import numpy as N, networkx as nx
from collections import defaultdict
import pandas as pd
import numpy  as np
import random

In [99]:
# BELOW CODE WAS IMPLEMENTED BY John Reid in pybool package
# https://github.com/JohnReid/pybool/blob/master/python/pybool/chow_liu_trees.py
# The code has been pasted here because csgrads1 was having issues installing pybool
# Add one laplace smoothing has also been included.

def marginal_distribution(X, u):
    """
    Return the marginal distribution for the u'th features of the data points, X.
    """
    values = defaultdict(float)
    s = 1. / len(X)
    for x in X:
        values[x[u]] += s
    return values



def marginal_pair_distribution(X, u, v):
    """
    Return the marginal distribution for the u'th and v'th features of the data points, X.
    """
    if u > v:
        u, v = v, u
    values = defaultdict(float)
    s = 1. / len(X)
    for x in X:
        values[(x[u], x[v])] += s
    return values



def calculate_mutual_information(X, u, v):
    """
    X are the data points.
    u and v are the indices of the features to calculate the mutual information for.
    """
    if u > v:
        u, v = v, u
    marginal_u = marginal_distribution(X, u)
    marginal_v = marginal_distribution(X, v)
    marginal_uv = marginal_pair_distribution(X, u, v)
    I = 0.
    for x_u, p_x_u in marginal_u.iteritems():
        for x_v, p_x_v in marginal_v.iteritems():
            if (x_u, x_v) in marginal_uv:
                p_x_uv = marginal_uv[(x_u, x_v)]
                I += p_x_uv * (N.log(p_x_uv) - N.log(p_x_u) - N.log(p_x_v))
    return I


def build_chow_liu_tree(X, n):
    """
    Build a Chow-Liu tree from the data, X. n is the number of features. The weight on each edge is
    the negative of the mutual information between those features. The tree is returned as a networkx
    object.
    """
    G = nx.Graph()
    for v in xrange(n):
        G.add_node(v)
        for u in xrange(v):
            G.add_edge(u, v, weight=-calculate_mutual_information(X, u, v))
    T = nx.minimum_spanning_tree(G)
    return T

In [5]:
data_dir = "./small-10-datasets/"
data_titles = ['accidents', 'baudio', 'bnetflix', 'dna', 'jester', 'kdd', 'msnbc',
              'nltcs', 'plants', 'r52']
test  = dict()
train = dict()
valid = dict()

for title in data_titles:
    test[title]  = np.loadtxt(data_dir + title + '.test.data', delimiter=',')
    train[title] = np.loadtxt(data_dir + title + '.ts.data', delimiter=',')
    valid[title] = np.loadtxt(data_dir + title + '.valid.data', delimiter=',')

In [141]:
# Log Likelihood
def LL(p):
    return np.sum(np.log2(p))

def AVG_LL(P):
    return sum([LL(p) for p in P]) / len(P)


# How it should work: Each node should have their respective probabilities from the training set.
# For root case, it should simple hold a table for p(x=1) and p(x=0)
# For the directed edges, there should contain a p(x=1|y=1), p(x=1|y=0) and so on for each variable
# Pick random node for root
def Create_Network(MST, T):
    n = len(MST)
    T = np.transpose(T)
    network = [{} for i in range(n)]
    root = 0
    for i in range(0,n):
        addedNode = False
        for j in range(0,n):
            if (MST[j][i] != 0 and i != j):
                p = [(sum(T[i][T[j] == 1] == 1) + 1) / (sum(T[j] == 1) + 2),
                     (sum(T[i][T[j] == 0] == 1) + 1) / (sum(T[j] == 0) + 2)]
                network[i].update({i : [1 - p[0], p[0], 1 - p[1], p[1]]})
                addedNode = True
        if (not addedNode or i == root):
            p = (sum(T[i] == 1) + 1) / (len(T[i]) + 2)
            network[i].update({i : [1 - p, p]})
    return network;


def Predict_Network(N, test):
    all_predictions = []
    for t in test:
        predictions = []
        for i in range(len(N)):
            probs = []
            for k in N[i].keys():
                if(k == i):
                    probs.append(N[i][k][0] if t[i] == 0 else N[i][k][1])
                else:
                    if (t[k] == 0):
                        probs.append(N[i][k][0] if t[i] == 0 else N[i][k][1])
                    else:
                        probs.append(N[i][k][2] if t[i] == 0 else N[i][k][3])
            predictions.append(np.product(probs))
        all_predictions.append(predictions)
    return all_predictions

# n = # of columns
def Generate_Network(n):
    G = nx.from_numpy_array(np.array([[random.random() for i in range(n)] for j in range(n)]))
    G = nx.to_numpy_array(nx.minimum_spanning_tree(G))
    return Create_Network(G)

def Split_Tree(T, k):
    split = [i for i in range(0, len(T), int(len(T) / k))]
    return [T[range(split[i],len(T) if i == k -1 else split[i+1])] for i in range(k)]

# Bayesian Network, Chow-Liu Algorithm
# Returns a data structure in the form of a Bayesian Network with probabilities pre-computed inside.
def BN_CL(T):
    n = len(T[0])
    CLT = Create_Network(np.array(nx.adjacency_matrix(build_chow_liu_tree(T, n)).todense()), T)
    return CLT

# Bayesian Network, No edges Algorithm
# Takes in a dataset of binary variables and takes a test set of binary variables.
def BN_NE(T, test):
    cols = np.transpose(T)
    n    = len(cols)
    p_1 = np.array([(sum(cols[i] == 1) + 1) / (len(cols[i]) + 2) for i in range(n)])
    return np.array([[p_1[+i] if ti == 0 else 1 - p_1[i] for i, ti in enumerate(t)] for t in test])

# Mixtures of Tree Bayesian networks using EM
def MT_BN(T, k):
    valid = T[range(0, int(len(T) / 10))]
    T     = T[range(int(len(T) / 10), len(T))]
    return EM(T,k)


def EM(T, k):
    T = Split_Tree(T, k)
    r = [random.random() for i in range(k)]
    pi = [-np.log2(i) for i in r]
    Ns = [BN_CL(T[i]) for i in range(k)]
    return pi
    

In [108]:
AVG_LL(BN_NE(train['accidents'], test['accidents']))

-303.09757458863464

In [107]:
AVG_LL(Predict_Network((BN_CL(T=train['accidents'])), test['accidents']))

-186.3189656766254

In [136]:
s = MT_BN(train['accidents'][0:100], 5)

In [148]:
np.exp(s[0]) / np.log(np.sum(np.exp(s)))

5.631221988421369

3.3081687755833475