In [3]:
from __future__ import division
from __future__ import print_function
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
import numpy as N, networkx as nx
from collections import defaultdict
import pandas as pd
import numpy  as np
import random

In [4]:
# BELOW CODE WAS IMPLEMENTED BY John Reid in pybool package
# https://github.com/JohnReid/pybool/blob/master/python/pybool/chow_liu_trees.py
# The code has been pasted here because csgrads1 was having issues installing pybool
# Add one laplace smoothing has also been included.

def marginal_distribution(X, u):
    """
    Return the marginal distribution for the u'th features of the data points, X.
    """
    values = defaultdict(float)
    s = 1. / len(X)
    for x in X:
        values[x[u]] += s
    return values



def marginal_pair_distribution(X, u, v):
    """
    Return the marginal distribution for the u'th and v'th features of the data points, X.
    """
    if u > v:
        u, v = v, u
    values = defaultdict(float)
    s = 1. / len(X)
    for x in X:
        values[(x[u], x[v])] += s
    return values



def calculate_mutual_information(X, u, v):
    """
    X are the data points.
    u and v are the indices of the features to calculate the mutual information for.
    """
    if u > v:
        u, v = v, u
    marginal_u = marginal_distribution(X, u)
    marginal_v = marginal_distribution(X, v)
    marginal_uv = marginal_pair_distribution(X, u, v)
    I = 0.
    for x_u, p_x_u in marginal_u.iteritems():
        for x_v, p_x_v in marginal_v.iteritems():
            if (x_u, x_v) in marginal_uv:
                p_x_uv = marginal_uv[(x_u, x_v)]
                I += p_x_uv * (N.log(p_x_uv) - N.log(p_x_u) - N.log(p_x_v))
    return I


def build_chow_liu_tree(X, n):
    """
    Build a Chow-Liu tree from the data, X. n is the number of features. The weight on each edge is
    the negative of the mutual information between those features. The tree is returned as a networkx
    object.
    """
    G = nx.Graph()
    for v in xrange(n):
        G.add_node(v)
        for u in xrange(v):
            G.add_edge(u, v, weight=-calculate_mutual_information(X, u, v))
    T = nx.minimum_spanning_tree(G)
    return T

In [5]:
data_dir = "./small-10-datasets/"
data_titles = ['accidents', 'baudio', 'bnetflix', 'dna', 'jester', 'kdd', 'msnbc',
              'nltcs', 'plants', 'r52']
test  = dict()
train = dict()
valid = dict()

for title in data_titles:
    test[title]  = np.loadtxt(data_dir + title + '.test.data', delimiter=',')
    train[title] = np.loadtxt(data_dir + title + '.ts.data', delimiter=',')
    valid[title] = np.loadtxt(data_dir + title + '.valid.data', delimiter=',')

In [7]:
# Log Likelihood
def LL(p):
    return np.sum(np.log2(p))

def AVG_LL(P):
    return sum([LL(p) for p in P]) / len(P)

# Pick random node for root
def Create_Network(MST):
    n = len(MST)
    network = [[] for i in range(n)]
    root = 0
    for i in range(0,n):
        addedNode = False
        for j in range(0,n):
            if (MST[j][i] != root):
                network[i].append(j)
                addedNode = True
        if (not addedNode or i == root):
            network[i] = [root]
    return network;

# n = # of columns
def Generate_Network(n):
    G = nx.from_numpy_array(np.array([[random.random() for i in range(n)] for j in range(n)]))
    G = nx.to_numpy_array(nx.minimum_spanning_tree(G))
    return Create_Network(G)
    
    
# Predictions with 1 laplace smoothing
def Predict_With_Network(T, Tree, t):
    d = np.transpose(T)
    n = len(d)
    p1G1 = [[0 for i in range(n)] for j in range(n)]
    p1G0 = [[0 for i in range(n)] for j in range(n)]
    for i in range(n):
        for a in CLT[i]:
            p1G1[i][a] = (sum(d[i][d[a] == 1] == 1) + 1) / (sum(d[a] == 1) + 2)
            p1G0[i][a] = (sum(d[i][d[a] == 0] == 1) + 1) / (sum(d[a] == 0) + 2)
    def Compute_Probability(i, parents):
        probs = [(p1G1[i][p] if test[i] == 1 else 1 - p1G1[i][p]) if test[p] == 1 
                 else (p1G0[i][p] if test[i] == 1 else 1 - p1G0[i][p]) for p in parents]
        return np.product(probs)
    return [[Compute_Probability(i, CLT[i]) for i in range(n)] for test in t]

def Split_Tree(T, k):
    split = [i for i in range(0, len(T), int(len(T) / k))]
    return [T[range(split[i],len(T) if i == k -1 else split[i+1])] for i in range(k)]

def BN_CL(T):
    n = len(T[0])
    CLT = Create_Network(np.array(nx.adjacency_matrix(build_chow_liu_tree(T, n)).todense()))
    return CLT

# Bayesian Network, No edges Algorithm
# Takes in a dataset of binary variables and takes a test set of binary variables.
def BN_NE(T, test):
    cols = np.transpose(T)
    n    = len(cols)
    p_1 = np.array([(sum(cols[i] == 1) + 1) / (len(cols[i]) + 2) for i in range(n)])
    return np.array([[p_1[+i] if ti == 0 else 1 - p_1[i] for i, ti in enumerate(t)] for t in test])

# Mixtures of Tree Bayesian networks using EM
def MT_BN(T, k):
    valid = T[range(0, int(len(T) / 10))]
    T     = T[range(int(len(T) / 10), len(T))]
    T     = Split_Tree(T, k)
    tree = [BN_CL(t) for t in T]
    return tree


def EM():
    

IndentationError: expected an indented block (<ipython-input-7-5ffc060c2b0b>, line 72)

In [None]:
AVG_LL(BN_NE(train['accidents'], test['accidents']))

In [None]:
AVG_LL(Predict_With_Network(train['accidents'], BN_CL(train['accidents']), test['accidents']))

In [76]:
MT_BN(train['accidents'][0:100], 5)

[[[0],
  [4],
  [4],
  [4],
  [0, 1, 2, 3, 50, 59, 94],
  [26, 86],
  [11],
  [13],
  [20],
  [37, 54, 58, 82],
  [15, 44, 57, 64],
  [6, 35],
  [28, 50, 87],
  [7, 46, 52, 56, 75],
  [25, 50],
  [10, 34, 35],
  [39, 72],
  [21, 31],
  [21, 32],
  [43, 53, 54, 62, 83],
  [8, 21, 40],
  [17, 18, 20, 29, 39],
  [29],
  [24, 33, 45],
  [23],
  [14, 34],
  [5],
  [49, 51, 60],
  [12, 30],
  [21, 22],
  [28, 32, 79],
  [17],
  [18, 30, 65],
  [23, 36, 70],
  [15, 25, 83],
  [11, 15],
  [33, 61],
  [9],
  [72],
  [16, 21],
  [20, 55],
  [73, 81, 87],
  [71, 96],
  [19],
  [10, 75],
  [23, 104],
  [13],
  [50],
  [79],
  [27, 54],
  [4, 12, 14, 47],
  [27, 68],
  [13],
  [19, 89],
  [9, 19, 49],
  [40],
  [13, 84, 91, 98],
  [10],
  [9],
  [4],
  [27, 78],
  [36, 80],
  [19],
  [79, 90],
  [10, 67, 96],
  [32],
  [80],
  [64],
  [51, 100],
  [71, 77],
  [33, 79],
  [42, 69],
  [16, 38],
  [41],
  [77],
  [13, 44],
  [77],
  [69, 74, 76],
  [60, 85, 95],
  [30, 48, 63, 70, 86, 93],
  [61, 66],