In [2]:
from __future__ import division
from __future__ import print_function
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
import pandas as pd
import numpy  as np

In [3]:
data_dir = "./small-10-datasets/"
data_titles = ['accidents', 'baudio', 'bnetflix', 'dna', 'jester', 'kdd', 'msnbc',
              'nltcs', 'plants', 'r52']
test  = dict()
train = dict()
valid = dict()

for title in data_titles:
    test[title]  = np.loadtxt(data_dir + title + '.test.data', delimiter=',')
    train[title] = np.loadtxt(data_dir + title + '.ts.data', delimiter=',')
    valid[title] = np.loadtxt(data_dir + title + '.valid.data', delimiter=',')

In [351]:
# Bayesian Network, No edges Algorithm
# Takes in a dataset of binary variables and takes a test set of binary variables.
def BN_NE(T, test):
    cols = np.transpose(T)
    n    = len(cols)
    p_1 = np.array([(sum(cols[i] == 1) + 1) / (len(cols[i]) + 2) for i in range(n)])
    return np.array([[p_1[+i] if ti == 0 else 1 - p_1[i] for i, ti in enumerate(t)] for t in test])
    
# Log Likelihood
def LL(p):
    return np.sum(np.log(p))

def AVG_LL(P):
    return sum([LL(p) for p in P]) / len(P)

In [352]:
AVG_LL(BN_NE(train['accidents'], test['accidents']))

-210.09122926066948

In [349]:
# Assume node 0 is root
def Create_Network(MST):
    n = len(MST)
    network = [[] for i in range(n)]
    for i in range(1,n):
        addedNode = False
        for j in range(1,n):
            if (MST[j][i] != 0):
                network[i].append(j)
                addedNode = True
        if (not addedNode):
            network[i].append(0)
    return network;

# O(n^2 * size of data) when computing PxGy.
def CL(T):
    d = np.transpose(T)
    n = len(d)
    P = [0 for i in range(2)]
    P[1] = np.array([(sum(X == 1) + 1) / (len(X) + n) for X in d])
    P[0] = np.array([1 - p for p in p_1])
    PxGy = [[0 for i in range(2)] for j in range(2)]
    PxGy[1][1] = np.array([[(sum(X[Y == 1] == 1) + 1) / (sum(Y == 1) + 2) for X in d] for Y in d])
    PxGy[0][1] = np.array([[1 - p for p in X] for X in PxGy[1][1]])
    PxGy[1][0] = np.array([[(sum(X[Y == 0] == 1) + 1) / (sum(Y == 0) + 2) for X in d] for Y in d])
    PxGy[0][0] = np.array([[1 - p for p in X] for X in PxGy[1][0]])
    Ixy = [[PxGy[x][y] * P[y] * np.log2(PxGy[x][y] / P[x]) for x in range(2)] for y in range(2)]
    M = Ixy[0][0] + Ixy[0][1] + Ixy[1][0] + Ixy[1][1]
    MST = minimum_spanning_tree(M * np.array(-1)).toarray()
    MST = np.array([[0 if x == 0 else x * -1 for x in X] for X in MST])
    return Create_Network(MST)
    
def BN_CL(T, t):
    CLT = CL(T)
    d = np.transpose(T)
    n = len(d)
    def Compute_Probability(i, parents, test):
        return np.product([(sum(d[i][d[p] == test] == test) + 1) 
                           / (sum(d[p] == test) + 2) for p in parents])
    return [[Compute_Probability(i, CLT[i], test[i]) for i in range(n)] for test in t]

In [None]:
AVG_LL(BN_CL(train['accidents'], test['accidents']))