# Project 1

In [27]:
from __future__ import print_function
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
import sys
import networkx as nx
import pandas
from math import lgamma

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
smallDf = pandas.read_csv('small.csv')
smallDf

Unnamed: 0,age,portembarked,fare,numparentschildren,passengerclass,sex,numsiblings,survived
0,1,1,1,1,1,1,1,1
1,2,2,1,1,2,2,1,2
2,1,1,1,1,1,2,1,2
3,2,1,1,1,2,2,1,2
4,2,1,1,1,1,1,1,1
5,2,3,1,1,1,1,1,1
6,3,1,1,1,2,1,1,1
7,1,1,1,1,1,1,2,1
8,2,1,1,2,1,2,1,2
9,1,2,1,1,3,2,1,2


In [29]:
list(smallDf)

['age',
 'portembarked',
 'fare',
 'numparentschildren',
 'passengerclass',
 'sex',
 'numsiblings',
 'survived']

In [106]:
#smallDf[['age', 'numsiblings']].groupby()
groups = smallDf.groupby(['age', 'portembarked']).size().reset_index(name='counts')
log0 = groups['portembarked'] == 1
log1 = groups['age'] == 1
groups[log0 & log1].counts.values[0]

248

In [129]:
print(smallDf['age'].unique())
#print(G.pred['age'])
G = nx.DiGraph()
G.add_node('age', inst=3, inst_vals=[1,2,3])
G.add_node('fare', inst=3, inst_vals=[1,2,3])
G.add_node('sex', inst=3, inst_vals=[1,2,3])
G.add_node('numparentschildren', inst=3, inst_vals=[1,2,3])
G.add_node('portembarked', inst=3, inst_vals=[1,2,3])
G.add_node('numsiblings', inst=3, inst_vals=[1,2,3])


G.add_edge('age','fare')
G.add_edge('age','portembarked')
G.add_edge('numparentschildren','age')
G.add_edge('numsiblings','age')
print(G.node['age']['inst'])
[G.node[parent]['inst'] for parent in G.predecessors('age')]
G.nodes(data=True)['age']['inst']

[1 2 3]
3


3

In [145]:
def initialize(df):
    headers = list(df)
    G = nx.DiGraph()
    for header in headers:
        inst_vals = df[header].unique()
        instantiations = np.max(inst_vals)
        G.add_node(header, inst=instantiations, instval=inst_vals)
    return G

def bayesian_score(g, df):
    nodes = G.nodes(data=True)
    num_nodes = len(nodes) # for alpha_ijk = 1
    score = 0
    for n,d in nodes:
        #parents = [G.node[parent]['inst_vals'] for parent in G.predecessors(n)]
        ## max(q_i, 1) to cover for |Pa| = 0
        #q_i = np.max(np.prod([G.node[parent]['inst'] for parent in G.predecessors(n)]), 1)
        #r_i = d['inst']
        node_score = 0
        for parent in G.predecessors(n):
            parent_score = 0
            parent_child = df.groupby([n, parent]).size().reset_index(name='counts')
            for pval in G.node[parent]['inst_vals']:
                log_p = parent_child[parent] == pval
                m_ij = 0
                child_score = 0
                for cval in d['inst_vals']:
                    log_c = parent_child[n] == cval
                    counts = parent_child[log_p & log_c].counts
                    if (len(counts) <= 0):
                        continue
                    m_ijk = counts.values[0]
                    m_ij += m_ijk
                    child_score += lgamma(1 + m_ijk) - lgamma(1)
                q_i_score = lgamma(num_nodes) - lgamma(num_nodes + m_ij) + child_score
                parent_score += q_i_score
            node_score += parent_score
        score += node_score

    return score

def k2(init_g, df, iters):
    g = init_g.copy()
    best_score = bayesian_score(g, df)
    for i in range(iters):
        for n in G.nodes():
            for p in G.nodes():
                # try removing edge
                if (g.has_edge(n,p)):
                    best_score = test_fitness(g.remove_edge, g.add_edge, n, p, g, df, best_score)
                    # if removing wasn't better, try reversing
                    if (g.has_edge(n,p)):
                        best_score = test_fitness(reverse_edge(g), reverse_edge(g), n, p, g, df, best_score)
                else: # try adding edge
                    best_score = test_fitness(g.add_edge, g.remove_edge, n, p, g, df, best_score)

#                 if (g.has_edge(n,p)):
#                     best_score = test_fitness(g.remove_edge, n, p, g, df)
#                 else:
#                     g.add_edge(n, p)
#                     score = bayesian_score(g, df)
#                     if (score > best_score and len(find_cycle(g)) == 0):
#                         best_score = score
#                     else:
#                         g.remove_edge(n, p)
                    
    return g, best_score



def test_fitness(fn, fnrev, n, p, g, df, best_score):
    fn(n, p)
    score = bayesian_score(g, df)

    if (score < best_score or has_cycle(g)):
        fnrev(n, p)
        score = best_score
    return score
    
    # loop through nodes
        # add parent to node
        # if cycle detected, abort move
        # calculate value. if better, then keep; else continue
    
def reverse_edge(g):
    def rev_with_g(n, p):
        g.remove_edge(n, p)
        g.add_edge(p, n)
    return rev_with_g


def has_cycle(g):
    cycle = False
    try:
        cycle = nx.algorithms.cycles.find_cycle(g)
    except:
        pass
    return cycle

In [148]:
bayesian_score(G, smallDf)
g, best_score = k2(G, smallDf, 3)

In [149]:
print(best_score)

-2412.920848340419


In [2]:
def write_gph(dag, idx2names, filename):
    with open(filename, 'w') as f:
        for edge in dag.edges():
            f.write("{}, {}\n".format(idx2names[edge[0]], idx2names[edge[1]]))


def compute(infile, outfile):
    
    
def main():
    if len(sys.argv) != 3:
        raise Exception("usage: python project1.py <infile>.csv <outfile>.gph")

    inputfilename = sys.argv[1]
    outputfilename = sys.argv[2]
    compute(inputfilename, outputfilename)

if __name__ == '__main__':
    main()

hi


# Load

In [40]:
x = np.loadtxt('logistic_x.txt')
y = np.loadtxt('logistic_y.txt')

print("x:", x.shape)
print("y:", y.shape)

x: (99, 2)
y: (99,)
