# Influence function - standard error

In [3]:
# Import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import time
import numpy as np
from numpy import random
import math
import scipy
from scipy import stats
from random import choice
import networkx as nx
import json
from networkx.readwrite import json_graph
import line_profiler
import IPython
# Register line_profiler's `magic_lprun` under the magic name 'lprun' on the
# running IPython shell.
ip = IPython.get_ipython()
ip.define_magic('lprun', line_profiler.magic_lprun)

In [4]:
import findspark
import os
# findspark.init() runs before the pyspark import so that pyspark can be located.
findspark.init()
import pyspark
# Reuse the live SparkContext when this cell is re-run; calling the
# SparkContext() constructor a second time in the same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate()

In [23]:
# Load network graph: parse the node-link JSON first, then build the graph
# once the file handle has been closed.
with open("graph/nc_full.json", "r") as graph_file:
    graph_data = json.load(graph_file)
NC_digraph = json_graph.node_link_graph(graph_data)

### Calculate variance of simulation results

In [9]:
def influenceFunction(detStartNodes, t):
    """Simulate one random cascade on ``NC_digraph`` and count activated nodes.

    Starting from the seed nodes, the cascade proceeds in rounds. In each
    round every node activated in the previous round gets one attempt at
    activating each of its not-yet-activated successors. An attempt succeeds
    when a Uniform(0, 1) draw is below ``sqrt(Beta(alpha, beta))``, where
    ``alpha`` is the edge's 'weight' attribute and ``beta`` is the successor's
    'review_count' attribute.

    Parameters
    ----------
    detStartNodes : sequence of node ids
        Seed nodes, active from the start. An empty sequence yields 0.
    t : int
        Maximum number of rounds; the cascade stops earlier as soon as a
        round activates no new node.

    Returns
    -------
    int
        Number of distinct activated nodes, seeds included.
    """
    activated = set(detStartNodes)
    # Kept as a list (not the set) so seeds are processed in the given order.
    frontier = list(detStartNodes)

    # A counter replaces `for i in range(t)`: under Python 2, range() with the
    # callers' default t=999999 builds a million-element list on every run.
    step = 0
    while frontier and step < t:
        newlyActivated = []
        for startNode in frontier:
            for succNode in NC_digraph.succ[startNode]:
                if succNode in activated:
                    continue
                alpha = NC_digraph[startNode][succNode]['weight']
                beta = NC_digraph.node[succNode]['review_count']
                # Draw order (uniform first, then beta) is kept so seeded runs
                # consume the random stream exactly as before.
                if random.uniform(0,1) < np.sqrt(random.beta(alpha, beta)):
                    activated.add(succNode)
                    newlyActivated.append(succNode)
        frontier = newlyActivated
        step += 1

    return len(activated)

In [10]:
def influenceFunctionNotPar(detStartNodes, N, t=999999):
    """Serial Monte Carlo estimate of influence.

    Runs ``influenceFunction(detStartNodes, t)`` N times in this process and
    returns the mean of the activated-node counts (each cast to float).
    """
    spreads = [float(influenceFunction(detStartNodes, t)) for trial in xrange(N)]
    return np.mean(spreads)

In [14]:
def influenceFunctionPar(detStartNodes, N, t=999999):
    """Spark-backed Monte Carlo estimate of influence.

    Distributes N runs of ``influenceFunction(detStartNodes, t)`` over an RDD
    with 4 partitions, collects the counts and returns their mean.
    """
    def simulate(runIndex):
        # The run index only drives the map; each run is an independent cascade.
        return influenceFunction(detStartNodes, t)

    spreads = sc.parallelize(range(N), 4).map(simulate).collect()
    return np.mean(spreads)

In [21]:
def greedySearch(N, numNodes, t=999999):
    """Greedily select up to ``numNodes`` seed nodes with the largest estimated influence.

    In each round every remaining candidate is appended to the seeds chosen so
    far, its influence is estimated with ``influenceFunctionPar`` (mean of N
    simulated cascades, each capped at ``t`` rounds), and the candidate with
    the highest estimate is added to the seed set. Ties keep the candidate
    seen first.

    Parameters
    ----------
    N : int
        Number of simulations per candidate evaluation.
    numNodes : int
        Number of seed nodes to select. Selection stops early if the
        candidate pool runs out.
    t : int, optional
        Maximum number of cascade rounds per simulation.

    Returns
    -------
    (list, number)
        The selected seed nodes in selection order, and the best influence
        estimate found in the last selection round (0 if no round ran).
    """
    # Diagnostic: total number of nodes in the graph.
    print(len(NC_digraph.nodes()))

    # Candidates are the nodes that appear as an endpoint of at least one edge.
    # Flattening with a generator avoids the quadratic sum(edges, ()) tuple build.
    searchNodes = list(set(node for edge in NC_digraph.edges() for node in edge))
    print(len(searchNodes))

    actNodeOpt = []
    # Defined up front so numNodes == 0 returns ([], 0) instead of raising.
    maxActNode = 0

    for s in range(numNodes):
        if not searchNodes:
            # Nothing left to choose from.
            break

        maxActNode = 0
        maxActNodeStart = None

        for count, n in enumerate(searchNodes, 1):
            # Progress indicator for large candidate pools.
            if count % 500 == 0:
                print(count)
            act = influenceFunctionPar(actNodeOpt + [n], N, t)
            # The `is None` test guarantees a pick even when no estimate
            # compares greater than the initial 0 (e.g. N == 0 gives nan).
            if maxActNodeStart is None or act > maxActNode:
                maxActNode = act
                maxActNodeStart = n

        searchNodes.remove(maxActNodeStart)
        actNodeOpt.append(maxActNodeStart)
        print(maxActNodeStart)

    return actNodeOpt, maxActNode

In [None]:
def greedySearch2(k = 3, N = 1000, t = 999999):
    """Greedy seed selection with the candidate scan distributed over Spark.

    In each of up to ``k`` rounds, every graph node is scored in parallel: a
    node already selected scores 0., any other node ``n`` scores the serial
    Monte Carlo estimate ``influenceFunctionNotPar(best_s + [n], N, t)``. The
    highest-scoring node is added to the seed set.

    The serial estimator is used inside the map because the scoring function
    itself runs on Spark workers, where the SparkContext-based
    ``influenceFunctionPar`` cannot be called.

    Parameters
    ----------
    k : int, optional
        Number of seed nodes to select (fewer if the graph has fewer nodes).
    N : int, optional
        Number of simulations per candidate evaluation.
    t : int, optional
        Maximum number of cascade rounds per simulation.

    Returns
    -------
    (list, number)
        The selected seed nodes in selection order, and the influence
        estimate of the last selected set (0 if nothing was selected).
    """
    best_s = []
    max_inf = 0
    allNodes = list(NC_digraph.nodes())
    nodeRDD = sc.parallelize(allNodes)

    for i in range(k):
        # Stop once every node is selected (also covers an empty graph, where
        # reduce() on an empty RDD would raise).
        if len(best_s) >= len(allNodes):
            break

        # Snapshot the current seeds so the shipped closure never observes
        # later mutation of best_s.
        chosen = list(best_s)
        infRDD = nodeRDD.map(
            lambda n: (n, 0.) if n in chosen
            else (n, influenceFunctionNotPar(chosen + [n], N, t)))
        next_s, next_i = infRDD.reduce(lambda a, b: a if a[1] > b[1] else b)
        print(next_s)
        best_s.append(next_s)
        max_inf = next_i

    return best_s, max_inf

In [16]:
# Test influence function for given start nodes
start = ['NzWLMPvbEval0OVg_YDn4g','ts7EG6Zv2zdMDg29nyqGfA','VhI6xyylcAxi0wOy2HOX3w']

print "No cap on t (serial):"
%time print influenceFunctionNotPar(start, 100)

print "t capped at 10 (serial):"
%time print influenceFunctionNotPar(start, 100, 10)

print "t capped at 10 (parallel):"
%time print influenceFunctionNotPar(start, 100, 10)

No cap on t (serial):
44.14
CPU times: user 1.84 s, sys: 184 ms, total: 2.03 s
Wall time: 2.03 s
t capped at 10 (serial):
48.61
CPU times: user 87.5 ms, sys: 703 µs, total: 88.2 ms
Wall time: 89.2 ms
t capped at 10 (parallel):
44.97
CPU times: user 93.1 ms, sys: 656 µs, total: 93.7 ms
Wall time: 93.9 ms


In [22]:
# Test greedy algorithm
# Both calls pass N=1 and numNodes=1: one simulation per candidate and a single
# seed to select, so each reported influence is one random draw, not an average.

print "No cap on t:"
%time print greedySearch(1, 1)

# Third argument is t, the maximum number of cascade rounds per simulation.
print "t capped at 10:"
%time print greedySearch(1, 1, 10)

No cap on t:
240
145
A6bPFcUjuuayRBoyybedDQ
([u'A6bPFcUjuuayRBoyybedDQ'], 50.0)
CPU times: user 6.44 s, sys: 339 ms, total: 6.78 s
Wall time: 16.4 s
t capped at 10:
240
145
ED6n40WmZJm0AvsKkG7iaw
([u'ED6n40WmZJm0AvsKkG7iaw'], 69.0)
CPU times: user 5.37 s, sys: 322 ms, total: 5.7 s
Wall time: 11.9 s
