# Timing tests

In [1]:
# Import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import time
import numpy as np
from numpy import random
import math
import scipy
from scipy import stats
from random import choice
import networkx as nx
import json
from networkx.readwrite import json_graph
import line_profiler
import IPython
ip = IPython.get_ipython()
ip.define_magic('lprun', line_profiler.magic_lprun)

In [2]:
import findspark
import os
findspark.init()
import pyspark
sc = pyspark.SparkContext()

In [3]:
# Load network graph
with open("graph/nc_mini.json", "r") as graph_data_mini:
    graph_data_mini = json.load(graph_data_mini)
    NC_digraph_mini = json_graph.node_link_graph(graph_data_mini)
    
with open("graph/nc_full.json", "r") as graph_data_full:
    graph_data_full = json.load(graph_data_full)
    NC_digraph_full = json_graph.node_link_graph(graph_data_full)

### Functions

In [4]:
def influenceFunction(graph, detStartNodes, t=999999):

    explored = set()
    activated = set()
    nodes = []
    for node in detStartNodes:
        nodes.append(node)
        activated.add(node)
    
    start = detStartNodes[0]
    successors = []

    for i in range(t):
        successors = []
        while len(nodes)>0:
            startNode = nodes[0]
            if startNode not in explored:
                explored.add(startNode)
            for succNode in graph.succ[startNode]:
                if succNode not in activated:
                    alpha = graph[startNode][succNode]['weight']
                    beta = graph.node[succNode]['review_count']
                    if random.uniform(0,1) < np.sqrt(random.beta(alpha, beta)):
                        if succNode not in explored:
                            activated.add(succNode)
                            successors.append(succNode)
            nodes.remove(startNode)
        nodes = successors
        if not nodes:
            break
    return float(len(activated))

In [5]:
def influenceFunctionNotParDetStart(graph, detStartNodes, N, t=999999):
    result = []
    for n in xrange(N):
        result.append(float(influenceFunction(graph, detStartNodes, t)))
    return np.mean(result)

In [6]:
def influenceFunctionParDetStart(graph, detStartNodes, N, t=999999):
    runs = sc.parallelize(range(N), 4)
    results = runs.map(lambda x: influenceFunction(graph, detStartNodes, t))
    return np.mean(results.collect())

In [7]:
def influenceFunctionNotParRandStart(graph, N, t=999999):
    result = []
    for n in xrange(N):
        result.append(float(influenceFunction(graph, [random.choice(graph.nodes(),1)[0]], t)))
    return np.mean(result)

In [8]:
def influenceFunctionParRandStart(graph, N, t=999999):
    runs = sc.parallelize(range(N), 4)
    results = runs.map(lambda x: influenceFunction(graph, [random.choice(graph.nodes(),1)[0]], t))
    return np.mean(results.collect())

In [9]:
def greedySearch(graph, k=3, N=1000, t=999999):
    
    best_s = []
    max_inf = 0
    nodeRDD = sc.parallelize(list(set(list(sum(graph.edges(), ())))), 4)
    
    for i in range(k):
        infRDD = nodeRDD.map(lambda n: (n, 0.) if n in best_s else \
                             (n, influenceFunctionNotParDetStart(graph, best_s + [n], N)))
        next_s, next_i = infRDD.reduce(lambda a,b: a if a[1] > b[1] else b)
#         print next_s
        best_s += [next_s]
        max_inf = next_i
    
    return best_s, max_inf

In [10]:
def maxNodes(graph, k=3, N=1000, t=999999):
    
    nodeRDD = sc.parallelize(list(set(list(sum(graph.edges(), ())))), 4)   
    infRDD = nodeRDD.map(lambda n: (n, influenceFunctionNotParDetStart(graph, [n], N)))
    return infRDD.takeOrdered(k, lambda w: -w[1])

In [22]:
def maxEdges(graph, k=3):
    
    nodeRDD = sc.parallelize(list(set(list(sum(graph.edges(), ())))), 4)   
    infRDD = nodeRDD.map(lambda n: (n, len(graph.succ[n])))
    return infRDD.takeOrdered(k, lambda w: -w[1])

### Timing - NC_mini

In [27]:
NC_digraph_mini.number_of_nodes()

240

In [10]:
# Influence function for given start nodes
start = ['NzWLMPvbEval0OVg_YDn4g','ts7EG6Zv2zdMDg29nyqGfA','VhI6xyylcAxi0wOy2HOX3w']
noRuns = 100

print "No cap on t (serial):"
startTime = time.time()
influenceFunctionNotParDetStart(NC_digraph_mini, start, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (serial):"
startTime = time.time()
influenceFunctionNotParDetStart(NC_digraph_mini, start, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nNo cap on t (parallel):"
startTime = time.time()
influenceFunctionParDetStart(NC_digraph_mini, start, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (parallel):"
startTime = time.time()
influenceFunctionParDetStart(NC_digraph_mini, start, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t (serial):
2.077829 seconds run time

t capped at 10 (serial):
0.094027 seconds run time

No cap on t (parallel):
2.400825 seconds run time

t capped at 10 (parallel):
0.183546 seconds run time


In [11]:
# Influence function for random start nodes
noRuns = 100

print "No cap on t (serial):"
startTime = time.time()
influenceFunctionNotParRandStart(NC_digraph_mini, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (serial):"
startTime = time.time()
influenceFunctionNotParRandStart(NC_digraph_mini, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nno cap on t (parallel):"
startTime = time.time()
influenceFunctionParRandStart(NC_digraph_mini, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (parallel):"
startTime = time.time()
influenceFunctionParRandStart(NC_digraph_mini, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t (serial):
2.369556 seconds run time

t capped at 10 (serial):
0.040825 seconds run time

no cap on t (parallel):
1.660861 seconds run time

t capped at 10 (parallel):
0.199847 seconds run time


In [12]:
# Influence function for random start nodes
noRuns = 1000

print "No cap on t (serial):"
startTime = time.time()
influenceFunctionNotParRandStart(NC_digraph_mini, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (serial):"
startTime = time.time()
influenceFunctionNotParRandStart(NC_digraph_mini, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nno cap on t (parallel):"
startTime = time.time()
influenceFunctionParRandStart(NC_digraph_mini, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (parallel):"
startTime = time.time()
influenceFunctionParRandStart(NC_digraph_mini, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t (serial):
24.234820 seconds run time

t capped at 10 (serial):
0.153250 seconds run time

no cap on t (parallel):
9.927389 seconds run time

t capped at 10 (parallel):
0.208044 seconds run time


In [13]:
# Greedy algorithm
noRuns = 1

print "No cap on t:"
startTime = time.time()
greedySearch(NC_digraph_mini, 3, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10:"
startTime = time.time()
greedySearch(NC_digraph_mini, 3, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t:
4.336949 seconds run time

t capped at 10:
4.325324 seconds run time


In [14]:
# Look up nodes with max # of activations
noRuns = 1

print "No cap on t:"
startTime = time.time()
maxNodes(NC_digraph_mini, 3, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10:"
startTime = time.time()
maxNodes(NC_digraph_mini, 3, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t:
1.640124 seconds run time

t capped at 10:
2.304997 seconds run time


In [25]:
# Look up nodes with max # of edges
startTime = time.time()
print maxEdges(NC_digraph_mini, 3)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

[(u'NzWLMPvbEval0OVg_YDn4g', 39), (u'ts7EG6Zv2zdMDg29nyqGfA', 36), (u'VhI6xyylcAxi0wOy2HOX3w', 30)]
0.199740 seconds run time


### Summary of run-times for NC_mini

**100 runs of influence function for 3 given starting nodes:**

No cap on t (serial):
2.077829 seconds run time >> i.e. ~0.02 seconds/run

t capped at 10 (serial):
0.094027 seconds run time >> i.e. ~0.0009 seconds /run

No cap on t (parallel):
2.400825 seconds run time

t capped at 10 (parallel):
0.183546 seconds run time

**100 runs of influence function for 1 random starting node:**

No cap on t (serial):
2.369556 seconds run time >> i.e. ~0.02 seconds/run

t capped at 10 (serial):
0.040825 seconds run time >> i.e. ~0.0004 seconds/run

no cap on t (parallel):
1.660861 seconds run time

t capped at 10 (parallel):
0.199847 seconds run time

**1000 runs of influence function for 1 random starting node:**

No cap on t (serial):
24.234820 seconds run time >> i.e. ~0.02 seconds/run

t capped at 10 (serial):
0.153250 seconds run time >> i.e. 0.0002 seconds/run

no cap on t (parallel):
9.927389 seconds run time

t capped at 10 (parallel):
0.208044 seconds run time

**1 run of Greedy algorithm for k=3**

No cap on t:
4.336949 seconds run time

t capped at 10:
4.325324 seconds run time

**Back of the envelope Greedy runtime calculation**:

[number of graph nodes] x [k] x [N] x [Influence fn runtime (serial)]
= 240 x 3 x 1 x 0.02 = 14.4 seconds = ~3-4x actual run-time

### Timing - NC_full

In [29]:
NC_digraph_full.number_of_nodes()

24224

100 runs of influence function - NC_full

In [14]:
# Influence function for given start nodes
start = ['NzWLMPvbEval0OVg_YDn4g','ts7EG6Zv2zdMDg29nyqGfA','VhI6xyylcAxi0wOy2HOX3w']
noRuns = 100

print "No cap on t (serial):"
startTime = time.time()
influenceFunctionNotParDetStart(NC_digraph_full, start, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (serial):"
startTime = time.time()
influenceFunctionNotParDetStart(NC_digraph_full, start, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nno cap on t (parallel):"
startTime = time.time()
influenceFunctionParDetStart(NC_digraph_full, start, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (parallel):"
startTime = time.time()
influenceFunctionParDetStart(NC_digraph_full, start, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t (serial):
13.001081 seconds run time

t capped at 10 (serial):
9.181028 seconds run time

no cap on t (parallel):
9.347237 seconds run time

t capped at 10 (parallel):
9.984308 seconds run time


In [15]:
# Influence function for random start nodes
noRuns = 100

print "No cap on t (serial):"
startTime = time.time()
influenceFunctionNotParRandStart(NC_digraph_full, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (serial):"
startTime = time.time()
influenceFunctionNotParRandStart(NC_digraph_full, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nno cap on t (parallel):"
startTime = time.time()
influenceFunctionParRandStart(NC_digraph_full, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (parallel):"
startTime = time.time()
influenceFunctionParRandStart(NC_digraph_full, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t (serial):
11.763882 seconds run time

t capped at 10 (serial):
2.570651 seconds run time

no cap on t (parallel):
6.617609 seconds run time

t capped at 10 (parallel):
5.377774 seconds run time


In [16]:
# Influence function for random start nodes
noRuns = 1000

print "No cap on t (serial):"
startTime = time.time()
influenceFunctionNotParRandStart(NC_digraph_full, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (serial):"
startTime = time.time()
influenceFunctionNotParRandStart(NC_digraph_full, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nno cap on t (parallel):"
startTime = time.time()
influenceFunctionParRandStart(NC_digraph_full, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10 (parallel):"
startTime = time.time()
influenceFunctionParRandStart(NC_digraph_full, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t (serial):
101.021717 seconds run time

t capped at 10 (serial):
25.390516 seconds run time

no cap on t (parallel):
31.609033 seconds run time

t capped at 10 (parallel):
15.761864 seconds run time


In [17]:
# Greedy algorithm
noRuns = 1

print "No cap on t:"
startTime = time.time()
greedySearch(NC_digraph_full, 3, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10:"
startTime = time.time()
greedySearch(NC_digraph_full, 3, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t:
1975.263020 seconds run time

t capped at 10:
2695.917483 seconds run time


In [15]:
# Look up nodes with max # of activations
noRuns = 1

print "No cap on t:"
startTime = time.time()
maxNodes(NC_digraph_full, 3, noRuns)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

print "\nt capped at 10:"
startTime = time.time()
maxNodes(NC_digraph_full, 3, noRuns, 10)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

No cap on t:
488.436412 seconds run time

t capped at 10:
555.493437 seconds run time


In [26]:
# Look up nodes with max # of edges
startTime = time.time()
print maxEdges(NC_digraph_full, 3)
runTime = time.time() - startTime
print '%f seconds run time' % (runTime)

[(u'CvMVd31cnTfzMUsHDXm4zQ', 319), (u'NzWLMPvbEval0OVg_YDn4g', 316), (u'4G68oLRY3aHE5XUt_MUUcA', 314)]
108.431385 seconds run time


### Summary of run-times for NC_full

**100 runs of influence function for 3 given starting nodes:**

No cap on t (serial):
13.001081 seconds run time >> i.e. ~0.1 seconds/run

t capped at 10 (serial):
9.181028 seconds run time >> i.e. ~0.09 seconds/run

no cap on t (parallel):
9.347237 seconds run time

t capped at 10 (parallel):
9.984308 seconds run time

**100 runs of influence function for 1 random starting node:**

No cap on t (serial):
11.763882 seconds run time >> i.e. ~0.1 seconds/run

t capped at 10 (serial):
2.570651 seconds run time >> i.e. ~0.03 seconds/run

no cap on t (parallel):
6.617609 seconds run time

t capped at 10 (parallel):
5.377774 seconds run time

**1000 runs of influence function for 1 random starting node:**

No cap on t (serial):
101.021717 seconds run time >> i.e. ~0.1 seconds/run

t capped at 10 (serial):
25.390516 seconds run time >> i.e. ~0.03 seconds/run

no cap on t (parallel):
31.609033 seconds run time

t capped at 10 (parallel):
15.761864 seconds run time

**1 run of Greedy algorithm for k=3 (4 workers)**

No cap on t:
1975.263020 seconds run time = 32.9 minutes

**Back of the envelope Greedy runtime calculation**:

[number of graph nodes] x [k] x [N] x [Influence fn runtime (serial)]
= 24224 x 3 x 1 x 0.1 = 7267 seconds = 121 minutes = ~3-4x actual run-time