# Full US graph

In [1]:
# Import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import time
import numpy as np
from numpy import random
import math
import scipy
from scipy import stats
from random import choice
import networkx as nx
import json
from networkx.readwrite import json_graph
import line_profiler
import IPython
# Register line_profiler's `lprun` magic with the running IPython shell so
# that %lprun is available in later cells.
ip = IPython.get_ipython()
ip.define_magic('lprun', line_profiler.magic_lprun)

In [2]:
import findspark
import os
# findspark.init() is called ahead of the (currently disabled) pyspark import.
findspark.init()
# NOTE(review): the SparkContext creation below is commented out, so this cell
# does not define `sc`. Functions in this notebook that call `sc.parallelize`
# need `sc` to be created somewhere before they are used -- confirm.
# import pyspark
# sc = pyspark.SparkContext()

In [5]:
# Load network graph
# Each JSON file is parsed while the file is open; the networkx graph is built
# afterwards from the parsed node-link data. The file handles get their own
# names so that graph_data_* always refers to the parsed data, never to a file.
with open("./graph/nc_mini.json", "r") as mini_file:
    graph_data_mini = json.load(mini_file)
NC_digraph_mini = json_graph.node_link_graph(graph_data_mini)

with open("./graph/nc_full.json", "r") as full_file:
    graph_data_full = json.load(full_file)
NC_digraph_full = json_graph.node_link_graph(graph_data_full)

# Alternative: load the US graph from a local file instead of downloading it.
# with open("./graph/US.json", "r") as graph_data_US:
#     graph_data_US = json.load(graph_data_US)
#     NC_digraph_US = json_graph.node_link_graph(graph_data_US)

In [None]:
import requests
from networkx.readwrite import json_graph

# Download the US graph as node-link JSON and build the networkx graph from it.
US_GRAPH_URL = 'https://usgraph.blob.core.windows.net/usgraph/usgraph'
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(US_GRAPH_URL, timeout=60)
# Fail with a clear HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()
NC_digraph_US = json_graph.node_link_graph(r.json())

### Functions

In [4]:
# Sanity check: number of nodes in the downloaded US graph.
NC_digraph_US.number_of_nodes()

350620

In [5]:
def influenceFunction(graph, detStartNodes, t=999999):
    """Simulate one random activation cascade and return its size.

    Starting from `detStartNodes`, the cascade proceeds in steps. In each step
    every node activated in the previous step gets one chance to activate each
    of its not-yet-activated successors. An attempt on edge (u, v) succeeds
    when a uniform(0, 1) draw is below sqrt(Beta(alpha, beta)), where alpha is
    graph[u][v]['weight'] and beta is graph.node[v]['review_count'].

    Parameters
    ----------
    graph : directed graph exposing `succ`, `graph[u][v]['weight']` and
        `graph.node[v]['review_count']`.
    detStartNodes : iterable of nodes that are active at the start. Duplicates
        are ignored; an empty iterable yields 0.0.
    t : maximum number of propagation steps.

    Returns
    -------
    float
        Number of activated nodes, start nodes included.
    """
    activated = set()
    frontier = []
    # De-duplicate the seeds (order preserved) so that a repeated seed does not
    # give its successors a second activation attempt.
    for node in detStartNodes:
        if node not in activated:
            activated.add(node)
            frontier.append(node)

    # A counter-based loop avoids materialising range(t) on Python 2.
    step = 0
    while frontier and step < t:
        next_frontier = []
        for source in frontier:
            for target in graph.succ[source]:
                if target in activated:
                    continue
                alpha = graph[source][target]['weight']
                beta = graph.node[target]['review_count']
                # The uniform draw is taken before the beta draw.
                if random.uniform(0, 1) < np.sqrt(random.beta(alpha, beta)):
                    activated.add(target)
                    next_frontier.append(target)
        frontier = next_frontier
        step += 1
    return float(len(activated))

In [6]:
def influenceFunctionNotParDetStart(graph, detStartNodes, N, t=999999):
    """Average cascade size over N serial simulations from fixed start nodes.

    Parameters
    ----------
    graph : graph passed through to `influenceFunction`.
    detStartNodes : start nodes passed through to `influenceFunction`.
    N : number of independent simulations to run.
    t : step cap passed through to `influenceFunction`.

    Returns
    -------
    numpy float
        Mean of the N simulated cascade sizes (numpy yields nan for N == 0).
    """
    # `range` rather than the Python-2-only `xrange`, so the function runs on
    # both Python 2 and Python 3. influenceFunction already returns a float.
    sizes = [influenceFunction(graph, detStartNodes, t) for _ in range(N)]
    return np.mean(sizes)

In [7]:
def influenceFunctionParDetStart(graph, detStartNodes, N, t=999999):
    """Average cascade size over N Spark-distributed simulations.

    Runs `influenceFunction(graph, detStartNodes, t)` once per element of
    range(N), distributed through the module-level SparkContext `sc` in 4
    slices, and returns the mean of the collected results.
    """
    def _simulate(_run_index):
        # The run index only drives the repetition; it is not used otherwise.
        return influenceFunction(graph, detStartNodes, t)

    run_ids = sc.parallelize(range(N), 4)
    sizes = run_ids.map(_simulate).collect()
    return np.mean(sizes)

In [8]:
def influenceFunctionNotParRandStart(graph, N, t=999999):
    """Average cascade size over N serial simulations from random start nodes.

    Each simulation starts from a single node drawn uniformly at random (with
    numpy's `random.choice`) from the nodes of `graph`.

    Parameters
    ----------
    graph : graph passed through to `influenceFunction`; must expose `nodes()`.
    N : number of independent simulations to run.
    t : step cap passed through to `influenceFunction`.

    Returns
    -------
    numpy float
        Mean of the N simulated cascade sizes (numpy yields nan for N == 0).
    """
    # Build the candidate array once. Listing the nodes and converting them to
    # an array inside the loop repeats the same O(#nodes) work for every run.
    candidates = np.asarray(list(graph.nodes()))
    sizes = []
    # `range` rather than the Python-2-only `xrange`.
    for _ in range(N):
        start_node = random.choice(candidates, 1)[0]
        sizes.append(influenceFunction(graph, [start_node], t))
    return np.mean(sizes)

In [9]:
def influenceFunctionParRandStart(graph, N, t=999999):
    """Average cascade size over N Spark-distributed random-start simulations.

    For each element of range(N), distributed through the module-level
    SparkContext `sc` in 4 slices, one node is drawn with numpy's
    `random.choice` from `graph.nodes()` and used as the only start node of an
    `influenceFunction` run. Returns the mean of the collected results.
    """
    def _simulate_from_random_node(_run_index):
        # The run index only drives the repetition; it is not used otherwise.
        seed_node = random.choice(graph.nodes(), 1)[0]
        return influenceFunction(graph, [seed_node], t)

    run_ids = sc.parallelize(range(N), 4)
    sizes = run_ids.map(_simulate_from_random_node).collect()
    return np.mean(sizes)

In [10]:
def greedySearch(graph, k=3, N=1000, t=999999):
    """Greedily pick k start nodes that maximise the estimated influence.

    In each of the k rounds, every candidate node is scored by the mean
    cascade size of `influenceFunctionNotParDetStart` for the already chosen
    nodes plus that candidate; the best-scoring candidate is added to the
    selection. Candidates are evaluated through the module-level SparkContext
    `sc`.

    Parameters
    ----------
    graph : graph exposing `edges()`; passed through to the influence function.
    k : number of start nodes to select.
    N : number of simulations per candidate evaluation.
    t : step cap forwarded to every simulation.

    Returns
    -------
    (list, float)
        The selected nodes in pick order and the score of the last round.
    """
    best_s = []
    max_inf = 0
    # Candidates are all nodes that appear as an endpoint of some edge. The set
    # comprehension flattens the edge list in linear time; summing the edge
    # tuples would concatenate a growing tuple once per edge (quadratic).
    candidates = list({node for edge in graph.edges() for node in edge})
    nodeRDD = sc.parallelize(candidates, 4)

    for _ in range(k):
        # Already chosen nodes score 0 so they are not picked again while any
        # other candidate has a positive score. `t` is forwarded so the step
        # cap requested by the caller is actually applied.
        infRDD = nodeRDD.map(
            lambda n: (n, 0.) if n in best_s
            else (n, influenceFunctionNotParDetStart(graph, best_s + [n], N, t)))
        next_s, next_i = infRDD.reduce(lambda a, b: a if a[1] > b[1] else b)
        best_s.append(next_s)
        max_inf = next_i

    return best_s, max_inf

In [11]:
def maxEdges(graph, k=3):
    """Return the k nodes with the most successors.

    Only nodes that appear as an endpoint of at least one edge are considered.
    The counting runs through the module-level SparkContext `sc`.

    Returns
    -------
    list of (node, successor_count)
        The k entries with the largest successor count, largest first.
    """
    # Flatten the edge list in linear time; summing the edge tuples would
    # concatenate a growing tuple once per edge (quadratic).
    candidates = list({node for edge in graph.edges() for node in edge})
    nodeRDD = sc.parallelize(candidates, 4)
    infRDD = nodeRDD.map(lambda n: (n, len(graph.succ[n])))
    # Negated key: takeOrdered returns the smallest keys first.
    return infRDD.takeOrdered(k, lambda w: -w[1])

In [12]:
def maxNodes(graph, k=3, N=1000, t=999999):
    """Return the k single start nodes with the largest estimated influence.

    Every node that appears as an endpoint of at least one edge is scored by
    `influenceFunctionNotParDetStart(graph, [node], N, t)`; the scoring runs
    through the module-level SparkContext `sc`.

    Parameters
    ----------
    graph : graph exposing `edges()`; passed through to the influence function.
    k : number of nodes to return.
    N : number of simulations per node.
    t : step cap forwarded to every simulation.

    Returns
    -------
    list of (node, mean_cascade_size)
        The k entries with the largest score, largest first.
    """
    # Flatten the edge list in linear time; summing the edge tuples would
    # concatenate a growing tuple once per edge (quadratic).
    candidates = list({node for edge in graph.edges() for node in edge})
    nodeRDD = sc.parallelize(candidates, 4)
    # `t` is forwarded so the step cap requested by the caller is applied.
    infRDD = nodeRDD.map(
        lambda n: (n, influenceFunctionNotParDetStart(graph, [n], N, t)))
    # Negated key: takeOrdered returns the smallest keys first.
    return infRDD.takeOrdered(k, lambda w: -w[1])

### Timing

In [13]:
# Influence function for given start nodes
start = ['Iu3Jo9ROp2IWC9FwtWOaUQ', 'OaFcpi3W4AwxrD8W2pgC_A', 'glRXVWWD6x1EZKfjJawTOg']
noRuns = 1

# Same measurement twice: first with the default t, then with t capped at 10.
# print(...) with a single argument produces the same output on Python 2 and 3.
for label, extraArgs in (("No cap on t (serial):", ()),
                         ("\nt capped at 10 (serial):", (10,))):
    print(label)
    startTime = time.time()
    print(influenceFunctionNotParDetStart(NC_digraph_US, start, noRuns, *extraArgs))
    runTime = time.time() - startTime
    print('%f seconds run time' % (runTime))

No cap on t (serial):
184135.0
4.542444 seconds run time

t capped at 10 (serial):
184073.0
4.087694 seconds run time


In [14]:
# Influence function for random start nodes
noRuns = 1

# Same measurement twice: first with the default t, then with t capped at 10.
# print(...) with a single argument produces the same output on Python 2 and 3.
for label, extraArgs in (("No cap on t (serial):", ()),
                         ("\nt capped at 10 (serial):", (10,))):
    print(label)
    startTime = time.time()
    print(influenceFunctionNotParRandStart(NC_digraph_US, noRuns, *extraArgs))
    runTime = time.time() - startTime
    print('%f seconds run time' % (runTime))

No cap on t (serial):
1.0
0.192753 seconds run time

t capped at 10 (serial):
1.0
0.122009 seconds run time


In [15]:
# # Greedy algorithm
# noRuns = 1

# print "No cap on t:"
# startTime = time.time()
# print greedySearch(NC_digraph_US, 3, noRuns)
# runTime = time.time() - startTime
# print '%f seconds run time' % (runTime)

# print "\nt capped at 10:"
# startTime = time.time()
# print greedySearch(NC_digraph_US, 3, noRuns, 10)
# runTime = time.time() - startTime
# print '%f seconds run time' % (runTime)

In [None]:
# # Look up nodes with max # of edges
# startTime = time.time()
# print maxEdges(NC_digraph_US, 3)
# runTime = time.time() - startTime
# print '%f seconds run time' % (runTime)

In [None]:
# # Look up nodes with max # of activations
# noRuns = 1

# print "No cap on t:"
# startTime = time.time()
# print maxNodes(NC_digraph_US, 3, noRuns)
# runTime = time.time() - startTime
# print '%f seconds run time' % (runTime)

# print "\nt capped at 10:"
# startTime = time.time()
# print maxNodes(NC_digraph_US, 3, noRuns, 10)
# runTime = time.time() - startTime
# print '%f seconds run time' % (runTime)