# Influence function - standard error

In [1]:
# Import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import time
import numpy as np
from numpy import random
import math
import scipy
from scipy import stats
from random import choice
import networkx as nx
import json
from networkx.readwrite import json_graph
import line_profiler
import IPython
# Register line_profiler's `lprun` magic with the running IPython shell so
# that `%lprun` can be used in later cells.
ip = IPython.get_ipython()
ip.define_magic('lprun', line_profiler.magic_lprun)

In [2]:
# Spark setup: create the SparkContext `sc` used by the parallel simulation
# functions defined further down.
import findspark
import os
# NOTE(review): findspark.init() runs before `import pyspark`, presumably so
# that pyspark becomes importable -- keep this ordering.
findspark.init()
import pyspark
sc = pyspark.SparkContext()

In [4]:
# Load network graphs
def _load_node_link_graph(path):
    """Read a node-link JSON file and return ``(raw_data, graph)``.

    The file is closed as soon as the JSON has been parsed; the graph is
    built afterwards from the in-memory data.
    """
    with open(path, "r") as graph_file:
        raw_data = json.load(graph_file)
    return raw_data, json_graph.node_link_graph(raw_data)


# Same module-level names as before: the parsed JSON and the graph built from it.
graph_data_mini, NC_digraph_mini = _load_node_link_graph("graph/nc_mini.json")
graph_data_full, NC_digraph_full = _load_node_link_graph("graph/nc_full.json")

### Calculate variance of simulation results

In [12]:
def influenceFunction(graph, detStartNodes, t=999999):
    """Simulate one random cascade from the given seeds and return its size.

    The cascade proceeds in rounds. In each round, every node activated in
    the previous round gets one attempt at each of its successors that is
    not yet activated. An attempt along edge ``(u, v)`` succeeds when a
    Uniform(0, 1) draw is below ``sqrt(Beta(alpha, beta))``, where ``alpha``
    is the edge attribute ``'weight'`` and ``beta`` is the ``'review_count'``
    attribute of ``v``.

    Parameters
    ----------
    graph : directed graph
        Must expose ``graph.succ[u]``, ``graph[u][v]['weight']`` and
        ``graph.node[v]['review_count']``.
    detStartNodes : iterable of nodes
        Seed nodes, active from the start. An empty collection is allowed
        and yields a cascade of size 0.
    t : int, optional
        Maximum number of propagation rounds.

    Returns
    -------
    int
        Number of distinct activated nodes, seeds included.
    """
    frontier = list(detStartNodes)
    activated = set(frontier)

    # A counter is used instead of ``for i in range(t)`` so that the large
    # default ``t`` never materialises a million-element list on Python 2.
    rounds = 0
    while frontier and rounds < t:
        newly_activated = []
        for source in frontier:
            for target in graph.succ[source]:
                # A node only ever enters the frontier after it has been added
                # to ``activated``, so this single membership test is enough.
                if target in activated:
                    continue
                alpha = graph[source][target]['weight']
                beta = graph.node[target]['review_count']
                # Draw order (uniform first, then beta) is kept so that seeded
                # runs reproduce the same random stream as before.
                if random.uniform(0, 1) < np.sqrt(random.beta(alpha, beta)):
                    activated.add(target)
                    newly_activated.append(target)
        frontier = newly_activated
        rounds += 1
    return len(activated)

In [13]:
def influenceFunctionNotParDetStart(graph, detStartNodes, N, t=999999):
    """Mean cascade size over ``N`` sequential simulations from fixed seeds.

    Calls ``influenceFunction(graph, detStartNodes, t)`` ``N`` times in the
    current process and returns ``np.mean`` of the resulting sizes (each
    converted to ``float``).
    """
    spreads = [float(influenceFunction(graph, detStartNodes, t))
               for _ in xrange(N)]
    return np.mean(spreads)

In [None]:
def influenceFunctionParDetStart(graph, detStartNodes, N, t=999999):
    """Mean cascade size over ``N`` Spark-distributed simulations from fixed seeds.

    ``range(N)`` is parallelized through the module-level SparkContext ``sc``
    with 4 as the slice count; each element triggers one call to
    ``influenceFunction(graph, detStartNodes, t)``. The collected sizes are
    averaged with ``np.mean``.
    """
    def simulate(_):
        # The element value is ignored; it only marks one simulation run.
        return influenceFunction(graph, detStartNodes, t)

    spreads = sc.parallelize(range(N), 4).map(simulate).collect()
    return np.mean(spreads)

In [None]:
def influenceFunctionNotParRandStart(graph, N, t=999999):
    """Mean cascade size over ``N`` sequential simulations from random seeds.

    For every run a single start node is drawn with ``random.choice`` from
    ``graph.nodes()`` and one cascade is simulated from it with
    ``influenceFunction``. Returns ``np.mean`` of the sizes (as floats).
    """
    spreads = []
    for _ in xrange(N):
        startNode = random.choice(graph.nodes(), 1)[0]
        spread = influenceFunction(graph, [startNode], t)
        spreads.append(float(spread))
    return np.mean(spreads)

In [None]:
def influenceFunctionParRandStart(graph, N, t=999999):
    """Mean cascade size over ``N`` Spark-distributed simulations from random seeds.

    ``range(N)`` is parallelized through the module-level SparkContext ``sc``
    with 4 as the slice count. Each element triggers one run: a single start
    node is drawn with ``random.choice`` from ``graph.nodes()`` and a cascade
    is simulated from it with ``influenceFunction``. The collected sizes are
    averaged with ``np.mean``.
    """
    def simulate(_):
        # The element value is ignored; it only marks one simulation run.
        startNode = random.choice(graph.nodes(), 1)[0]
        return influenceFunction(graph, [startNode], t)

    spreads = sc.parallelize(range(N), 4).map(simulate).collect()
    return np.mean(spreads)

**Plot standard error as a function of N:**

In [16]:
# Graph used by the experiment cells below (the mini network loaded above).
graph = NC_digraph_mini

In [None]:
# Repeat the N-run parallel estimate 100 times (random start node, at most 10
# propagation rounds) and store the 100 estimates as JSON.
N = 100

results = []
for i in xrange(100):
    results.append(influenceFunctionParRandStart(graph, N, 10))
# The file name is keyed by N. (This previously used `k`, which is not defined
# in any live cell and raised NameError on a fresh kernel.)
# NOTE(review): the commented-out loader further down reads
# "data/SE-NC_test-N=<k>.json" -- confirm which file-name prefix is intended.
with open("data/SE-test-N=" + str(N) + ".json", "w") as fd:
    json.dump(results, fd)
del results

In [None]:
# N = 250

# results = []
# for i in xrange(100):
#     results.append(influenceFunctionParRandStart(graph, N, 10))
# fd = open("data/SE-test-N=" + str(k) + ".json","w")
# json.dump(results, fd)
# fd.close()
# del results

In [None]:
# N = 500

# results = []
# for i in xrange(100):
#     results.append(influenceFunctionParRandStart(graph, N, 10))
# fd = open("data/SE-test-N=" + str(k) + ".json","w")
# json.dump(results, fd)
# fd.close()
# del results

In [None]:
# N = 750

# results = []
# for i in xrange(100):
#     results.append(influenceFunctionParRandStart(graph, N, 10))
# fd = open("data/SE-test-N=" + str(k) + ".json","w")
# json.dump(results, fd)
# fd.close()
# del results

In [None]:
# N = 1000

# results = []
# for i in xrange(100):
#     results.append(influenceFunctionParRandStart(graph, N, 10))
# fd = open("data/SE-test-N=" + str(k) + ".json","w")
# json.dump(results, fd)
# fd.close()
# del results

In [3]:
# stdErr = dict()

# for k in [100,250,500,750,1000]:
#     with open("data/SE-NC_test-N=" + str(k) + ".json", "r") as fd:
#         results = json.load(fd)
#     stdErr[k] = np.std(results)
    
# plt.plot(stdErr.keys(), stdErr.values(), 'o-', color='r', linestyle='None', markersize=8)
# plt.xlim(xmin=0)
# plt.ylim(ymin=0)
# plt.show()

In [11]:
# y = [math.log(i) for i in stdErr.values()]
# x = [math.log(i) for i in stdErr.keys()]
# slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)

In [2]:
# print 'lambda = %f' % (-slope)
# print 'alpha = %f' % (math.exp(intercept))

Check results with a log-log plot:

In [1]:
# keys = [math.log(i) for i in stdErr.keys()]
# keys.sort()
# solution = [intercept + slope*n for n in keys]

# plt.plot(x, [math.log(i) for i in stdErr.values()], 'o-', color='r', linestyle='None', markersize=8)
# plt.plot(keys, solution)
# plt.show()