# Influence function - standard error

In [1]:
# Import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import time
import numpy as np
from numpy import random
import math
import scipy
from scipy import stats
from random import choice
import networkx as nx
import json
from networkx.readwrite import json_graph
import line_profiler
import IPython
# Grab the running IPython shell and register line_profiler's magic under
# the name 'lprun' so cells can be profiled line by line with %lprun.
# NOTE(review): get_ipython() presumably returns None outside an IPython
# session, in which case the next line would fail - confirm if this file is
# ever run as a plain script.
ip = IPython.get_ipython()
ip.define_magic('lprun', line_profiler.magic_lprun)

In [2]:
import findspark
import os
# findspark.init() is called before `import pyspark`; presumably it makes the
# local Spark installation importable - keep this ordering.
findspark.init()
import pyspark
# Spark context used further down to distribute the simulation runs.
sc = pyspark.SparkContext()

In [50]:
# Read the node-link JSON export, then rebuild the network graph from it
with open("graph/nc_full.json", "r") as graph_file:
    graph_data = json.load(graph_file)

NC_digraph = json_graph.node_link_graph(graph_data)

### Calculate variance of simulation results

In [51]:
def activateNodesOpt():
    """Simulate one activation cascade from a random seed; return its size.

    Resets the 'activated' and 'explored' flags on every node of the
    module-level ``NC_digraph``, draws one seed node with numpy's
    ``random.choice`` and spreads activation breadth-first along outgoing
    edges.  A not-yet-activated successor becomes activated when a
    Uniform(0, 1) draw is smaller than sqrt(Beta(alpha, beta)), where
    ``alpha`` is the 'weight' of the connecting edge and ``beta`` is the
    successor's 'review_count'.

    Returns:
        int: number of activated nodes, including the seed.

    Side effects:
        Overwrites the 'activated' and 'explored' attributes of the nodes
        of ``NC_digraph``.
    """
    # Local import so the block does not depend on the file's import cell.
    # deque gives O(1) pops from the front; the previous list slicing and
    # concatenation copied the whole queue on every step.
    from collections import deque

    # NOTE(review): the (graph, name, value) argument order matches the
    # networkx 1.x API that the `.node` accessor below also belongs to;
    # confirm the installed version before upgrading networkx.
    nx.set_node_attributes(NC_digraph, 'activated', False)
    nx.set_node_attributes(NC_digraph, 'explored', False)

    node_attrs = NC_digraph.node

    # random.choice(..., 1) returns a length-1 array; unwrap the seed.
    seed = random.choice(NC_digraph.nodes(), 1)[0]

    activated = 1
    node_attrs[seed]['activated'] = True

    frontier = deque([seed])

    while frontier:

        current = frontier.popleft()

        # A node is enqueued only at the moment it is first activated, so
        # it should never show up twice; the guard is kept as a safety net.
        if node_attrs[current]['explored']:
            continue

        node_attrs[current]['explored'] = True

        for succNode in NC_digraph.succ[current]:

            if node_attrs[succNode]['activated']:
                continue

            alpha = NC_digraph[current][succNode]['weight']
            beta = node_attrs[succNode]['review_count']

            # Draw order (uniform first, then beta) is kept unchanged so a
            # fixed RNG seed reproduces the same cascade as before.
            randUnif = random.uniform(0, 1)
            randBeta = np.sqrt(random.beta(alpha, beta))

            if randUnif < randBeta:
                node_attrs[succNode]['activated'] = True
                frontier.append(succNode)
                activated = activated + 1

    return activated

def activateNodesLoop(N):
    """Run ``activateNodesOpt`` N times and return the mean cascade size.

    Args:
        N: number of simulations to average over.

    Returns:
        The ``np.mean`` of the N activation counts, each cast to float.
    """
    cascade_sizes = [float(activateNodesOpt()) for _ in xrange(N)]

    return np.mean(cascade_sizes)

**Plot standard error as a function of N:**

In [54]:
# Values of N (simulations averaged per estimate) to evaluate
plot_range = [100, 250, 500, 750, 1000]

for k in plot_range:
    # 100 estimates of the mean cascade size per N, over 4 Spark partitions.
    # NOTE(review): every task draws from numpy's global RNG on its worker;
    # confirm the workers are not seeded identically.
    runs = sc.parallelize(range(100), 4)
    results = runs.map(lambda x: activateNodesLoop(k))
    results_list = results.collect()
    # The context manager closes the file even if json.dump raises.
    with open("data/SE-NC_full-N=" + str(k) + ".json", "w") as fd:
        json.dump(results_list, fd)
    del results

In [57]:
# Standard deviation of the stored estimates, keyed by N
stdErr = {}

for k in plot_range:
    # Load the list of estimates previously dumped for this N
    with open("data/SE-NC_full-N=" + str(k) + ".json", "r") as fd:
        results = json.load(fd)

    stdErr[k] = np.std(results)

# Markers only (linestyle='None'); both axes are anchored at zero
plt.plot(
    stdErr.keys(),
    stdErr.values(),
    'o-',
    color='r',
    linestyle='None',
    markersize=8,
)
plt.xlim(xmin=0)
plt.ylim(ymin=0)
plt.show()

In [46]:
# Least-squares fit of log(stdErr) against log(N)
x = list(map(math.log, stdErr.keys()))
y = list(map(math.log, stdErr.values()))
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

In [59]:
# The regression models log(stdErr) = intercept + slope * log(N), i.e.
# stdErr = alpha * N**(-lambda) with lambda = -slope, alpha = exp(intercept).
print 'lambda = %f' % (-slope)
print 'alpha = %f' % (math.exp(intercept))

Check the fit with a log-log plot:

In [58]:
# Fitted line evaluated at the sorted log(N) values
keys = sorted(map(math.log, stdErr.keys()))
solution = [intercept + slope*n for n in keys]

# Observed log(stdErr) as markers, fitted line drawn on top
log_std_err = [math.log(i) for i in stdErr.values()]
plt.plot(x, log_std_err, 'o-', color='r', linestyle='None', markersize=8)
plt.plot(keys, solution)
plt.xlim(xmin=4)
plt.ylim(ymin=-1)
plt.show()