In [1]:
# Make the local Spark installation importable; this runs before the cell that
# imports pyspark.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
# Run Spark in local mode; the application is registered under the name 'master'.
conf = SparkConf().setMaster('local').setAppName('master')
# Driver-side context used by the later cells (textFile, broadcast, accumulator).
sc = SparkContext(conf = conf)

In [8]:
!cat ./PageRank-test.txt

B	{'C': 1}
C	{'B': 1}
D	{'A': 1, 'B': 1}
E	{'D': 1, 'B': 1, 'F': 1}
F	{'B': 1, 'E': 1}
G	{'B': 1, 'E': 1}
H	{'B': 1, 'E': 1}
I	{'B': 1, 'E': 1}
J	{'E': 1}
K	{'E': 1}


In [None]:
# RDD of raw text lines; each line is "<node id><TAB><dict literal of edge targets>".
raw_data = sc.textFile("./PageRank-test.txt")

#process data

def processLine(line):
    """Parse one adjacency-list line into ``[node_id, edges]``.

    Args:
        line: A text line of the form ``<node_id><TAB><dict literal>``,
            for example ``D<TAB>{'A': 1, 'B': 1}``. Leading and trailing
            whitespace is ignored.

    Returns:
        A two-element list: the node id string and the parsed edge dict.

    Raises:
        IndexError: If the line contains no tab-separated second field.
        ValueError, SyntaxError: If the second field is not a plain Python
            literal.
    """
    # Local import keeps this cell self-contained; the notebook's import cells
    # do not bring in ``ast``.
    import ast

    splits = line.strip().split('\t')
    # literal_eval only accepts Python literals, so a crafted line in the input
    # file cannot execute arbitrary code the way eval() would.
    return [splits[0], ast.literal_eval(splits[1])]
# Parsed [node_id, edges] records; cached because more than one later job reads it.
data = raw_data.map(processLine).cache()

In [86]:
#count nodes
def mapCount(entry):
    """Yield a ``(node, 1)`` pair for every node mentioned in one record.

    Args:
        entry: A ``[node_id, edges]`` record, where ``edges`` is a dict keyed
            by edge target.

    Yields:
        ``(str(node_id), 1)`` for the record's own node first, then
        ``(target, 1)`` for each key of ``edges`` (targets are passed through
        without conversion).
    """
    own_id = str(entry[0])
    yield (own_id, 1)
    for neighbour in entry[1].keys():
        yield (neighbour, 1)
        
# Number of distinct node ids: every mention becomes a (node, 1) pair, the
# reduce collapses pairs sharing a key into one record, and count() tallies
# the records that remain.
nodecount = (
    data.flatMap(mapCount)
        .reduceByKey(lambda kept, duplicate: 1)
        .count()
)
# Publish the count as a broadcast variable; init_entry reads it via `.value`.
broadcast_nodecount = sc.broadcast(nodecount)

In [87]:
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("There are " + str(broadcast_nodecount.value) + " listed nodes.")

There are 11 listed nodes.


In [103]:
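#prep data
#nodes that only appear as edge targets (no outgoing edges) have no line of their own in the input,
#so give every edge target an empty edge dict and merge per node to get one record for each node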

def prep_map(entry):
    """Expand one record so that every node it mentions gets a record.

    Args:
        entry: A ``[node_id, edges]`` record, where ``edges`` is a dict keyed
            by edge-target name (strings).

    Yields:
        ``(target, {})`` for each edge target, with the name stripped of
        surrounding whitespace and converted with ``str``, followed by
        ``(str(node_id), edges)`` for the record's own node.
    """
    source_id, adjacency = entry[0], entry[1]
    for target in adjacency.keys():
        # Placeholder so targets without a line of their own still appear.
        yield (str(target.strip()), {})
    yield (str(source_id), adjacency)
    
def prep_reduce(x, y):
    """Merge two edge dicts for the same node into a new dict.

    Args:
        x: First edge dict.
        y: Second edge dict; its values win when a key is in both.

    Returns:
        A new dict holding every key of ``x`` and ``y``. Neither input is
        modified.
    """
    merged = dict(x.items())
    merged.update(y.items())
    return merged

# Accumulator starting at 0.0; init_entry adds each node's starting rank to it,
# so its value can be checked once the job below has run.
total_pr = sc.accumulator(0.0)

def init_entry(entry, starting_node):
    """Attach an initial rank and a state flag to one ``(node_id, edges)`` record.

    Every node starts with rank ``1 / nodecount`` (read from the
    ``broadcast_nodecount`` broadcast variable). The same amount is added to
    the ``total_pr`` accumulator as a side effect.

    NOTE(review): this function is called inside a ``map`` transformation.
    Spark only guarantees exactly-once accumulator updates inside actions, so
    a re-executed task would add its share to ``total_pr`` again.

    Args:
        entry: A ``(node_id, edges)`` pair.
        starting_node: Id of the node that receives state ``'Q'``; every other
            node receives ``'U'`` (presumably "queued" and "unvisited" --
            TODO confirm against the traversal code).

    Returns:
        ``(node_id, [edges, initial_rank, state])``.
    """
    node_id = entry[0]
    state = 'Q' if node_id == starting_node else 'U'
    initial_rank = 1.0 / broadcast_nodecount.value
    total_pr.add(initial_rank)
    return (node_id, [entry[1], initial_rank, state])

# One record per node: expand edge targets, merge records sharing a node id,
# then attach the initial rank and state with 'B' as the starting node.
# collect() is the action that runs the job, so total_pr is only populated
# after this line.
prepped_data = data.flatMap(prep_map).reduceByKey(prep_reduce).map(lambda x: init_entry(x, 'B')).collect()
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(total_pr)
print(prepped_data)


1.0
[('A', [{}, 0.09090909090909091, 'U']), ('C', [{'B': 1}, 0.09090909090909091, 'U']), ('B', [{'C': 1}, 0.09090909090909091, 'Q']), ('E', [{'B': 1, 'D': 1, 'F': 1}, 0.09090909090909091, 'U']), ('D', [{'A': 1, 'B': 1}, 0.09090909090909091, 'U']), ('G', [{'B': 1, 'E': 1}, 0.09090909090909091, 'U']), ('F', [{'B': 1, 'E': 1}, 0.09090909090909091, 'U']), ('I', [{'B': 1, 'E': 1}, 0.09090909090909091, 'U']), ('H', [{'B': 1, 'E': 1}, 0.09090909090909091, 'U']), ('K', [{'E': 1}, 0.09090909090909091, 'U']), ('J', [{'E': 1}, 0.09090909090909091, 'U'])]


In [84]:
# Display the accumulator object itself (its repr shows the id and current value).
total_pr

Accumulator<id=8, value=0.0>

In [85]:
# NOTE(review): `initial_state` is not defined in any visible cell, and this
# cell's execution count (85) precedes the prep cell above (103). Unless the
# name is defined elsewhere, this cell raises NameError on Restart & Run All.
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(initial_state.collect())
print(total_pr.value)

[[u'B', {'C': 1}, 0.1, 'Q'], [u'C', {'B': 1}, 0.1, 'U'], [u'D', {'A': 1, 'B': 1}, 0.1, 'U'], [u'E', {'B': 1, 'D': 1, 'F': 1}, 0.1, 'U'], [u'F', {'B': 1, 'E': 1}, 0.1, 'U'], [u'G', {'B': 1, 'E': 1}, 0.1, 'U'], [u'H', {'B': 1, 'E': 1}, 0.1, 'U'], [u'I', {'B': 1, 'E': 1}, 0.1, 'U'], [u'J', {'E': 1}, 0.1, 'U'], [u'K', {'E': 1}, 0.1, 'U']]
1.0


In [None]:
#high-level pagerank

def pagerank(data, starting_node):
    
    
    data = mark_as_unvisited(data, starting_node)
    while (unvisited_states > 0): #assumes all states are reachable from starting point
        data = push_frontier(data)