In [106]:
import findspark
findspark.init()

In [107]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster('local').setAppName('master')
sc = SparkContext(conf = conf)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=master, master=local) created by __init__ at <ipython-input-2-f493162bd33f>:3 

In [108]:
!cat ./PageRank-test.txt

B	{'C': 1}
C	{'B': 1}
D	{'A': 1, 'B': 1}
E	{'D': 1, 'B': 1, 'F': 1}
F	{'B': 1, 'E': 1}
G	{'B': 1, 'E': 1}
H	{'B': 1, 'E': 1}
I	{'B': 1, 'E': 1}
J	{'E': 1}
K	{'E': 1}


In [109]:
raw_data = sc.textFile("./PageRank-test.txt")

#process data

def processLine(line):
    splits = line.strip().split('\t')
    return [splits[0], eval(splits[1])]
data = raw_data.map(processLine).cache()

In [110]:
#count nodes
def mapCount(entry):
    yield (str(entry[0]), 1)
    for key, value in entry[1].items():
        yield (key, 1)
        
nodecount = data.flatMap(mapCount).reduceByKey(lambda x, y: 1).count()
broadcast_nodecount = sc.broadcast(nodecount)

In [111]:
print "There are " + str(broadcast_nodecount.value) + " listed nodes."

There are 11 listed nodes.


In [120]:
#prep data
#nodes that don't have edges need to be better represented

def prep_map(entry):
    nodeID = entry[0]
    edges = entry[1]
    for edge, weight in edges.items():
        yield (str(edge.strip()), {})
    yield (str(nodeID), edges)
    
def prep_reduce(x, y):
    edges = {}
    for key, value in x.items():
        edges[key] = value
    for key, value in y.items():
        edges[key] = value
    return edges

total_pr = sc.accumulator(0.0)

def init_entry(entry):
    total_pr.add(1.0 / broadcast_nodecount.value)
    return (entry[0], [entry[1], 1.0 / broadcast_nodecount.value])

prepped_data = data.flatMap(prep_map).reduceByKey(prep_reduce).map(init_entry).cache()

print prepped_data.collect()

print total_pr.value


[('A', [{}, 0.09090909090909091]), ('C', [{'B': 1}, 0.09090909090909091]), ('B', [{'C': 1}, 0.09090909090909091]), ('E', [{'B': 1, 'D': 1, 'F': 1}, 0.09090909090909091]), ('D', [{'A': 1, 'B': 1}, 0.09090909090909091]), ('G', [{'B': 1, 'E': 1}, 0.09090909090909091]), ('F', [{'B': 1, 'E': 1}, 0.09090909090909091]), ('I', [{'B': 1, 'E': 1}, 0.09090909090909091]), ('H', [{'B': 1, 'E': 1}, 0.09090909090909091]), ('K', [{'E': 1}, 0.09090909090909091]), ('J', [{'E': 1}, 0.09090909090909091])]
1.0


In [116]:
accum_dangling_mass = sc.accumulator(0.0)
accum_mass_moved = sc.accumulator(0)

broadcast_damping_factor = sc.broadcast(.15)
broadcast_dangling_mass = sc.broadcast(0.0)

In [183]:
def phaseOneMapper(entry):
    label = entry[0]
    edges = entry[1][0]
    pr = entry[1][1]

    if len(edges) == 0:
        accum_dangling_mass.add(pr)
    else:
        forwarding_pr = pr / len(edges)
        for edge, weight in edges.items():
            yield (edge, [{}, forwarding_pr])
    yield (label, [edges, 0.0])

def phaseOneReducer(x, y):
    edges = {}
    for edge, weight in x[0].items():
        edges[edge] = weight
    for edge, weight in y[0].items():
        edges[edge] = weight
    return [edges, x[1] + y[1]]

def deserialize(entry):
    label = entry[1:-1].split(',',1)[0][1:-1]
    pair = eval(entry[1:-1].split(',',1)[1])
    edges = pair[0]
    pr = pair[1]
    return (label, [edges, pr])


def finalize(entry, dangling_mass):
    label = str(entry[0])
    edges = entry[1][0]
    pr = entry[1][1]

    pr_prime = broadcast_damping_factor.value * (1.0 / broadcast_nodecount.value) + \
    (1 - broadcast_damping_factor.value) * (dangling_mass / broadcast_nodecount.value + pr)

    return (label, [edges, pr_prime])

    

In [178]:
testString = "('A', [{}, .045])"
print testString
print testString[1:-1]
print testString[1:-1].split(',',1)
print testString[1:-1].split(',',1)[0]
print testString[1:-1].split(',',1)[0][1:-1]
print eval(testString[1:-1].split(',',1)[1])

('A', [{}, .045])
'A', [{}, .045]
["'A'", ' [{}, .045]']
'A'
A
[{}, 0.045]


In [139]:
#initial testing
accum_dangling_mass = sc.accumulator(0.0)

broadcast_damping_factor = sc.broadcast(.15)

print prepped_data.flatMap(phaseOneMapper).collect()

accum_dangling_mass = sc.accumulator(0.0)


print prepped_data.flatMap(phaseOneMapper).map(lambda x: x[1][1]).reduce(lambda x, y: x + y) + \
    accum_dangling_mass.value


#                 .reduceByKey(phaseOneReducer)\
#                 .map(lambda x: finalize(x, accum_dangling_mass.value))\
#                 .collect()


[('A', [{}, 0.0]), ('B', [{}, 0.09090909090909091]), ('C', [{'B': 1}, 0.0]), ('C', [{}, 0.09090909090909091]), ('B', [{'C': 1}, 0.0]), ('B', [{}, 0.030303030303030304]), ('D', [{}, 0.030303030303030304]), ('F', [{}, 0.030303030303030304]), ('E', [{'B': 1, 'D': 1, 'F': 1}, 0.0]), ('A', [{}, 0.045454545454545456]), ('B', [{}, 0.045454545454545456]), ('D', [{'A': 1, 'B': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('G', [{'B': 1, 'E': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('F', [{'B': 1, 'E': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('I', [{'B': 1, 'E': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('H', [{'B': 1, 'E': 1}, 0.0]), ('E', [{}, 0.09090909090909091]), ('K', [{'E': 1}, 0.0]), ('E', [{}, 0.09090909090909091]), ('J', [{'E': 1}, 0.0])]
1.0


In [144]:
#phase two testing


broadcast_damping_factor = sc.broadcast(.15)

accum_dangling_mass = sc.accumulator(0.0)

print prepped_data.flatMap(phaseOneMapper).collect()

accum_dangling_mass = sc.accumulator(0.0)


prepped_data.flatMap(phaseOneMapper).reduceByKey(phaseOneReducer).collect()
#                 .map(lambda x: finalize(x, accum_dangling_mass.value))\
#                 .collect()

accum_dangling_mass = sc.accumulator(0.0)
prepped_data.flatMap(phaseOneMapper)\
    .reduceByKey(phaseOneReducer)\
    .map(lambda x: x[1][1])\
    .reduce(lambda x, y: x + y)\
    + accum_dangling_mass.value


[('A', [{}, 0.0]), ('B', [{}, 0.09090909090909091]), ('C', [{'B': 1}, 0.0]), ('C', [{}, 0.09090909090909091]), ('B', [{'C': 1}, 0.0]), ('B', [{}, 0.030303030303030304]), ('D', [{}, 0.030303030303030304]), ('F', [{}, 0.030303030303030304]), ('E', [{'B': 1, 'D': 1, 'F': 1}, 0.0]), ('A', [{}, 0.045454545454545456]), ('B', [{}, 0.045454545454545456]), ('D', [{'A': 1, 'B': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('G', [{'B': 1, 'E': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('F', [{'B': 1, 'E': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('I', [{'B': 1, 'E': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('H', [{'B': 1, 'E': 1}, 0.0]), ('E', [{}, 0.09090909090909091]), ('K', [{'E': 1}, 0.0]), ('E', [{}, 0.09090909090909091]), ('J', [{'E': 1}, 0.0])]


1.0

In [185]:
#full pipeline testing


broadcast_damping_factor = sc.broadcast(.15)

accum_dangling_mass = sc.accumulator(0.0)

print prepped_data.flatMap(phaseOneMapper).collect()

accum_dangling_mass = sc.accumulator(0.0)

! rm -rf ./intermediate/
prepped_data.flatMap(phaseOneMapper).reduceByKey(phaseOneReducer).saveAsTextFile("./intermediate")
!cat intermediate/part-00000

danging_mass = accum_dangling_mass.value

intermediate_data = sc.textFile("./intermediate/part-00000").map(deserialize)\
                .map(lambda x: finalize(x, danging_mass))
    
print intermediate_data.collect()
    
total_pr =  intermediate_data.map(lambda x: x[1][1]).reduce(lambda x, y: x + y)
        


print total_pr

# accum_dangling_mass = sc.accumulator(0.0)
# prepped_data.flatMap(phaseOneMapper)\
#     .reduceByKey(phaseOneReducer)\
#     .map(lambda x: x[1][1])\
#     .reduce(lambda x, y: x + y)\
#     + accum_dangling_mass.value


[('A', [{}, 0.0]), ('B', [{}, 0.09090909090909091]), ('C', [{'B': 1}, 0.0]), ('C', [{}, 0.09090909090909091]), ('B', [{'C': 1}, 0.0]), ('B', [{}, 0.030303030303030304]), ('D', [{}, 0.030303030303030304]), ('F', [{}, 0.030303030303030304]), ('E', [{'B': 1, 'D': 1, 'F': 1}, 0.0]), ('A', [{}, 0.045454545454545456]), ('B', [{}, 0.045454545454545456]), ('D', [{'A': 1, 'B': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('G', [{'B': 1, 'E': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('F', [{'B': 1, 'E': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('I', [{'B': 1, 'E': 1}, 0.0]), ('B', [{}, 0.045454545454545456]), ('E', [{}, 0.045454545454545456]), ('H', [{'B': 1, 'E': 1}, 0.0]), ('E', [{}, 0.09090909090909091]), ('K', [{'E': 1}, 0.0]), ('E', [{}, 0.09090909090909091]), ('J', [{'E': 1}, 0.0])]
('A', [{}, 0.045454545454545456])
('C', [{'B': 1}, 0.09090909090909091])
('B', [{'C': 1}, 0.34