In [1]:
%load_ext autoreload
%autoreload 2

PageRank steps:

1. Count nodes
2. Initialize nodes with 1/N
3. Read sink mass from file/S3 and distribute it
4. Forward mass
    4b. Record sink mass in file/S3
5. Check convergence

In [63]:
%%writefile MRCountNodes.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRCountNodes(MRJob):
    """Count the distinct node ids appearing in an adjacency-list graph file.

    Each input line is expected to be ``<node><TAB><dict literal>`` where the
    dict maps neighbor ids to edge weights.  The job runs two steps:

    1. ``count1`` / ``sum1``: emit every id seen (as a source or as a
       neighbor) and collapse duplicates to a single ``(None, 1)`` record
       per distinct id.
    2. ``sum2``: add those records up.

    The final output is a single ``(None, <number of distinct ids>)`` pair.
    """

    def count1(self, _, line):
        """Emit ``(node_id, 1)`` for the line's source node and each neighbor.

        Blank lines are skipped so a trailing newline in the input does not
        abort the job.
        """
        # Imported locally so this block does not depend on the cell's
        # import lines being edited.
        from ast import literal_eval

        if not line.strip():
            return
        splits = line.split('\t')
        node = splits[0]
        # literal_eval only accepts Python literals, unlike eval(), which
        # would execute arbitrary expressions found in the input file.
        edges = literal_eval(splits[1])
        # Only the neighbor ids matter here; the weights are not used.
        for edge in edges:
            yield edge, 1
        yield node, 1

    def sum1(self, key, values):
        """Emit one record per distinct node id, ignoring how often it was seen."""
        yield None, 1

    def sum2(self, key, values):
        """Sum the per-node records into the total node count."""
        yield None, sum(values)

    def steps(self):
        """Two steps: de-duplicate node ids, then count them."""
        return [
            MRStep(mapper=self.count1,
                   reducer=self.sum1),
            MRStep(reducer=self.sum2)
        ]


if __name__ == '__main__':
    MRCountNodes.run()

Overwriting MRCountNodes.py


In [71]:
%%writefile MRPrepData.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRPrepData(MRJob):
    """Normalize an adjacency-list graph and attach an initial mass to each node.

    Input lines are expected to be ``<node><TAB><dict literal>``.  Every node
    id that appears, either as a source or only as a neighbor, ends up with
    exactly one output record ``(node, [str(edges), initmass])`` where
    ``edges`` is the merged neighbor dict (empty for ids that never appear
    as a source) and ``initmass`` is the value of ``--initmass``.
    """

    def configure_options(self):
        """Register ``--initmass``, the mass assigned to every node."""
        super(MRPrepData, self).configure_options()
        self.add_passthrough_option('--initmass', default=None, type=float)

    def mapper(self, _, line):
        """Emit the node's own adjacency string plus an empty one per neighbor.

        The empty ``'{}'`` records guarantee that ids which only occur as
        neighbors still reach the reducer.  Blank lines are skipped.
        """
        # Imported locally so this block does not depend on the cell's
        # import lines being edited.
        from ast import literal_eval

        if not line.strip():
            return
        splits = line.split('\t')
        # literal_eval only accepts Python literals, unlike eval(), which
        # would execute arbitrary expressions found in the input file.
        edges = literal_eval(splits[1])
        for edge in edges:
            yield edge.strip(), '{}'
        yield splits[0].strip(), splits[1]

    def reducer(self, key, values):
        """Merge all adjacency dicts seen for ``key`` and attach the initial mass."""
        from ast import literal_eval

        edges = {}
        for value in values:
            edges.update(literal_eval(value))
        yield key, [str(edges), self.options.initmass]

    def steps(self):
        """Single map/reduce step."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer)
        ]


if __name__ == '__main__':
    MRPrepData.run()

Overwriting MRPrepData.py


PageRank steps:

1. Count nodes
2. Initialize nodes with 1/N
3. Read sink mass from file/S3 and distribute it
4. Forward mass
    4b. Record sink mass in file/S3
5. Check convergence

In [97]:
%%writefile MRPageRankPhaseOne.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import sys, os

class MRPageRankPhaseOne(MRJob):
    """Phase one of a PageRank iteration: forward each node's mass along its edges.

    Input lines are ``<node> <list literal>`` where the list is
    ``[<edge-dict as a string>, <mass>]``.  Nodes with outgoing edges split
    their mass equally among their neighbors (edge weights are not used).
    The mass of nodes without outgoing edges is collected under the special
    key ``'dangling'`` and appended, one line per reducer call, to
    ``DANGLING_MASS_FILE`` instead of being emitted.

    Output records are ``(node, [str(edges), received_mass])``.
    """

    # Side-channel file read back by the driver after the job finishes.
    # NOTE(review): a local absolute path only works when the reducer runs on
    # the same machine as the driver (e.g. the inline/local runners).
    DANGLING_MASS_FILE = '/home/ubuntu/repos/261/week8/HW8/danglingmass.txt'

    def mapper(self, _, line):
        """Emit forwarded mass per neighbor and the node's own structure record."""
        # Imported locally so this block does not depend on the cell's
        # import lines being edited.
        from ast import literal_eval

        splits = line.split(' ', 1)
        key = splits[0]
        # literal_eval only accepts Python literals, unlike eval(), which
        # would execute arbitrary expressions found in the input file.
        # The record is a list: [edge-dict string, mass].
        value = literal_eval(splits[1])
        edges = literal_eval(value[0])
        pr = float(value[1])
        if len(edges) == 0:
            # No outgoing edges: route the mass to the shared 'dangling' key.
            yield 'dangling', [[], pr]
        else:
            forwarding_pr = pr / len(edges)
            for edge in edges:
                yield edge, ['{}', forwarding_pr]
        # Always re-emit the graph structure, carrying no mass of its own.
        yield key, [str(edges), 0.0]

    def reducer(self, key, values):
        """Sum incoming mass per node, or record the dangling mass to file."""
        from ast import literal_eval

        if key == 'dangling':
            danglingmass = sum((float(value[1]) for value in values), 0.0)
            # repr() keeps full float precision; on Python 2 str() rounds to
            # 12 significant digits, which would silently lose mass.
            with open(self.DANGLING_MASS_FILE, 'a') as massfile:
                massfile.write(repr(danglingmass) + '\n')
        else:
            pr = 0.0
            edges = {}
            for value in values:
                pr += float(value[1])
                edges.update(literal_eval(value[0]))
            yield key, [str(edges), pr]


if __name__ == '__main__':
    MRPageRankPhaseOne.run()

Overwriting MRPageRankPhaseOne.py


In [86]:
%%writefile MRPageRankPhaseTwo.py
from mrjob.job import MRJob
from mrjob.step import MRStep


class MRPageRankPhaseTwo(MRJob):
    """Phase two of a PageRank iteration: apply the random jump and dangling mass.

    Input lines are ``<node> <list literal>`` where the list is
    ``[<edge-dict as a string>, <mass>]``.  For each node the mapper computes

        pr' = d * (1/N) + (1 - d) * (m/N + pr)

    with ``d = --dampingfactor``, ``N = --totalnodes`` and
    ``m = --danglingmass``, and emits ``(node, [str(edges), pr'])``.
    """

    def configure_options(self):
        """Register the three required pass-through options (given as strings)."""
        super(MRPageRankPhaseTwo, self).configure_options()
        self.add_passthrough_option('--danglingmass', default=None, type=str)
        self.add_passthrough_option('--totalnodes', default=None, type=str)
        self.add_passthrough_option('--dampingfactor', default=None, type=str)

    def mapper_init(self):
        """Parse and validate the options once per task instead of per record.

        Raises:
            ValueError: if an option is missing or ``--totalnodes`` is not
                positive.
        """
        opts = self.options
        if (opts.totalnodes is None or opts.dampingfactor is None
                or opts.danglingmass is None):
            raise ValueError('--totalnodes, --dampingfactor and '
                             '--danglingmass are all required')
        self._pr_nodes = int(opts.totalnodes)
        if self._pr_nodes <= 0:
            raise ValueError('--totalnodes must be a positive integer')
        self._pr_dampingfactor = float(opts.dampingfactor)
        self._pr_danglingmass = float(opts.danglingmass)

    def mapper(self, _, line):
        """Emit the node with its adjusted mass; the edge dict passes through."""
        # Imported locally so this block does not depend on the cell's
        # import lines being edited.
        from ast import literal_eval

        splits = line.split(' ', 1)
        key = splits[0]
        # literal_eval only accepts Python literals, unlike eval(), which
        # would execute arbitrary expressions found in the input file.
        # The record is a list: [edge-dict string, mass].
        value = literal_eval(splits[1])
        edges = literal_eval(value[0])
        pr = float(value[1])
        nodes = self._pr_nodes
        dampingfactor = self._pr_dampingfactor
        danglingmass = self._pr_danglingmass
        pr_prime = dampingfactor * (1.0/nodes) + (1 - dampingfactor) * (danglingmass/nodes + pr)
        yield key, [str(edges), pr_prime]

if __name__ == '__main__':
    MRPageRankPhaseTwo.run()

Overwriting MRPageRankPhaseTwo.py


In [99]:
from MRCountNodes import MRCountNodes
from MRPrepData import MRPrepData
from MRPageRankPhaseOne import MRPageRankPhaseOne
from MRPageRankPhaseTwo import MRPageRankPhaseTwo

# Driver: count nodes, initialise every node with mass 1/N, then run a fixed
# number of two-phase PageRank iterations, rewriting STATE_PATH in place.
# TODO: no convergence check is performed; the loop always runs
# NUM_ITERATIONS times.

# Scratch file through which MRPageRankPhaseOne's reducer reports dangling
# mass. Must stay identical to the path used inside that job.
DANGLING_MASS_PATH = '/home/ubuntu/repos/261/week8/HW8/danglingmass.txt'
GRAPH_PATH = 'PageRank-test.txt'
STATE_PATH = 'PageRankLooping.txt'
NUM_ITERATIONS = 30

damping_factor = .15

verbose = True


def _run_and_save(job, out_path, echo=False):
    """Run ``job``, write its output as ``<key> <value>`` lines, return total mass.

    ``out_path`` may be the job's own input file: it is only opened for
    writing after ``runner.run()`` has returned.  The returned total is the
    sum of ``value[1]`` over all output records.
    """
    total = 0.0
    with job.make_runner() as runner:
        runner.run()
        with open(out_path, 'w') as outfile:
            for line in runner.stream_output():
                key, value = job.parse_output_line(line)
                if echo:
                    print(str(key) + ' ' + str(value))
                total += value[1]
                outfile.write(str(key) + ' ' + str(value) + '\n')
    return total


def _drain_dangling_mass(path):
    """Return the sum of the masses recorded in ``path`` and empty the file."""
    with open(path, 'r') as massfile:
        total = sum((float(line) for line in massfile if line.strip()), 0.0)
    with open(path, 'w'):
        pass
    return total


# Start from an empty dangling-mass file.
with open(DANGLING_MASS_PATH, 'w'):
    pass

total_nodes = 0
mrjob_count_nodes = MRCountNodes(args=[GRAPH_PATH, '--no-strict-protocol'])
with mrjob_count_nodes.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mrjob_count_nodes.parse_output_line(line)
        total_nodes = value

if verbose:
    print('\nTotal nodes: ' + str(total_nodes) + '\n')

if not total_nodes:
    # Fail with a clear message instead of a ZeroDivisionError below.
    raise ValueError('No nodes found in ' + GRAPH_PATH)

# Options are passed as strings; repr() keeps full float precision.
mrjob_prep_data = MRPrepData(args=[GRAPH_PATH, '--no-strict-protocol',
                                   '--initmass', repr(1.0 / total_nodes)])
_run_and_save(mrjob_prep_data, STATE_PATH, echo=verbose)

if verbose:
    print('\nLooping\n')

for i in range(NUM_ITERATIONS):
    if verbose:
        print('\nIteration ' + str(i) + '\n')

    # Phase one: forward mass along edges; dangling mass goes to the file.
    mrjob_pagerank = MRPageRankPhaseOne(args=[STATE_PATH, '--no-strict-protocol'])
    total_mass = _run_and_save(mrjob_pagerank, STATE_PATH)
    if verbose:
        print("Total phase 1 prMass: " + str(total_mass))

    total_danglingmass = _drain_dangling_mass(DANGLING_MASS_PATH)

    if verbose:
        print('Phase Two options: ')
        print('\tTotal nodes: ' + str(total_nodes))
        print('\tDamping factor: ' + str(damping_factor))
        print('\tDangling mass: ' + str(total_danglingmass))
        print('\tDangling Mass + existing mass: ' + str(total_danglingmass) + ' + ' + str(total_mass) + ' = ' + str(total_danglingmass + total_mass))

    # Phase two: redistribute dangling mass and apply the random jump.
    mrjob_pagerank_phase_two = MRPageRankPhaseTwo(args=[
        STATE_PATH,
        '--totalnodes', str(total_nodes),
        '--dampingfactor', repr(damping_factor),
        '--danglingmass', repr(total_danglingmass),
        '--no-strict-protocol'])
    total_mass = _run_and_save(mrjob_pagerank_phase_two, STATE_PATH)
    if verbose:
        print("Total phase 2 prMass: " + str(total_mass))

if verbose:
    print('\n\n====\n\n')

with open(STATE_PATH, 'r') as results_file:
    for line in results_file:
        print(line.strip())



Total nodes: 11

A [u'{}', 0.09090909090909091]
B [u"{'C': 1}", 0.09090909090909091]
C [u"{'B': 1}", 0.09090909090909091]
D [u"{'A': 1, 'B': 1}", 0.09090909090909091]
E [u"{'B': 1, 'D': 1, 'F': 1}", 0.09090909090909091]
F [u"{'B': 1, 'E': 1}", 0.09090909090909091]
G [u"{'B': 1, 'E': 1}", 0.09090909090909091]
H [u"{'B': 1, 'E': 1}", 0.09090909090909091]
I [u"{'B': 1, 'E': 1}", 0.09090909090909091]
J [u"{'E': 1}", 0.09090909090909091]
K [u"{'E': 1}", 0.09090909090909091]

Looping


Iteration 0

Total phase 1 prMass: 0.909090909091
Phase Two options: 
	Total nodes: 11
	Damping factor: 0.15
	Dangling mass: 0.0909090909091
	Dangling Mass + existing mass: 0.0909090909091 + 0.909090909091 = 1.0
Total phase 2 prMass: 1.0

Iteration 1

Total phase 1 prMass: 0.940702479339
Phase Two options: 
	Total nodes: 11
	Damping factor: 0.15
	Dangling mass: 0.0592975206612
	Dangling Mass + existing mass: 0.0592975206612 + 0.940702479339 = 1.0
Total phase 2 prMass: 1.0

Iteration 2

Total phase 1 prMass: 0

In [23]:
!pwd

/home/ubuntu/repos/261/week8/HW8
