In [1]:
%load_ext autoreload
%autoreload 2

Pagerank steps:

1. Count nodes
2. Initialize nodes with 1/N
3. Read sink mass from file/S3 and distribute it
4. Forward mass
    3b. Record sink mass in file/S3
5. Check convergence

In [2]:
%%writefile MRCountNodes.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRCountNodes(MRJob):
    def mapper(self, _, line):
        yield None, 1
        
    def reducer(self, key, values):
        yield None, sum(values)
        
    def steps(self):
        return [
            MRStep(mapper=self.mapper,
            reducer=self.reducer)
        ]
    
    
if __name__ == '__main__':
    MRCountNodes.run()

Overwriting MRCountNodes.py


In [3]:
%%writefile MRPrepData.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRPrepData(MRJob):
    
    def configure_options(self):
        super(MRPrepData, self).configure_options()
        self.add_passthrough_option('--initmass', default=None, type=float)
        
        
    def mapper(self, _, line):
        splits = line.split('\t')
        yield splits[0], [splits[1], self.options.initmass]

    def steps(self):
        return [
            MRStep(mapper=self.mapper)
        ]
    
    
if __name__ == '__main__':
    MRPrepData.run()

Overwriting MRPrepData.py


In [4]:
from MRCountNodes import MRCountNodes
from MRPrepData import MRPrepData

total_nodes = 0
mrjob_count_nodes = MRCountNodes(args=['PageRank-test.txt', '--no-strict-protocol'])
with mrjob_count_nodes.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mrjob_count_nodes.parse_output_line(line)
        total_nodes = value

mrjob_prep_data = MRPrepData(args=['PageRank-test.txt', '--no-strict-protocol', '--initmass', 1.0/total_nodes])
with mrjob_prep_data.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mrjob_prep_data.parse_output_line(line)
        print key, value

B [u"{'C': 1}", 0.1]
C [u"{'B': 1}", 0.1]
D [u"{'A': 1, 'B': 1}", 0.1]
E [u"{'D': 1, 'B': 1, 'F': 1}", 0.1]
F [u"{'B': 1, 'E': 1}", 0.1]
G [u"{'B': 1, 'E': 1}", 0.1]
H [u"{'B': 1, 'E': 1}", 0.1]
I [u"{'B': 1, 'E': 1}", 0.1]
J [u"{'E': 1}", 0.1]
K [u"{'E': 1}", 0.1]


Pagerank steps:

1. Count nodes
2. Initialize nodes with 1/N
3. Read sink mass from file/S3 and distribute it
4. Forward mass
    3b. Record sink mass in file/S3
5. Check convergence

In [5]:
%%writefile MRPageRank.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import sys, re

class MRPageRank(MRJob):
    def configure_options(self):
        super(MRPageRank, self).configure_options()
        self.add_passthrough_option('--danglingmass', default=None, type=float)
        self.add_passthrough_option('--dampedmass', default=None, type=float)
        

    def mapper_init():
        self.danglingmass = 0.0
        
    def mapper(self, _, line):
        splits = line.split(' ', 1)
        key = splits[0]
        #load the values as an array
        value = eval(splits[1])
        #load the edges as a dictionary
        edges = eval(value[0])
        pr = float(value[1]) + self.options.danglingmass + self.options.dampedmass
        if len(edges) == 0:
            self.danglingmass += pr
        else:
            forwarding_pr = pr/len(edges)
            for edge, weight in edges.iteritems():
                yield edge, forwarding_pr
                
    def mapper_final():
        massfile = open('danglingmass.txt', 'a')
        massfile.write(str(self.danglingmass) + '\n')
        massfile.close()
        
    def reducer(self, key, values):
        
        

            
        

/bin/sh: 1: ll: not found


In [5]:
testDict = {'a':1, 'b':2, 'c':3}
print len(testDict)

3
