In [1]:
%load_ext autoreload
%autoreload 2

Pagerank steps:

1. Count nodes
2. Initialize nodes with 1/N
3. Read sink mass from S3 and distribute it
4. Forward mass
    3b. Record sink mass in S3
5. Check convergence

In [2]:
%%writefile MRCountNodes.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRCountNodes(MRJob):
    def mapper(self, _, line):
        yield None, 1
        
    def reducer(self, key, values):
        yield None, sum(values)
        
    def steps(self):
        return [
            MRStep(mapper=self.mapper,
            reducer=self.reducer)
        ]
    
    
if __name__ == '__main__':
    MRCountNodes.run()

Overwriting MRCountNodes.py


In [3]:
%%writefile MRPrepData.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRPrepData(MRJob):
    
    def configure_options(self):
        super(MRPrepData, self).configure_options()
        self.add_passthrough_option('--initmass', default=None, type=float)
        
        
    def mapper(self, _, line):
        splits = line.split('\t')
        yield splits[0], [splits[1], self.options.initmass]

    def steps(self):
        return [
            MRStep(mapper=self.mapper)
        ]
    
    
if __name__ == '__main__':
    MRPrepData.run()

Overwriting MRPrepData.py


In [4]:
from MRCountNodes import MRCountNodes
from MRPrepData import MRPrepData

total_nodes = 0
mrjob_count_nodes = MRCountNodes(args=['PageRank-test.txt', '--no-strict-protocol'])
with mrjob_count_nodes.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mrjob_count_nodes.parse_output_line(line)
        total_nodes = value

mrjob_prep_data = MRPrepData(args=['PageRank-test.txt', '--no-strict-protocol', '--initmass', 1.0/total_nodes])
with mrjob_prep_data.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mrjob_prep_data.parse_output_line(line)
        print key, value

B [u"{'C': 1}", 0.1]
C [u"{'B': 1}", 0.1]
D [u"{'A': 1, 'B': 1}", 0.1]
E [u"{'D': 1, 'B': 1, 'F': 1}", 0.1]
F [u"{'B': 1, 'E': 1}", 0.1]
G [u"{'B': 1, 'E': 1}", 0.1]
H [u"{'B': 1, 'E': 1}", 0.1]
I [u"{'B': 1, 'E': 1}", 0.1]
J [u"{'E': 1}", 0.1]
K [u"{'E': 1}", 0.1]


In [4]:
!pwd

/home/ubuntu/repos/261/week8/HW8


In [5]:
!ll

/bin/sh: 1: ll: not found


In [7]:
!ls -l

total 16
-rw-rw-r-- 1 ubuntu ubuntu 2992 Mar 14 01:59 HW9.ipynb
-rw-rw-r-- 1 ubuntu ubuntu  392 Mar 14 01:42 MRCountNodes.py
-rw-rw-r-- 1 ubuntu ubuntu  532 Mar 14 01:57 MRPrepData.py
-rw-rw-r-- 1 ubuntu ubuntu  166 Mar 14 02:00 PageRank-test.txt
