In [1]:
%load_ext autoreload
%autoreload 2

Pagerank steps:

1. Count nodes
2. Initialize nodes with 1/N
3. Read sink mass from file/S3 and distribute it
4. Forward mass
    3b. Record sink mass in file/S3
5. Check convergence

In [2]:
%%writefile MRCountNodes.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRCountNodes(MRJob):
    def mapper(self, _, line):
        yield None, 1
        
    def reducer(self, key, values):
        yield None, sum(values)
        
    def steps(self):
        return [
            MRStep(mapper=self.mapper,
            reducer=self.reducer)
        ]
    
    
if __name__ == '__main__':
    MRCountNodes.run()

Overwriting MRCountNodes.py


In [3]:
%%writefile MRPrepData.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRPrepData(MRJob):
    
    def configure_options(self):
        super(MRPrepData, self).configure_options()
        self.add_passthrough_option('--initmass', default=None, type=float)
        
        
    def mapper(self, _, line):
        splits = line.split('\t')
        yield splits[0], [splits[1], self.options.initmass]
        
        
    def steps(self):
        return [
            MRStep(mapper=self.mapper)
        ]
    
    
if __name__ == '__main__':
    MRPrepData.run()

Overwriting MRPrepData.py


In [14]:
from MRCountNodes import MRCountNodes
from MRPrepData import MRPrepData
from MRPageRankPhaseOne import MRPageRankPhaseOne
from MRPageRankPhaseTwo import MRPageRankPhaseTwo

tempFile = open('/home/ubuntu/repos/261/week8/HW8/danglingmass.txt', 'w')
tempFile.close()

damping_factor = .15

total_nodes = 0
mrjob_count_nodes = MRCountNodes(args=['PageRank-test.txt', '--no-strict-protocol'])
with mrjob_count_nodes.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mrjob_count_nodes.parse_output_line(line)
        total_nodes = value
        

mrjob_prep_data = MRPrepData(args=['PageRank-test.txt', '--no-strict-protocol', '--initmass', 1.0/total_nodes])
with mrjob_prep_data.make_runner() as runner:
    runner.run()
    outfile = open('PageRankLooping.txt', 'w')
    for line in runner.stream_output():
        key, value = mrjob_prep_data.parse_output_line(line)
        print key, value
        outfile.write(str(key) + ' ' + str(value) + '\n')
    outfile.close()
    
print '\nLooping\n'

for i in range(3):        
    mrjob_pagerank = MRPageRankPhaseOne(args=['PageRankLooping.txt', '--no-strict-protocol'])
    with mrjob_pagerank.make_runner() as runner:
        runner.run()
        total_mass = 0.0
        outfile = open('PageRankLooping.txt', 'w')
        for line in runner.stream_output():
            key, value = mrjob_pagerank.parse_output_line(line)
            total_mass += value[1]
#             print key, value
            outfile.write(str(key) + ' ' + str(value) + '\n')
        outfile.close()
        print "Total phase 1 prMass: " + str(total_mass)
        
    massFile = open('/home/ubuntu/repos/261/week8/HW8/danglingmass.txt', 'r')
    total_danglingmass = 0.0
    for line in massFile.readlines():
        total_danglingmass += float(line.strip())
    massFile.close()
    massFile = open('/home/ubuntu/repos/261/week8/HW8/danglingmass.txt', 'w')
    massFile.close()
        
    print 'Phase Two options: '
    print '\tTotal nodes: ' + str(total_nodes)
    print '\tDamping factor: ' + str(damping_factor)
    print '\tDangling mass: ' + str(total_danglingmass)
    
    
    mrjob_pagerank_phase_two = MRPageRankPhaseTwo(args=['PageRankLooping.txt', '--totalnodes', str(total_nodes), '--dampingfactor', str(damping_factor), '--danglingmass', str(total_danglingmass), '--no-strict-protocol'])
    with mrjob_pagerank_phase_two.make_runner() as runner:
        runner.run()
        total_mass = 0.0
        outfile = open('PageRankLooping.txt', 'w')
        for line in runner.stream_output():
            key, value = mrjob_pagerank_phase_two.parse_output_line(line)
            total_mass += value[1]
#             print key, value
            outfile.write(str(key) + ' ' + str(value) + '\n')
        outfile.close()
        print "Total phase 2 prMass: " + str(total_mass)


B [u"{'C': 1}", 0.1]
C [u"{'B': 1}", 0.1]
D [u"{'A': 1, 'B': 1}", 0.1]
E [u"{'D': 1, 'B': 1, 'F': 1}", 0.1]
F [u"{'B': 1, 'E': 1}", 0.1]
G [u"{'B': 1, 'E': 1}", 0.1]
H [u"{'B': 1, 'E': 1}", 0.1]
I [u"{'B': 1, 'E': 1}", 0.1]
J [u"{'E': 1}", 0.1]
K [u"{'E': 1}", 0.1]

Looping

Case: A {} has no outgoing edges: printing to file /home/ubuntu/repos/261/week8/HW8/danglingmass.txt
	Printed 0.05 as dangling mass
Base case: B {'C': 1}
Base case: C {'B': 1}
Base case: D {'A': 1, 'B': 1}
Base case: E {'B': 1, 'D': 1, 'F': 1}
Base case: F {'B': 1, 'E': 1}
Base case: G {'B': 1, 'E': 1}
Base case: H {'B': 1, 'E': 1}
Base case: I {'B': 1, 'E': 1}
Base case: J {'E': 1}
Base case: K {'E': 1}
Total phase 1 prMass: 1.0
Phase Two options: 
	Total nodes: 10
	Damping factor: 0.15
	Dangling mass: 0.05
Total phase 2 prMass: 1.06175
Case: A {} has no outgoing edges: printing to file /home/ubuntu/repos/261/week8/HW8/danglingmass.txt
	Printed 0.0237916666667 as dangling mass
Base case: B {'C': 1}
Base case: C {'

Pagerank steps:

1. Count nodes
2. Initialize nodes with 1/N
3. Read sink mass from file/S3 and distribute it
4. Forward mass
    3b. Record sink mass in file/S3
5. Check convergence

In [12]:
%%writefile MRPageRankPhaseOne.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import sys, os

class MRPageRankPhaseOne(MRJob):        

    def mapper_init(self):
        self.danglingmass = 0.0
        
    def mapper(self, _, line):
        splits = line.split(' ', 1)
        key = splits[0]
        #load the values as an array
        value = eval(splits[1])
        #load the edges as a dictionary
        edges = eval(value[0])
        pr = float(value[1])
        if len(edges) == 0:
            yield 'dangling', [[], pr]
        else:
            forwarding_pr = pr/len(edges)
            for edge, weight in edges.iteritems():
                yield edge, ['{}', forwarding_pr]
        yield key, [str(edges), 0.0]
                        
    def reducer(self, key, values): 
        #the case if a node is listed with no edges
        if key == 'dangling':
            print 'Key label is "dangling": printing to file'
            danglingmass = 0.0
            for value in values:
                danglingmass += float(value[1])
            massfile = open('/home/ubuntu/repos/261/week8/HW8/danglingmass.txt', 'a')
            massfile.write(str(danglingmass) + '\n')
            massfile.close()
        else:
            pr = 0.0
            edges = {}
            for value in values:
#                 print 'Value: ' + str(value) + ': ' + str(type(value))
#                 value_array = eval(value)
                pr += float(value[1])
                for edge, weight in eval(value[0]).iteritems():
                    edges[edge] = weight
            if len(edges) == 0:
                #the case when a node isn't listed except in the edge list of another graph
                massfile = open('/home/ubuntu/repos/261/week8/HW8/danglingmass.txt', 'a')
                print 'Case: ' + str(key) + ' ' + str(edges) + ' has no outgoing edges: printing to file ' + os.path.abspath(massfile.name)
                massfile.write(str(pr) + '\n')
                massfile.close()
                print '\tPrinted ' + str(pr) + ' as dangling mass'
                yield key, [str(edges), pr]
            else:
                #the base case where a node is listed and has edges
                print 'Base case: ' + str(key) + ' ' + str(edges)
                yield key, [str(edges), pr]

        
if __name__ == '__main__':
    MRPageRankPhaseOne.run()

Overwriting MRPageRankPhaseOne.py


In [13]:
%%writefile MRPageRankPhaseTwo.py
from mrjob.job import MRJob
from mrjob.step import MRStep


class MRPageRankPhaseTwo(MRJob):
    def configure_options(self):
        super(MRPageRankPhaseTwo, self).configure_options()
        self.add_passthrough_option('--danglingmass', default=None, type=str)
        self.add_passthrough_option('--totalnodes', default=None, type=str)
        self.add_passthrough_option('--dampingfactor', default=None, type=str)
        
    def mapper(self, _, line):
        splits = line.split(' ', 1)
        key = splits[0]
        #load the values as an array
        value = eval(splits[1])
        #load the edges as a dictionary
        edges = eval(value[0])
        pr = float(value[1])
        nodes = int(self.options.totalnodes)
        dampingfactor = float(self.options.dampingfactor)
        danglingmass = float(self.options.danglingmass)
        pr_prime = dampingfactor * (1.0/nodes) + (1 - dampingfactor) * (danglingmass/nodes + pr)
        yield key, [str(edges), pr_prime]
        
if __name__ == '__main__':
    MRPageRankPhaseTwo.run()

Overwriting MRPageRankPhaseTwo.py


In [5]:
testDict = {'a':1, 'b':2, 'c':3}
print len(testDict)

3


In [5]:
testList = [1,2,3]
testList2 = [3,4,5]

testList += testList2

In [6]:
testList

[1, 2, 3, 3, 4, 5]

In [23]:
!pwd

/home/ubuntu/repos/261/week8/HW8
