In [2]:
#Reload changes -> always run this
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 9.1: MRJob Implementation of Basic PageRank

Write a basic MRJob implementation of the iterative PageRank algorithm
that takes sparse adjacency lists as input (as explored in HW 7).
Make sure that your implementation utilizes teleportation ((1 - damping) / the number of nodes in the network),
and further, distributes the mass of dangling nodes with each iteration
so that the output of each iteration is correctly normalized (sums to 1).
[NOTE: The PageRank algorithm assumes that a random surfer (walker), starting from a random web page,
chooses the next page to which it will move by clicking at random, with probability d,
one of the hyperlinks in the current page. This probability is represented by a so-called
‘damping factor’ d, where d ∈ (0, 1). Otherwise, with probability (1 − d), the surfer
jumps to any web page in the network. If a page is a dangling end, meaning it has no
outgoing hyperlinks, the random surfer selects an arbitrary web page from a uniform
distribution and “teleports” to that page]


As you build your code, use the test data

s3://ucb-mids-mls-networks/PageRank-test.txt
Or under the Data Subfolder for HW7 on Dropbox with the same file name. 
(On Dropbox https://www.dropbox.com/sh/2c0k5adwz36lkcw/AAAAKsjQfF9uHfv-X9mCqr9wa?dl=0)

with the teleportation parameter set to 0.15 (i.e., 1 - d, where the damping factor d is set to 0.85), and cross-check
your work with the true result, displayed in the first image
in the Wikipedia article:

https://en.wikipedia.org/wiki/PageRank

and here for reference are the corresponding PageRank probabilities:

A,0.033
B,0.384
C,0.343
D,0.039
E,0.081
F,0.039
G,0.016
H,0.016
I,0.016
J,0.016
K,0.016


In [49]:
%%writefile numberNodesMR.py

from mrjob.job import MRJob
from mrjob.step import MRStep

class NumberNodes(MRJob):
    """Count the distinct nodes of an adjacency-list graph file.

    Every input line has the form ``<node><TAB><dict literal keyed by neighbor id>``.
    Step 1 collapses every mention of a node (as a source or as a neighbor)
    into a single record; step 2 sums those records under one key, so the
    job's only output record is ``(None, <number of distinct nodes>)``.
    """

    def mapper1(self, _, line):
        """Emit ``(node, 1)`` for the line's own node and for each neighbor key."""
        fields = line.split('\t')

        source = fields[0]
        # NOTE(review): eval() executes whatever expression the input line
        # contains; only run this job on trusted graph files.
        adjacency = eval(fields[1])
        yield source, 1
        for target in adjacency.keys():
            yield target, 1

    def reducer1(self, key, values):
        """Emit exactly one record per distinct node id, ignoring how often it was seen."""
        yield key, 1

    def mapper2(self, key, values):
        """Re-key every node record to ``None`` so they all reach one reducer call."""
        yield None, 1

    def reducer2(self, key, values):
        """Sum the per-node ones into the total node count."""
        yield None, sum(values)

    def steps(self):
        """Return the two-step pipeline: de-duplicate node ids, then count them."""
        dedupe_step = MRStep(mapper = self.mapper1, reducer = self.reducer1)
        count_step = MRStep(mapper = self.mapper2, reducer = self.reducer2)
        return [dedupe_step, count_step]
    
# Script entry point: when the file is executed directly, let mrjob run the job.
if __name__ == "__main__":
    NumberNodes.run()

Writing numberNodesMR.py


In [3]:
from numberNodesMR import NumberNodes

# Run the NumberNodes job on the test graph and print the node count it reports.
filename = 'PageRank-test.txt'
mr_job = NumberNodes(args = [filename])

with mr_job.make_runner() as runner:
    runner.run()
    print("Number of nodes in " + filename)
    # Each output line decodes to a (key, value) pair; the value is the count.
    for line in runner.stream_output():
        print(mr_job.parse_output_line(line)[1])



Number of nodes in PageRank-test.txt
11


In [17]:
%%writefile initPR.py

from mrjob.job import MRJob
from mrjob.step import MRStep

class InitPRJob(MRJob):
    """Attach a uniform starting PageRank to every node of an adjacency-list graph.

    Input lines have the form ``<node><TAB><dict literal keyed by neighbor id>``.
    The job emits one record per node, ``node -> (1 / numNodes, adjacency dict)``;
    a node that is only ever seen as a neighbor gets an empty adjacency dict.
    """

    def configure_options(self):
        """Register the ``--numNodes`` pass-through option (parsed as float, default 10)."""
        super(InitPRJob, self).configure_options()
        self.add_passthrough_option(
            '--numNodes',
            type = float,
            default = 10,
            help = 'Number of Nodes in Graph')

    def mapper(self, _, line):
        """Emit an empty placeholder for each neighbor, then the node's own adjacency dict."""
        fields = line.split('\t')
        source = fields[0]
        # NOTE(review): eval() executes whatever expression the input line
        # contains; only run this job on trusted graph files.
        out_links = eval(fields[1])
        # The placeholders make sure nodes without a line of their own still
        # reach the reducer.
        for target in out_links.keys():
            yield target, {}
        yield source, out_links

    def reducer(self, key, values):
        """Keep the node's non-empty adjacency dict (if any) and pair it with 1/numNodes."""
        out_links = {}
        for candidate in values:
            # Placeholders are empty; the last non-empty dict seen wins.
            if len(candidate) != 0:
                out_links = candidate

        initial_rank = 1.0 / self.options.numNodes

        yield key, (initial_rank, out_links)

    def steps(self):
        """Return the single map/reduce step of this job."""
        only_step = MRStep(mapper = self.mapper, reducer = self.reducer)
        return [only_step]

# Script entry point: when the file is executed directly, let mrjob run the job.
if __name__ == "__main__":
    InitPRJob.run()

Overwriting initPR.py


In [18]:
from initPR import InitPRJob

# Run InitPRJob on the test graph and save its raw output lines to
# 'initData.txt', the file the PageRank cells below read as input.
mr_job = InitPRJob(args = ['PageRank-test.txt', '--numNodes', '11'])

with open('initData.txt', 'w+') as myfile, mr_job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        myfile.write(line)



In [7]:
%%writefile pageRank.py

from mrjob.job import MRJob
from mrjob.step import MRStep

class PageRankMR(MRJob):
    """One PageRank iteration with teleportation and dangling-mass redistribution.

    Input lines have the form ``<node id><TAB>[<page rank>, {<neighbor>: ...}]``,
    i.e. the records written by InitPRJob or by a previous run of this job,
    so the output of one run can be fed straight into the next.

    Step 1 (mapper1 / reducer1) spreads each node's rank evenly over its
    out-links and collects the rank of dangling nodes under the special key
    ``'*'``. Step 2 (reducer2) applies

        new_rank = alpha / N + (1 - alpha) * (dangling_mass / N + received_rank)

    where ``alpha`` is the teleportation probability and ``N`` is ``--numNodes``.

    NOTE(review): '*' is reserved as the dangling-mass key; a real node with
    that id would be mistaken for it.
    """

    def configure_options(self):
        """Register ``--alpha`` (float, default 0.15) and ``--numNodes`` (int, default 10)."""
        super(PageRankMR, self).configure_options()
        self.add_passthrough_option('--alpha', type = float, default = 0.15, help = "Alpha")
        self.add_passthrough_option('--numNodes', type = int, default = 10, help = 'Number Nodes')

    def mapper1(self, _, line):
        """Emit the node's adjacency dict plus its rank contributions.

        A dangling node (no out-links) contributes its whole rank to ``'*'``;
        any other node sends ``rank / out_degree`` to each neighbor.
        """
        data = line.split('\t')
        # NOTE(review): eval() executes whatever expression the input line
        # contains; only feed this job files produced by trusted jobs.
        nid = eval(data[0])
        data2 = eval(data[1])
        curPr = float(data2[0])
        neighbors = data2[1]

        # Always pass the graph structure along. Previously dangling nodes did
        # not emit it, so a dangling node without in-links produced no record
        # under its own key and silently dropped out of the graph.
        yield nid, neighbors

        if len(neighbors) == 0:
            yield '*', curPr
        else:
            share = curPr/len(neighbors)
            for neighbor in neighbors.keys():
                yield neighbor, share

    def reducer_init(self):
        """Reset the dangling-mass accumulator of a step-1 reducer task."""
        self.mass = 0

    def reducer1(self, key, values):
        """Sum the rank received by a node, or accumulate dangling mass for ``'*'``.

        For a node key the values are a mix of one adjacency dict and any
        number of numeric contributions; the result is ``key -> (sum, dict)``.
        Dangling mass is not emitted here but in ``reducer_final``.
        """
        if key == '*':
            for value in values:
                self.mass += value
        else:
            newPr = float(0)
            adj = {}

            for value in values:
                if isinstance(value, dict):
                    adj = value
                else:
                    newPr += value

            yield key, (newPr, adj)

    def reducer_final(self):
        """Emit the dangling mass collected by this step-1 reducer task."""
        yield '*', self.mass

    def reducer2_init(self):
        """Reset the dangling-mass accumulator of a step-2 reducer task."""
        self.mass = 0.0

    def reducer2(self, key, values):
        """Combine received rank, dangling share and teleportation into the new rank.

        NOTE(review): the node branch reads ``self.mass``, which is only filled
        by the '*' branch. This presumably relies on '*' sorting ahead of the
        node keys and on a single reducer task seeing every key - confirm
        before running step 2 with more than one reducer.
        """
        if key == '*':
            # Step 1 may emit several partial masses (one per reducer task).
            for value in values:
                self.mass += value
        else:
            # reducer1 emits exactly one (rank, adjacency) record per node.
            valList = next(iter(values))
            curPr = float(valList[0])
            neighbors = valList[1]

            numNodes = float(self.options.numNodes)
            alpha = self.options.alpha
            danglingShare = self.mass/numNodes

            newPr = (alpha/numNodes) + ((1-alpha) * (danglingShare + curPr))

            yield key, (newPr, neighbors)

    def steps(self):
        """Return the two steps: distribute rank, then normalise it."""
        return [
            MRStep(mapper = self.mapper1,
                   reducer_init = self.reducer_init,
                   reducer = self.reducer1,
                   reducer_final = self.reducer_final),
            # Step 2 has its own initialiser; it was defined but never wired in.
            MRStep(reducer_init = self.reducer2_init, reducer = self.reducer2)
        ]
        
# Script entry point: when the file is executed directly, let mrjob run the job.
if __name__ == "__main__":
    PageRankMR.run()

Overwriting pageRank.py


In [8]:
from pageRank import PageRankMR

# Run a single PageRank iteration on the initialised graph and print every
# decoded (node, [rank, adjacency]) output record.
mr_job = PageRankMR(args = ['initData.txt', '--alpha', '0.15', '--numNodes', '11'])

with mr_job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        print(mr_job.parse_output_line(line))



('A', [0.059297520661157024, {}])
('B', [0.3168732782369146, {'C': 1}])
('C', [0.09793388429752066, {'B': 1}])
('D', [0.046418732782369146, {'A': 1, 'B': 1}])
('E', [0.32975206611570246, {'B': 1, 'D': 1, 'F': 1}])
('F', [0.046418732782369146, {'B': 1, 'E': 1}])
('G', [0.02066115702479339, {'B': 1, 'E': 1}])
('H', [0.02066115702479339, {'B': 1, 'E': 1}])
('I', [0.02066115702479339, {'B': 1, 'E': 1}])
('J', [0.02066115702479339, {'E': 1}])
('K', [0.02066115702479339, {'E': 1}])


In [51]:
from initPR import InitPRJob
from pageRank import PageRankMR
import os

# Iterate PageRankMR a fixed number of times, feeding each pass the output of
# the previous one. The first pass reads 'initData.txt', which must already
# exist (it is written by the InitPRJob cell above, run with --numNodes 11).
NUM_ITERATIONS = 40
ALPHA = '0.15'      # teleportation probability, i.e. 1 - damping factor
NUM_NODES = '11'    # node count of PageRank-test.txt

input_file = 'initData.txt'
iteration = 0

while iteration < NUM_ITERATIONS:
    mr_job = PageRankMR(args = [input_file, '--alpha', ALPHA, '--numNodes', NUM_NODES])
    # Write to a scratch file: from the second pass on, the job's input is
    # 'interResults.txt' itself, so it must not be truncated while in use.
    with open('newFile.txt', 'w+') as myfile:
        with mr_job.make_runner() as runner:
            runner.run()
            for line in runner.stream_output():
                myfile.write(line)
    os.rename('newFile.txt', 'interResults.txt')
    input_file = 'interResults.txt'
    iteration += 1

print("All done")




All done


## 9.2: Exploring PageRank teleportation and network plots

In order to overcome  problems such as disconnected components, the damping factor (a typical value for d is 0.85) can be varied. 

Using the graph in HW1, plot the test graph (using networkx, https://networkx.github.io/) for several values of the damping parameter alpha, so that each node's radius is proportional to its PageRank score. 
In particular you should do this for the following damping factors: [0,0.25,0.5,0.75, 0.85, 1]. Note your plots should look like the following:

https://en.wikipedia.org/wiki/PageRank#/media/File:PageRanks-Example.svg