In [None]:
class PageRank:
    """Iterative PageRank over a directed link graph.

    The rank of a page is updated on every iteration as
    ``(1 - d) + d * sum(rank(q) / outlinks(q))`` over every page ``q`` that
    links to it, using the ranks of the previous iteration.
    """

    def __init__(self, pages, d=0.85, initial_guess=1.0):
        """
        Builds the lookup tables needed by :meth:`calculate`.

        :param pages: A graph of pages: a mapping from each page to the
            sized collection of pages it links to
        :param d: The dampening factor
        :param initial_guess: The initial value for page rank
        """
        self.d = d
        self.pages = pages
        # page -> list of pages linking to it (one entry per link)
        self.inverse_pages = {}
        # page -> number of links leaving it (only for keys of ``pages``)
        self.outlinks = {}
        # page -> number of links pointing at it
        self.inlinks = {}
        self.all_pages = set()

        for page in pages:
            targets = pages[page]
            self.outlinks[page] = len(targets)
            self.all_pages.add(page)
            for out in targets:
                # Link targets count as pages even when they are not a key
                # of ``pages`` themselves.
                self.all_pages.add(out)
                self.inlinks[out] = self.inlinks.get(out, 0) + 1
                self.inverse_pages.setdefault(out, []).append(page)

        # page -> current rank, for every page seen as a source or a target
        self.pr = dict.fromkeys(self.all_pages, initial_guess)

    def calculate(self, iters, report=False, report_every=1, rep_function=None):
        """
        Calculates the page rank of the given page graph

        The ranks in ``self.pr`` are updated in place; nothing is returned.

        :param iters: Number of iterations
        :param report: Whether you want the code to report progress
        :param report_every: Report only on iterations that are a multiple
            of this value (must be non-zero when ``report`` is true)
        :param rep_function: Optional callback invoked as
            ``rep_function(iteration, ranks)``; when omitted the iteration
            number and the rank dictionary are printed to the console
        :return: None
        """
        damping = self.d
        base_rank = 1 - damping
        for i in range(1, iters + 1):
            # Every update of this iteration must read the ranks of the
            # previous iteration, so work from a snapshot.
            previous = dict(self.pr)
            for page in self.pr:
                incoming = 0
                for source in self.inverse_pages.get(page, ()):
                    incoming += previous[source] / self.outlinks[source]
                self.pr[page] = base_rank + incoming * damping
            if report and i % report_every == 0:
                if rep_function is not None:
                    rep_function(i, self.pr)
                else:
                    print("Iteration %d" % i)
                    print(self.pr)

    def get_avg(self):
        """
        Returns the mean of the current page ranks

        :return: The average rank, or 0.0 when the graph has no pages
        """
        if not self.pr:
            # An empty graph would otherwise divide by zero.
            return 0.0
        total = 0
        for rank in self.pr.values():
            total += rank
        return total / len(self.pr)
    
# Console reporter: prints one iteration's ranks, one page per line
def report(iteration, page_rank):
    """
    Prints the iteration number followed by every page and its rank

    :param iteration: Number of the iteration being reported
    :param page_rank: Mapping from page to its current rank
    """
    print("Iteration %d" % iteration)
    for page, rank in page_rank.items():
        # Ranks are shown with six decimal places.
        print("\t%s: %0.6f" % (page, rank))

In [None]:
import re

# Build the link graph from the dump sample.
# The file is read as consecutive 14-line records:
#   * line 1 of a record is split on single spaces; field 2 is parsed as the
#     revision number and field 3 is used as the page key
#   * line 4 of a record, minus its first four characters, is the link list
#     (presumably a "MAIN" prefix followed by link targets - confirm against
#     the dump format)
# Only the links of the highest revision seen so far are kept for each page.
nodes = {}          # page key -> list of link targets
revisions = {}      # page key -> highest revision number stored so far
currentRevision = 0
parent = ''
with open("enwiki-20080103-sample.txt", "r") as wikiinput:
    for line_number, line in enumerate(wikiinput, start=1):
        # Position inside the current record: 1..13, then 0 for the 14th line.
        position = line_number % 14
        if position == 1:
            fields = line.split(' ')
            parent = fields[3]
            currentRevision = int(fields[2].strip())
        elif position == 4:
            if currentRevision < revisions.get(parent, 0):
                # A newer revision of this page has already been recorded.
                continue
            revisions[parent] = currentRevision
            link_text = line[4:].strip()
            # The character class splits on a space, a '|' or a tab.
            nodes[parent] = re.split("[ |\t]", link_text) if link_text != '' else []

In [None]:
import json
# Persist the link graph as JSON, indented by four spaces with sorted keys.
with open("nodes.json", mode="w") as nodes_file:
    json.dump(obj=nodes, fp=nodes_file, indent=4, sort_keys=True)

In [None]:
# Rank the parsed link graph with 20 iterations.
# Reporting is switched off here, so the reporter callback and the interval
# are passed along but never used during this run.
pr = PageRank(pages=nodes)
pr.calculate(iters=20, report=False, report_every=10, rep_function=report)

In [None]:
# Write the final ranks, one "<page> <rank>" line per page, in sorted page order.
with open("PageRankResults_round0.txt", "w") as results:
    results.writelines(
        " ".join((page, str(pr.pr[page]))) + "\n"
        for page in sorted(pr.pr)
    )