In [1]:
import os
import random
import re
import copy

# Damping factor passed as `damping_factor` to both PageRank estimators below.
DAMPING = 0.85
# Number of samples passed as `n` to sample_pagerank below.
SAMPLES = 10000

def crawl(directory):
    """
    Parse every `.html` file in `directory` and build the link graph.

    Returns a dict mapping each HTML filename to the set of other HTML
    filenames in the same directory that it links to. Self-links and
    links to files that were not crawled are dropped.
    """
    href_pattern = r"<a\s+(?:[^>]*?)href=\"([^\"]*)\""
    outgoing = {}

    for name in os.listdir(directory):
        if name.endswith(".html"):
            with open(os.path.join(directory, name)) as handle:
                hrefs = re.findall(href_pattern, handle.read())
            print(f"Found links in {name}: {hrefs}")  # Debugging output
            # A page linking to itself does not count as an outgoing link.
            outgoing[name] = set(hrefs) - {name}

    # Second pass: keep only targets that are themselves crawled pages.
    return {
        name: {target for target in targets if target in outgoing}
        for name, targets in outgoing.items()
    }

def transition_model(corpus, page, damping_factor):
    """
    Build the next-page probability distribution for a surfer on `page`.

    If `page` has no outgoing links, every page in the corpus is equally
    likely. Otherwise every page receives an equal share of
    `1 - damping_factor`, and each page linked from `page` additionally
    receives an equal share of `damping_factor`.
    """
    outgoing = corpus[page]
    page_count = len(corpus)

    # Dead end: jump to any page in the corpus with equal probability.
    if not outgoing:
        return dict.fromkeys(corpus, 1.0 / page_count)

    probabilities = dict.fromkeys(corpus, (1 - damping_factor) / page_count)
    follow_share = damping_factor / len(outgoing)
    for target in outgoing:
        probabilities[target] += follow_share
    return probabilities

def sample_pagerank(corpus, damping_factor, n):
    """
    Estimate PageRank by simulating a random surfer for `n` page visits.

    The first page is chosen uniformly at random from the corpus. Every
    later page is drawn from `transition_model` for the page currently
    being visited. A page's estimated rank is the fraction of the `n`
    visits that landed on it, so the returned values sum to 1.

    Returns a dict mapping each page name to its estimated rank. When
    `n` is less than 1 no samples are taken and every rank is 0.0.
    """
    pages = list(corpus)
    if n < 1:
        return dict.fromkeys(pages, 0.0)

    visits = dict.fromkeys(pages, 0)

    # The first sample has no predecessor, so it is drawn uniformly.
    page = random.choice(pages)
    visits[page] += 1

    # The remaining n - 1 samples each depend only on the previous page.
    for _ in range(n - 1):
        model = transition_model(corpus, page, damping_factor)
        weights = [model[candidate] for candidate in pages]
        page = random.choices(pages, weights=weights, k=1)[0]
        visits[page] += 1

    return {name: count / n for name, count in visits.items()}

def iterate_pagerank(corpus, damping_factor, tolerance=0.001):
    """
    Compute PageRank by repeatedly re-evaluating each page's rank.

    Every page starts at `1 / N`, where N is the number of pages. On each
    sweep a page's rank becomes
    `(1 - damping_factor) / N + damping_factor * get_sum(...)`.
    Ranks are updated in place, so pages later in a sweep already see the
    new values of earlier pages. Sweeps repeat until no page's rank moves
    by more than `tolerance` within a single sweep.

    Returns a dict mapping each page name to its rank. An empty corpus
    yields an empty dict.
    """
    total_pages = len(corpus)
    if total_pages == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return {}

    distribution = dict.fromkeys(corpus, 1.0 / total_pages)
    random_jump = (1 - damping_factor) / total_pages

    converged = False
    while not converged:
        converged = True
        for page in corpus:
            previous = distribution[page]
            distribution[page] = random_jump + \
                (damping_factor * get_sum(corpus, distribution, page))
            if abs(previous - distribution[page]) > tolerance:
                converged = False

    return distribution

def get_sum(corpus, distribution, page):
    """
    Return the rank that flows into `page` from the rest of the corpus.

    Each page `p` that links to `page` contributes its current rank
    divided by its number of outgoing links. A page with no outgoing
    links is treated as linking to every page in the corpus (itself
    included), matching how `transition_model` handles such pages, so it
    contributes its rank divided by the corpus size.
    """
    total_pages = len(corpus)
    result = 0
    for p, links in corpus.items():
        if not links:
            # Dangling page: spread its rank evenly over all pages so that
            # no rank mass is lost.
            result += distribution[p] / total_pages
        elif page in links:
            result += distribution[p] / len(links)
    return result

# Crawl the sample site and show the resulting link graph.
directory = "site"
corpus = crawl(directory)
print(corpus)

# Estimate PageRank with both approaches.
ranks_sampled = sample_pagerank(corpus, DAMPING, SAMPLES)
ranks_iterated = iterate_pagerank(corpus, DAMPING)

# Print one report per approach, each page's rank to four decimal places.
for heading, ranks in (
    ("PageRank Results from Sampling", ranks_sampled),
    ("\nPageRank Results from Iteration", ranks_iterated),
):
    print(heading)
    for page, rank in ranks.items():
        print(f"{page}: {rank:.4f}")


Found links in c.html: ['programming.html']
Found links in 2.html: ['1.html', '3.html']
Found links in ai.html: ['inference.html', 'algorithms.html']
Found links in tictactoe.html: ['games.html', 'minimax.html']
Found links in 3.html: ['2.html', '4.html']
Found links in dfs.html: ['bfs.html', 'search.html']
Found links in minimax.html: ['search.html', 'games.html']
Found links in algorithms.html: ['programming.html', 'recursion.html']
Found links in programming.html: ['c.html', 'python.html']
Found links in 1.html: ['2.html']
Found links in python.html: ['programming.html', 'ai.html']
Found links in recursion.html: ['recursion.html']
Found links in minesweeper.html: ['games.html']
Found links in inference.html: ['ai.html']
Found links in 4.html: ['2.html']
Found links in games.html: ['tictactoe.html', 'minesweeper.html']
Found links in logic.html: ['inference.html']
Found links in search.html: ['dfs.html', 'bfs.html', 'minimax.html']
Found links in bfs.html: ['search.html']
{'c.html': 