In [30]:
import numpy as np

# Function to Read Graph Files

In [31]:
def read_graph_file(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()

    vertex_map = {}
    for line in lines:
        for vertex in line.split():
            if vertex not in vertex_map:
                vertex_map[vertex] = len(vertex_map)

    num_vertices = len(vertex_map)
    adj_matrix = np.zeros((num_vertices, num_vertices))
    for line in lines:
        vertices = line.split()
        from_vertex_idx = vertex_map[vertices[0]]
        to_vertex_indices = [vertex_map[v] for v in vertices[1:]]
        for to_vertex_idx in to_vertex_indices:
            adj_matrix[from_vertex_idx, to_vertex_idx] = 1

    return adj_matrix

# Q1: Page Rank and Markov Chains

In [37]:
# PART (A)
M = read_graph_file('graph.txt')

row_sums = M.sum(axis=1)

P = M / row_sums[:, np.newaxis]

print(P)

[[0.         0.33333333 0.33333333 0.33333333 0.         0.        ]
 [0.         0.         0.         0.         0.5        0.5       ]
 [0.         0.25       0.         0.25       0.25       0.25      ]
 [0.33333333 0.33333333 0.         0.         0.33333333 0.        ]
 [0.5        0.         0.         0.5        0.         0.        ]
 [0.33333333 0.33333333 0.         0.         0.33333333 0.        ]]


In [39]:
# PART (B)
A = np.vstack((P.T - np.eye(6), np.ones(6)))

b = np.zeros(7)
b[-1] = 1

pi = np.linalg.lstsq(A, b, rcond=None)[0]

print(pi)

[0.2115869  0.19143577 0.07052897 0.19647355 0.21662469 0.11335013]


In [41]:
# PART (C)
d = 0.85

N = len(P)
P_prime = (1-d)/N * np.ones((N, N)) + d * P

pi_0 = np.array([1/6, 1/6, 1/6, 1/6, 1/6, 1/6])

max_abs_diff = np.inf
n = 0
while max_abs_diff > 0.01:
    n += 1
    pi_n = pi_0.dot(P_prime)
    max_abs_diff = np.max(np.abs(pi_n - pi_0))
    pi_0 = pi_n

print(pi_0)
print("Number of iterations:", n)

[0.20559467 0.1905434  0.08412452 0.19183791 0.20877479 0.11912471]
Number of iterations: 3


# Task 1: Iterative Page Rank Algorithm

In [32]:
def pagerank(M, num_iterations=100, d=0.85):
    N = M.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    M_hat = (d * M + (1 - d) / N)
    for i in range(num_iterations):
        v = M_hat @ v
    return v

In [33]:
M = read_graph_file('graph.txt')

scores = pagerank(M)
print(scores)

[[2.13079785e+37]
 [1.33049328e+37]
 [2.46980370e+37]
 [1.88762061e+37]
 [1.53974427e+37]
 [1.88762061e+37]]


# Task 2: Running the algo on given file

In [None]:
M = read_graph_file('wt2g_inlinks.txt')

scores = pagerank(M)
print(scores)

# Task 3: Web Crawler

In [36]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.robotparser import RobotFileParser

MAX_PAGES = 100

DOMAIN = 'mit.edu'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'

DELAY = 5

OUTPUT_FILE = 'output.txt'

ROBOTS_TXT_URL = 'https://www.mit.edu/robots.txt'

visited_urls = set()

urls_to_visit = ['https://www.mit.edu']

def is_valid_domain(url):
    return DOMAIN in url

def canonicalize_url(url):
    url = url.split('#')[0]
    url = url.split('?')[0]
    url = url.rstrip('/')
    if '.' not in url.split('/')[-1]:
        url += '/'
    return url

def is_allowed_by_robots(url):
    response = requests.get(ROBOTS_TXT_URL, headers={'User-Agent': USER_AGENT})
    robots_txt = response.text
    parsed_robots_txt = RobotFileParser()
    parsed_robots_txt.parse(robots_txt.splitlines())
    return parsed_robots_txt.can_fetch(USER_AGENT, url)
    
def crawl_url(url):
    global visited_urls
    global urls_to_visit
    
    if url in visited_urls:
        return
    if not is_valid_domain(url):
        return
    if not is_allowed_by_robots(url):
        return

    visited_urls.add(url)

    response = requests.get(url, headers={'User-Agent': USER_AGENT})

    if 'text/html' not in response.headers['Content-Type']:
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    canonical_url = canonicalize_url(response.url)

    outgoing_links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None and is_valid_domain(href):
            outgoing_links.append(canonicalize_url(href))

    with open(OUTPUT_FILE, 'a') as f:
        f.write(canonical_url + ' ' + ' '.join(outgoing_links) + '\n')

    for link in outgoing_links:
        if link not in visited_urls:
            urls_to_visit.append(link)

while len(visited_urls) < MAX_PAGES and len(urls_to_visit) > 0:
    url = urls_to_visit.pop(0)

    crawl_url(url)

    time.sleep(DELAY)

InvalidSchema: No connection adapters were found for 'mailto:abbya@mit.edu'