In [8]:
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE
from bs4 import BeautifulSoup
import socket
from urllib.parse import urlparse

# Shared crawl state: read and mutated by the Fetcher callbacks and by loop().
stopped = False               # loop() keeps running until this becomes True
selector = DefaultSelector()  # multiplexer the Fetcher callbacks are registered on

# (host, url) pairs for which a Fetcher has already been created.
seen_urls = set()
# (host, url) pairs whose fetch has been started but has not finished yet;
# the crawl is over once this set is empty again.
urls_todo = set()

class Fetcher:
    """Callback-driven, non-blocking HTTP/1.0 fetcher for a single URL.

    The life cycle is fetch() -> connected() -> read_response(). Each stage
    registers the next one as a callback on the module-level ``selector``;
    the event loop calls it with ``(key, mask)`` when the socket is ready.

    A fetcher created with ``level=1`` also parses the downloaded page and
    starts one child fetcher (with the default level 0) for every link that
    is not yet in ``seen_urls``. Child fetchers do not follow links.

    Relies on the module-level names ``selector``, ``urls_todo``,
    ``seen_urls`` and ``stopped``.
    """

    def __init__(self, host, url, level=0):
        """Store the target; no I/O happens until fetch() is called.

        host  -- host name to connect to (always on port 80)
        url   -- request path sent in the GET line, e.g. '/353/'
        level -- 1 to follow the links found on the page, anything else not to
        """
        self.response = b''  # raw bytes received so far (headers + body)
        self.host = host
        self.url = url
        self.sock = None     # created in fetch()
        self.level = level

    def fetch(self):
        """Start a non-blocking connect to ``self.host`` on port 80.

        Registers ``connected`` to run once the socket becomes writable.
        Any OSError other than BlockingIOError raised by ``connect`` (for
        example a failed host lookup) is re-raised after the socket has been
        closed, so a failed start does not leak a file descriptor.
        """
        self.sock = socket.socket()
        self.sock.setblocking(False)
        try:
            self.sock.connect((self.host, 80))
        except BlockingIOError:
            # Expected on a non-blocking socket: the connect is in progress.
            pass
        except OSError:
            self.sock.close()
            self.sock = None
            raise
        # Register next callback.
        selector.register(self.sock.fileno(), EVENT_WRITE, self.connected)

    def connected(self, key, mask):
        """Send the GET request, then wait for the response.

        ``key`` and ``mask`` are supplied by the event loop. If the request
        cannot be encoded or sent (connect failed, path is not ASCII), the
        fetch is abandoned instead of raising into the event loop.
        """
        selector.unregister(key.fd)
        request = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'.format(self.url, self.host)
        try:
            self.sock.send(request.encode('ascii'))
        except (OSError, UnicodeError):
            # OSError: a failed non-blocking connect is reported on first use.
            # UnicodeError: the path or host contains non-ASCII characters.
            self._finish()
            return

        # Register the next callback.
        selector.register(key.fd, EVENT_READ, self.read_response)

    def read_response(self, key, mask):
        """Read one chunk; on end of stream finish this fetch.

        While data keeps arriving it is appended to ``self.response`` and the
        socket stays registered. An empty read means the server closed the
        connection: the socket is unregistered, links are followed when
        ``level == 1``, and the URL is marked as done.
        """
        try:
            chunk = self.sock.recv(4096)
        except (BlockingIOError, InterruptedError):
            # Nothing to read after all; stay registered and wait again.
            return
        except OSError:
            # Connection broke mid-transfer: give up on this URL.
            selector.unregister(key.fd)
            self._finish()
            return

        if chunk:
            self.response += chunk
            return

        selector.unregister(key.fd)  # Done reading.
        if self.level == 1:
            # Children are queued BEFORE this URL is marked done, so that
            # urls_todo cannot become empty while work is still being added.
            self._spawn_children()
        self._finish()

    def _spawn_children(self):
        """Start a Fetcher for every link on this page that is not yet seen."""
        # The page itself counts as seen, otherwise a self-link would be
        # fetched a second time.
        seen_urls.add((self.host, self.url))
        links = self.parse_links().difference(seen_urls)
        seen_urls.update(links)

        for host, url in links:
            child = Fetcher(host, url)
            try:
                child.fetch()
            except OSError:
                # E.g. the host cannot be resolved: skip this link rather
                # than aborting the whole crawl.
                continue
            urls_todo.add((host, url))

    def _finish(self):
        """Close the socket, mark this URL as done, stop the loop if idle."""
        global stopped

        if self.sock is not None:
            self.sock.close()
        # discard(), not remove(): the entry may already be gone when the
        # same URL was fetched more than once.
        urls_todo.discard((self.host, self.url))
        if not urls_todo:
            print("finished!", flush=True)
            stopped = True

    def parse_links(self):
        """Return the set of ``(host, path)`` tuples linked from the response.

        Only ``<a>`` tags that carry an ``href`` are considered, and only
        hrefs without a scheme or with an http/https scheme. A missing host
        defaults to ``self.host`` and a missing path to '/'. Query strings and
        fragments are dropped, and relative paths are used as written (they
        are not resolved against the current URL).
        """
        links = set()
        soup = BeautifulSoup(self.response, 'html.parser')

        # href=True skips anchors without the attribute; urlparse(None) would
        # otherwise yield bytes fields and a bogus (b'', b'') link.
        for anchor in soup.find_all('a', href=True):
            try:
                parsed = urlparse(anchor['href'])
            except ValueError:
                # Malformed href (e.g. an invalid bracketed host).
                continue
            if parsed.scheme not in ('', 'http', 'https'):
                # mailto:, javascript:, tel:, ... cannot be fetched over HTTP.
                continue
            links.add((parsed.netloc or self.host, parsed.path or '/'))
        return links
            
def loop():
    """Run the event loop until the module-level ``stopped`` flag is true.

    Each pass blocks in ``selector.select()`` and then invokes the callback
    stored as ``data`` on every ready key, passing it the key and the event
    mask. The flag is only checked between passes, so all events of the
    current batch are dispatched even if one callback sets ``stopped``.
    """
    while True:
        if stopped:
            return
        for ready_key, ready_mask in selector.select():
            handler = ready_key.data
            handler(ready_key, ready_mask)

In [None]:
from time import time
# Reset the shared crawl state so that re-running this cell starts a new
# crawl; otherwise `stopped` is still True from the previous run and loop()
# returns immediately.
stopped = False
urls_todo.clear()
seen_urls.clear()

fetcher = Fetcher('www.xkcd.com', '/353/', level=1)
seed = (fetcher.host, fetcher.url)
# Record the seed as seen, so a link from the page back to itself does not
# start a second fetch of the same URL, and as pending before any I/O starts.
seen_urls.add(seed)
urls_todo.add(seed)
fetcher.fetch()

t0 = time()
loop()  # returns once every pending URL has finished (urls_todo is empty)
print("runtime: {}".format(time()-t0))