# **Crawler**
This notebook contains the starter code structure for creating a crawler that runs on a single machine.

**Author:** Noshaba Nasir

**Date**: 26/3/2021

**Updated by**: Feza Roheel, 17L-4005

In [1]:
import os 
import random
import queue
from urllib.parse import urlparse
import datetime
from time import sleep
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from socket import *
from urllib.robotparser import RobotFileParser
import urllib.request
import urllib.robotparser as urobot
import uuid
import threading
from threading import Event
import socket
# Add any library to be imported here

In [2]:
# Crawler parameters (module-level configuration and shared state)
# Number of back queues the frontier creates
BACKQUEUES= 3
# Number of crawler threads started by the run cell
THREADS= BACKQUEUES*3
# Number of front (priority) queues; also the highest priority the prioritizer can return
FRONTQUEUES= 5
WAITTIME= 15 ; # seconds added to "now" to schedule the next fetch from the same back queue
# Maps each processed url to the name of the file its content was saved in, i.e. <url, docid>
url_to_doc = {}
# Maps each domain (netloc) to its parsed robots.txt, i.e. <domain, RobotFileParser>
domain_to_robot = {}
# URLs that have been crawled and recorded, in the order they were recorded
final_links = []

# Add any other global parameters here

# **FRONTIER**
Frontier should use the Mercator frontier design as discussed in lecture.

Preferably it should be a class and should have the given functions.

The *prioritizer* function is a stub right now: it returns a random number between 1 and f for a given URL.

In [3]:
class frontier:
    """
    Mercator-style URL frontier.

    URLs enter one of FRONTQUEUES front queues chosen by ``prioritizer``.
    They are then moved into one of BACKQUEUES back queues, where a back queue
    is bound to one domain at a time through ``backqueuedomain``.
    ``time_priorityqueue`` holds ``(earliest_fetch_time, backqueue_number)``
    pairs and decides which back queue is served next.

    The class reads the module-level names BACKQUEUES, FRONTQUEUES, WAITTIME,
    url_to_doc, limit_documents and prioritizer at call time.
    """

    # Seconds to pause between polls while every front queue is empty.
    _POLL_INTERVAL = 0.05

    def __init__(self, seeds=None):
        """
        Initializes the crawler frontier.

        seeds: optional iterable of start URLs. When omitted, the built-in
               seed list is used.
        """
        # Name of crawler
        self.name = "MyCrawler"

        # Domain currently served by each back queue: 'domain' -> queue number
        self.backqueuedomain = {}

        # Initializing the front & back queues (unbounded)
        self.backqueuelist = [queue.Queue(maxsize=0) for _ in range(BACKQUEUES)]
        self.frontqueuelist = [queue.Queue(maxsize=0) for _ in range(FRONTQUEUES)]

        # Starting seeds
        if seeds is not None:
            self.seeds = list(seeds)
        else:
            self.seeds = ["https://docs.oracle.com/en/",
                          "https://www.oracle.com/corporate/",
                          "https://en.wikipedia.org/wiki/Machine_learning",
                          "https://www.csie.ntu.edu.tw/~cjlin/libsvm/index.html",
                          "https://docs.oracle.com/middleware/jet210/jet/index.html",
                          "https://en.wikipedia.org/w/api.php",
                          "https://en.wikipedia.org/api/",
                          "https://en.wikipedia.org/wiki/Weka_(machine_learning)",]

        # Adding the seed URLs in front queues based on priority
        self.add_URLs(self.seeds)

        # Put at least one link in each back queue and schedule every back
        # queue as immediately fetchable.
        self.time_priorityqueue = queue.PriorityQueue()

        for i in range(BACKQUEUES):
            self.add_to_backqueue(i)
            self.time_priorityqueue.put((datetime.datetime.now(), i))

    def front_queue_selector(self):
        """
        Returns the number of a non-empty front queue.

        The queue is drawn at random with weight ``index + 1``, so later
        queues are preferred. Only non-empty queues take part in the draw,
        which gives the same distribution as redrawing until a non-empty
        queue is hit. While every front queue is empty the call blocks,
        polling at a short interval instead of spinning.
        """
        while True:
            candidates = [number for number in range(FRONTQUEUES)
                          if not self.frontqueuelist[number].empty()]
            if candidates:
                # More priority to later queues
                weights = [number + 1 for number in candidates]
                return random.choices(candidates, weights=weights)[0]
            sleep(self._POLL_INTERVAL)

    def get_URL(self):
        """
        Updates heap, updates backqueue and maintains politeness.

        Takes the back queue with the earliest scheduled time, removes its
        next URL, sleeps until the scheduled time if it lies in the future,
        refills the back queue when it became empty and re-schedules it
        WAITTIME seconds from now.

        returns url
        """
        # Get the earliest time & its back queue from the priority queue
        ready_time, backqueue_number = self.time_priorityqueue.get()
        # get url (blocks while that back queue is empty)
        url = self.backqueuelist[backqueue_number].get()

        # Politeness: wait until this back queue may be contacted again
        wait_seconds = (ready_time - datetime.datetime.now()).total_seconds()
        if wait_seconds > 0:
            sleep(wait_seconds)

        # update backqueue
        if self.backqueuelist[backqueue_number].empty():
            self.add_to_backqueue(backqueue_number)

        # update priority queue
        self.time_priorityqueue.put((self.get_updatedtime(), backqueue_number))

        return url

    def add_URLs(self, URLs):
        """
        Takes in URLs
        It will add URLs to front queues based on priority
        """
        for url in URLs:
            # prioritizer returns 1..F, queue indexes run from 0 to F - 1
            queue_number = prioritizer(url, FRONTQUEUES) - 1
            self.frontqueuelist[queue_number].put(url)

    def get_domain(self, URL):
        """
        Takes a URL
        It will return domain (network location part) of that url
        """
        return urlparse(URL).netloc

    def get_updatedtime(self):
        """
        It will return the updated time = current time + WAITTIME seconds
        """
        return datetime.datetime.now() + datetime.timedelta(0, WAITTIME)

    def add_to_backqueue(self, backqueue_number):
        """
        Takes a backqueue_number
        It will move links from the front queues to the back queues until the
        given back queue holds at least one link or the document limit is hit.

        A link whose domain is already bound to a back queue goes to that
        queue. Otherwise, if the given back queue is empty, it is re-bound to
        the link's domain and receives the link.
        """
        target = self.backqueuelist[backqueue_number]

        while True:
            # Get a non empty front queue number
            frontqueue_number = self.front_queue_selector()
            source = self.frontqueuelist[frontqueue_number]

            # Peek at the head of the front queue without removing it
            domain = self.get_domain(source.queue[0])

            if domain in self.backqueuedomain:
                # Domain already owns a back queue: append the link there
                existingqueue_no = self.backqueuedomain[domain]
                self.backqueuelist[existingqueue_no].put(source.get())

            elif target.empty():
                # The back queue is being handed to a new domain, so drop the
                # domain(s) it served before; otherwise their later links
                # would be mixed into this queue.
                stale_domains = [known for known, number in self.backqueuedomain.items()
                                 if number == backqueue_number]
                for known in stale_domains:
                    del self.backqueuedomain[known]
                self.backqueuedomain[domain] = backqueue_number
                target.put(source.get())

            if not target.empty() or len(url_to_doc) >= limit_documents:
                break


def prioritizer(URL, f):
    """
    Stub priority function.

    Ignores URL and draws a random integer priority in the inclusive
    range 1..f.
    """
    # randrange excludes its upper bound, hence f + 1
    priority = random.randrange(1, f + 1)
    return priority

# **FILTER URLS**
Filter out the URLs that are disallowed by the server's robots.txt file and the URLs that have already been processed.

In [4]:

def is_url(url):
  """
  Takes in url
  returns True when the parsed link has both a scheme and a network
  location, False otherwise (including when parsing raises ValueError)
  """
  try:
    parts = urlparse(url)
  except ValueError:
    return False
  has_scheme_and_host = bool(parts.scheme and parts.netloc)
  return has_scheme_and_host


def fetch(url):
  """
  Fetch the content of the url passed as parameter and store it in a new
  file with a unique name.

  Nothing is downloaded when the url is already a key of url_to_doc or when
  url_to_doc already holds limit_documents entries.

  returns the name of the stored file (unique hex name + ".xml"), or None
  when the url was skipped or the download failed
  """
  try:
    if url in url_to_doc or len(url_to_doc) >= limit_documents:
      return None
    unique_filename = uuid.uuid4().hex + ".xml"
    # Extract from webserver and store in xml file
    urllib.request.urlretrieve(url, unique_filename)
  except Exception:
    # Best effort: a failed download just means there is no document for
    # this url. Exception (not a bare except) keeps KeyboardInterrupt and
    # SystemExit working.
    return None
  return unique_filename


def parse(url, timeout=10):
  """
  Takes a url (and an optional request timeout in seconds)
  returns a list of the links found in the <a href> tags of the page, each
  converted into an absolute url. Relative links are resolved against url.
  Links that do not resolve to a url with scheme and host (for example
  mailto: links), empty links and '#' are left out.
  An empty list is returned when the response is not an html page.
  """
  # Without a timeout a stalled server would block the calling thread forever
  r = requests.get(url, timeout=timeout)

  # Only html pages: the check does not depend on the link, so do it once
  if "text/html" not in r.headers.get("content-type", ''):
    return []

  soup = BeautifulSoup(r.content, 'html.parser', from_encoding="iso-8859-1")
  links = []

  for anchor in soup.find_all('a', href=True):
    href = anchor['href']
    if not href or href == '#':
      continue
    # Resolve first, validate afterwards, so relative links are kept
    absolute_link = urljoin(url, href)
    if is_url(absolute_link):
      links.append(absolute_link)

  return links


def urlfilter(urls):
  """
  takes in urls
  returns the list of those urls that the robots.txt of their domain allows
  for user agent "*". The parsed robots.txt of every domain is kept in
  domain_to_robot and reused. A url is skipped when anything goes wrong
  while handling it.
  """
  # Process-wide default socket timeout, in seconds (float)
  socket.setdefaulttimeout(1)
  allowed = []
  for candidate in urls:
    try:
      parts = urlparse(candidate)
      host = parts.netloc
      if host not in domain_to_robot:
        # First url of this domain: download and remember its robots.txt
        robots = urobot.RobotFileParser(parts.scheme + "://" + host + "/robots.txt")
        robots.read()
        domain_to_robot[host] = robots
      robots = domain_to_robot[host]

      # Can fetch or not
      if robots.can_fetch("*", candidate):
        allowed.append(candidate)
    except Exception:
      continue
  return allowed


def duplicateUrlElimination(urls):
  """
  Takes in urls
  returns the urls that are not in final_links (not processed yet), each
  of them only once, in the order of their first appearance
  """
  # Sets give constant-time membership tests instead of scanning the list
  already_processed = set(final_links)
  seen = set()
  processed_links = []
  for url in urls:
    if url in already_processed or url in seen:
      continue
    seen.add(url)
    processed_links.append(url)

  return processed_links
  


# **RUN CRAWLER**

In [5]:
# Initialize everything the crawler threads share
# Thread objects started by the run cell
threads = []
# Id handed to the next thread that is created
threadID = 1
# Maximum number of entries url_to_doc may hold; read by fetch, check and the frontier
limit_documents = 100
# Re-entrant lock guarding the frontier's get_URL call and the updates done in check
my_Lock = threading.RLock()
# The shared frontier; constructing it seeds the front queues and fills the back queues
my_frontier = frontier()


In [6]:
# Thread task
# define the individual crawler thread's function here, as studied in class
class crawler_thread_task(threading.Thread):
    """
    Worker thread of the crawler.

    Stores the id, name, result container and frontier it is given and, when
    run, hands them to process_data, printing a line before and after.
    """

    def __init__(self, threadID, name, res, my_frontier):
        super().__init__()
        self.threadID = threadID
        # Thread's name setter stores the value as a string
        self.name = name
        self.res, self.my_frontier = res, my_frontier
        self._has_lock = False

    def run(self):
        label = self.name
        print("Starting Thread " + label)
        process_data(self.threadID, self.res, self.my_frontier)
        print("Exiting Thread" + label)
      

def process_data(threadId, res, my_frontier):
  """
  Worker loop of one crawler thread.

  Repeatedly gets a url from the frontier (politeness is handled inside
  get_URL, which is called under my_Lock), fetches its content, and only
  when the fetch produced a file extracts the links, filters them,
  eliminates duplicate urls and adds the rest to the frontier.

  After every url, including one whose processing raised, check() is called
  to record the result and to decide whether the thread has to stop.

  threadId and res are accepted for the thread class but not used here.
  """
  while True:
    process_url = None
    name_of_file = None

    try:
      # get URL (politeness also maintained in get_URL)
      with my_Lock:
        process_url = my_frontier.get_URL()

      # Fetch url; None means skipped or failed, so there is nothing to parse
      name_of_file = fetch(process_url)
      if name_of_file:
        # Parsing url
        new_links = parse(process_url)
        # filter links
        filtered_links = urlfilter(new_links)
        # remove duplicates
        result_links = duplicateUrlElimination(filtered_links)
        # add resultant links to frontier
        my_frontier.add_URLs(result_links)

    except Exception as e:
      # Best effort: one bad page must not stop the thread, but say why
      print("Skipping " + str(process_url) + ": " + repr(e))

    # Outside the try block so that a stored file is still recorded and the
    # document limit is still honoured after a failure above
    if check(name_of_file, process_url):
      break


def check(name_of_file, process_url):
  """
  Records a processed url or reports that the crawl is over.

  Under my_Lock: returns True when url_to_doc already holds limit_documents
  entries. Otherwise, if the url is not recorded yet and a file name was
  given, stores url -> file name in url_to_doc, appends the url to
  final_links, prints it, and returns False.
  """
  # only one thread can execute code there
  with my_Lock:
    if len(url_to_doc) >= limit_documents:
      return True

    if process_url not in url_to_doc and name_of_file is not None:
      url_to_doc[process_url] = name_of_file
      final_links.append(process_url)
      print("Processed: " + process_url)

    return False



In [7]:
# Start the crawler threads, wait for them, then save the url -> file mapping
try:
  for i in range(THREADS):
    # The thread id doubles as the thread's name
    thread = crawler_thread_task(threadID, threadID , final_links, my_frontier= my_frontier)
    thread.start()
    threads.append(thread)
    threadID += 1

  # Wait for all threads to complete (joining a finished thread returns at once)
  for t in threads:
    t.join()

  print ("Exiting Main Thread")

  print(url_to_doc)

  # Stores the mapping of the files
  doc = "urlTodocid.txt"
  # Write the whole url to docid mapping once; no file is written when
  # nothing was crawled
  if url_to_doc:
    with open(doc, "w", encoding="utf-8") as f:
      for url, docid in url_to_doc.items():
        f.write(url + " , " + docid + "\n")

except Exception as e:
  # Keep the notebook running, but do not hide what went wrong
  print("Crawler run failed: " + repr(e))


Starting Thread 1
Starting Thread 2
Starting Thread 3Starting Thread 4

Starting Thread 5
Starting Thread 6
Starting Thread 7
Starting Thread 8Starting Thread 9

Processed: https://docs.oracle.com/en/
Processed: https://docs.oracle.com/middleware/jet210/jet/index.html
Processed: https://go.oracle.com/subscriptions
Processed: https://www.oracle.com/corporate/
Processed: https://www.youtube.com/oracle/
Processed: https://academy.oracle.com/en/oa-web-overview.html
Processed: https://developer.oracle.com/
Processed: https://en.wikipedia.org/wiki/Weka_(machine_learning)
Processed: https://twitter.com/oracle
Processed: https://www.youtube.com/about/copyright/
Processed: https://en.wikipedia.org/api/
Processed: https://developers.google.com/youtube
Processed: https://foundation.wikimedia.org/wiki/Developer_app_guidelines
Processed: https://www.oracle.com/corporate/accessibility/
Processed: https://policies.google.com/privacy?hl=ur
Processed: https://www.youtube.com/watch?v=kg1Z72T6Ass&list=PL