#### 1. Implement a Python program for a multi-threaded web scraper that respects robots.txt rules

In [None]:
import threading
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import csv

class RobotsParser:
    """Fetch a site's robots.txt and answer whether a URL path may be crawled.

    Only rule groups addressed to every crawler (``User-agent: *``) are
    honoured, together with any rules that appear before the first
    ``User-agent`` line. Paths are matched as plain prefixes; the ``*`` and
    ``$`` wildcard extensions are not interpreted.

    Attributes:
        base_url: Any URL on the site whose robots.txt should be consulted.
        allowed_paths: Path prefixes listed in applicable ``Allow`` rules.
        disallowed_paths: Path prefixes listed in applicable ``Disallow`` rules.
    """

    def __init__(self, base_url):
        """Store *base_url* and immediately download and parse robots.txt."""
        self.base_url = base_url
        self.allowed_paths = set()
        self.disallowed_paths = set()
        self.parse_robots_txt()

    def parse_robots_txt(self):
        """Download robots.txt and load its rules.

        A network failure or a non-200 response leaves both rule sets empty,
        which means every path is treated as allowed.
        """
        # The leading slash pins the file to the site root even when base_url
        # itself contains a path such as https://host/blog/.
        robots_url = urljoin(self.base_url, "/robots.txt")
        try:
            response = requests.get(robots_url, timeout=5)
        except requests.exceptions.RequestException:
            print(f"Failed to fetch robots.txt from {robots_url}. Assuming full access.")
            return
        if response.status_code == 200:
            self._parse_rules(response.text)

    def _parse_rules(self, text):
        """Collect Allow/Disallow prefixes from robots.txt *text*."""
        group_agents = []  # user agents named by the group being read
        group_has_rules = False  # True once the current group listed a rule
        seen_agent = False  # True once any User-agent line was read
        for raw_line in text.splitlines():
            line = raw_line.split("#", 1)[0].strip()  # drop trailing comments
            if ":" not in line:
                continue
            field, _, value = line.partition(":")
            field = field.strip().lower()  # directive names are case-insensitive
            value = value.strip()
            if field == "user-agent":
                # A User-agent line that follows rules opens a new group.
                if group_has_rules:
                    group_agents = []
                    group_has_rules = False
                group_agents.append(value.lower())
                seen_agent = True
            elif field in ("allow", "disallow"):
                group_has_rules = True
                applies = "*" in group_agents or not seen_agent
                # An empty value (e.g. "Disallow:") restricts nothing; storing
                # "" would make every path match as a prefix.
                if not applies or not value:
                    continue
                if field == "allow":
                    self.allowed_paths.add(value)
                else:
                    self.disallowed_paths.add(value)

    def is_allowed(self, url_path):
        """Return True if *url_path* may be fetched.

        The longest matching prefix decides; when an Allow and a Disallow
        prefix of equal length both match, Allow wins.
        """
        path = url_path or "/"  # a URL without a path refers to the root
        longest_disallow = max(
            (len(p) for p in self.disallowed_paths if p and path.startswith(p)),
            default=-1,
        )
        if longest_disallow < 0:
            return True
        longest_allow = max(
            (len(p) for p in self.allowed_paths if p and path.startswith(p)),
            default=-1,
        )
        return longest_allow >= longest_disallow


class MultiThreadedScraper:
    """Crawl same-domain pages starting from a base URL with worker threads.

    Every URL claimed by a worker is recorded in ``found_urls`` and, once the
    crawl finishes, written to a CSV file and a text file. Links are only
    queued when ``robots_parser`` allows their path.
    """

    def __init__(self, base_url, max_threads=5):
        """Set up crawl state; constructing the robots parser fetches robots.txt."""
        self.base_url = base_url
        self.visited_urls = set()  # URLs already claimed by a worker
        self.queue = []  # URLs still waiting to be scraped (FIFO)
        self.lock = threading.Lock()  # Guards queue, visited_urls, found_urls
        # Condition sharing self.lock: idle workers wait on it instead of
        # exiting while another worker may still discover new links.
        self._work_available = threading.Condition(self.lock)
        self._active_workers = 0  # Workers currently inside scrape_url
        self.robots_parser = RobotsParser(base_url)  # Respect robots.txt
        self.max_threads = max_threads
        self.found_urls = []  # Claimed URLs, in the order they were claimed

    def fetch_url(self, url):
        """Return the body of *url* as text, or None on error or non-200 status."""
        try:
            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
        return None

    def _normalize_link(self, page_url, href):
        """Resolve *href* against *page_url*; return None unless it is a same-domain http(s) URL."""
        try:
            absolute = urlparse(urljoin(page_url, href))
        except ValueError:
            # Malformed hrefs (e.g. a broken IPv6 literal) must not kill the worker.
            return None
        if absolute.scheme not in ("http", "https"):
            return None
        if absolute.netloc != urlparse(self.base_url).netloc:
            return None
        # Fragments address a position inside the same document, so drop them
        # to avoid fetching one page several times.
        return absolute._replace(fragment="").geturl()

    def parse_links(self, html, page_url=None):
        """Return the set of same-domain links found in *html*.

        Relative links are resolved against *page_url* (the page the HTML came
        from); when it is omitted, ``base_url`` is used instead.
        """
        soup = BeautifulSoup(html, "html.parser")
        origin = page_url or self.base_url
        links = set()
        for a_tag in soup.find_all("a", href=True):
            link = self._normalize_link(origin, a_tag["href"])
            if link is not None:
                links.add(link)
        return links

    def scrape_url(self, url):
        """Fetch *url* once and queue every allowed, unvisited link it contains."""
        with self.lock:
            if url in self.visited_urls:
                return
            self.visited_urls.add(url)
            self.found_urls.append(url)  # Recorded under the lock to keep order consistent

        print(f"Scraping: {url}")
        html = self.fetch_url(url)
        if not html:
            return

        links = self.parse_links(html, page_url=url)
        with self._work_available:
            for link in links:
                if link not in self.visited_urls and self.robots_parser.is_allowed(urlparse(link).path):
                    self.queue.append(link)
            # Wake idle workers so they can pick up the new URLs right away.
            self._work_available.notify_all()

    def worker(self):
        """Process queued URLs until the queue is empty and no worker is busy."""
        while True:
            with self._work_available:
                # An empty queue only means "done" when nobody is still
                # scraping; a busy worker may be about to add more links.
                while not self.queue and self._active_workers > 0:
                    self._work_available.wait()
                if not self.queue:
                    return
                url = self.queue.pop(0)
                self._active_workers += 1

            try:
                self.scrape_url(url)
            finally:
                with self._work_available:
                    self._active_workers -= 1
                    self._work_available.notify_all()

    def save_to_csv(self, filename="scraped_urls.csv"):
        """Write the found URLs to *filename* as a one-column CSV with a header."""
        with open(filename, "w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(["Scraped URLs"])  # CSV Header
            writer.writerows([url] for url in self.found_urls)

    def save_to_text_file(self, filename="scraped_urls.txt"):
        """Write the found URLs to *filename*, one per line."""
        with open(filename, "w", encoding="utf-8") as file:
            file.writelines(url + "\n" for url in self.found_urls)

    def run(self, start_path="/"):
        """Crawl from *start_path*, wait for all workers, then save the results."""
        start_url = urljoin(self.base_url, start_path)
        # The seed URL is subject to robots.txt just like discovered links.
        if self.robots_parser.is_allowed(urlparse(start_url).path):
            with self.lock:
                self.queue.append(start_url)
        else:
            print(f"robots.txt disallows {start_url}; nothing to scrape.")

        threads = [threading.Thread(target=self.worker) for _ in range(self.max_threads)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # After scraping, save the results to both CSV and text files
        self.save_to_csv()
        self.save_to_text_file()


if __name__ == "__main__":
    # Demo crawl: point base_url at the site to scrape, then start from its homepage.
    base_url = "https://quotes.toscrape.com/"
    scraper = MultiThreadedScraper(base_url=base_url, max_threads=20)
    scraper.run("/")


#### 5. Implement a thread-safe priority queue

In [8]:
import threading
from queue import PriorityQueue
import time
import random

class ThreadSafePriorityQueue:
    """Thin wrapper around ``queue.PriorityQueue`` with a non-blocking dequeue.

    Entries are stored as ``(priority, item)`` tuples, so the entry with the
    smallest priority value is returned first. Entries with equal priority
    are ordered by comparing the items themselves, which therefore have to be
    mutually comparable.
    """

    def __init__(self):
        # PriorityQueue performs its own locking for put/get.
        self.pq = PriorityQueue()

    def enqueue(self, priority, item):
        """Add *item* to the queue with the given *priority*."""
        self.pq.put((priority, item))

    def dequeue(self):
        """Remove and return the ``(priority, item)`` entry with the lowest priority value.

        Returns None when the queue is empty. The attempt is a single
        non-blocking call: checking emptiness first and then calling a
        blocking ``get()`` would hang forever if another consumer took the
        last entry between the two steps.
        """
        from queue import Empty  # local import: the file only imports PriorityQueue

        try:
            return self.pq.get_nowait()
        except Empty:
            return None

    def is_empty(self):
        """Return True if the queue looks empty right now.

        With concurrent producers/consumers the answer may be stale by the
        time the caller acts on it.
        """
        return self.pq.empty()

# Worker function for producer threads
def producer(queue, producer_id, num_items=5, max_delay=1.0):
    """Enqueue *num_items* labelled items with random priorities.

    Args:
        queue: Object exposing ``enqueue(priority, item)``.
        producer_id: Identifier embedded in each item label and log line.
        num_items: How many items to produce (default 5).
        max_delay: Upper bound, in seconds, of the random pause after each
            item (default 1.0); pass 0 to produce without pausing.
    """
    for i in range(num_items):
        priority = random.randint(1, 10)  # Generate a random priority
        item = f"Item-{i} from Producer-{producer_id}"
        print(f"Producer-{producer_id} enqueueing: {item} with priority {priority}")
        queue.enqueue(priority, item)
        time.sleep(random.random() * max_delay)  # Simulate variable processing time

# Worker function for consumer threads
def consumer(queue, consumer_id, stop_event=None, max_delay=1.0):
    """Repeatedly dequeue and report items until asked to stop.

    Args:
        queue: Object exposing ``dequeue()`` that returns ``(priority, item)``
            or None when nothing is available.
        consumer_id: Identifier used in the log lines.
        stop_event: Optional ``threading.Event``; the loop ends once it is
            set. Without it the consumer runs forever.
        max_delay: Upper bound, in seconds, of the random pause between
            attempts (default 1.0).
    """
    while stop_event is None or not stop_event.is_set():
        # Ask for an item directly instead of checking emptiness first:
        # another consumer may take the last item between check and dequeue.
        entry = queue.dequeue()
        if entry is None:
            print(f"Consumer-{consumer_id} waiting for items...")
        else:
            priority, item = entry
            print(f"Consumer-{consumer_id} dequeued: {item} with priority {priority}")

        delay = random.random() * max_delay  # Simulate variable processing time
        if stop_event is None:
            time.sleep(delay)
        else:
            stop_event.wait(delay)  # Returns early when a stop is requested


# Main function
def main():
    """Run two producers and two consumers, then shut the consumers down."""
    # Create a thread-safe priority queue
    priority_queue = ThreadSafePriorityQueue()
    stop_event = threading.Event()  # Signals the consumers to finish

    producers = [
        threading.Thread(target=producer, args=(priority_queue, i))
        for i in range(2)
    ]
    consumers = [
        threading.Thread(
            target=consumer,
            args=(priority_queue, i),
            kwargs={"stop_event": stop_event},
        )
        for i in range(2)
    ]

    # Start all threads
    for p in producers:
        p.start()
    for c in consumers:
        c.start()

    # Wait for producers to finish
    for p in producers:
        p.join()

    # Let consumers run for a while and then terminate
    time.sleep(5)
    print("Stopping consumers...")
    stop_event.set()
    # Without this the non-daemon consumer threads would keep the process alive.
    for c in consumers:
        c.join()


if __name__ == "__main__":
    main()


Producer-0 enqueueing: Item-0 from Producer-0 with priority 3
Producer-1 enqueueing: Item-0 from Producer-1 with priority 5
Consumer-0 dequeued: Item-0 from Producer-0 with priority 3
Consumer-1 dequeued: Item-0 from Producer-1 with priority 5
Consumer-1 waiting for items...
Consumer-1 waiting for items...
Producer-1 enqueueing: Item-1 from Producer-1 with priority 10
Producer-0 enqueueing: Item-1 from Producer-0 with priority 9
Consumer-0 dequeued: Item-1 from Producer-0 with priority 9
Producer-1 enqueueing: Item-2 from Producer-1 with priority 8
Consumer-1 dequeued: Item-2 from Producer-1 with priority 8
Producer-1 enqueueing: Item-3 from Producer-1 with priority 8
Producer-0 enqueueing: Item-2 from Producer-0 with priority 4
Consumer-0 dequeued: Item-2 from Producer-0 with priority 4
Producer-0 enqueueing: Item-3 from Producer-0 with priority 9
Producer-1 enqueueing: Item-4 from Producer-1 with priority 8
Consumer-1 dequeued: Item-3 from Producer-1 with priority 8
Consumer-0 dequeu