In [4]:
#Assignment 1

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def _write_lines(path, lines):
    """Write every item of ``lines`` to ``path``, one per line, in sorted order."""
    # UTF-8 is forced because hrefs may contain non-ASCII characters that the
    # platform default encoding cannot always represent.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in sorted(lines))


def analyze_website_link_structure(base_url, max_recursion_depth=1, request_timeout=10):
    """Crawl a website and classify it as a hub or an authority from its links.

    Starting at ``base_url``, pages on the same network location are fetched
    and their ``<a href>`` targets are sorted into internal links (same
    netloc as ``base_url``) and external links (any other netloc).

    Args:
        base_url: The starting URL for the analysis.
        max_recursion_depth: The maximum depth to explore links (default: 1).
            The starting page is depth 1.
        request_timeout: Seconds to wait for each HTTP request before giving
            up on that page (default: 10).

    Returns:
        None. Prints "HUB" when the number of distinct external domains is at
        least the number of distinct internal links, "AUTHORITY" otherwise.

    Side effects:
        Writes ``processed_urls.txt``, ``internal_links.txt``,
        ``external_links.txt`` and ``unique_external_domains.txt`` to the
        current working directory, once, after the crawl has finished.
        Pages that fail to download are reported on stdout and skipped.
    """
    # Sets are used so that every URL/domain is recorded at most once.
    processed_urls = set()  # URLs that have already been visited
    internal_links = set()  # URLs within the same domain as base_url
    external_links = set()  # URLs pointing to other domains
    unique_external_domains = set()  # Distinct external domains linked to

    # Loop-invariant: computed once instead of once per anchor tag.
    base_netloc = urlparse(base_url).netloc

    def explore_page(url, current_depth):
        if url in processed_urls or current_depth > max_recursion_depth:
            return

        processed_urls.add(url)

        try:
            # Without a timeout a single stalled server blocks the crawl forever.
            response = requests.get(url, timeout=request_timeout)
            html = response.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {url} ({e})")
            return

        soup = BeautifulSoup(html, "lxml")

        for anchor_tag in soup.find_all("a", href=True):
            try:
                parsed_url = urlparse(urljoin(url, anchor_tag["href"]))
            except ValueError:
                # A malformed href must not abort the whole crawl.
                continue

            # mailto:, javascript:, tel: ... are not web pages; counting them
            # as "external" would add an empty domain and skew the verdict.
            if parsed_url.scheme not in ("http", "https"):
                continue

            # "#fragment" addresses a position inside the same document, so it
            # is dropped to avoid recording and fetching one page many times.
            complete_url = parsed_url._replace(fragment="").geturl()

            if parsed_url.netloc == base_netloc:
                internal_links.add(complete_url)
                explore_page(complete_url, current_depth + 1)
            else:
                external_links.add(complete_url)
                unique_external_domains.add(parsed_url.netloc)

    explore_page(base_url, 1)

    # Persist the collected data once, after the crawl, rather than rewriting
    # all four files for every visited page.
    _write_lines("processed_urls.txt", processed_urls)
    _write_lines("internal_links.txt", internal_links)
    _write_lines("external_links.txt", external_links)
    _write_lines("unique_external_domains.txt", unique_external_domains)

    if len(unique_external_domains) >= len(internal_links):
        print("Website is likely a HUB, connecting to many external domains.")
    else:
        print("Website is likely an AUTHORITY, with a strong internal link structure.")

    

# Run the analysis on onlinekhabar.com with a maximum link depth of 2.
starting_url = "https://www.onlinekhabar.com/"
analyze_website_link_structure(base_url=starting_url, max_recursion_depth=2)


Website is likely an AUTHORITY, with a strong internal link structure.


In [None]:
# import re
# import requests
# from urllib.parse import urljoin, urlparse

# def navigate_news_web(start_url, depth=1):
#     """Embark on a journey through a news website, exploring its intricate web of links.

#     Args:
#         start_url: The initial portal to the news world.
#         depth: How deep we delve into the rabbit hole of hyperlinks (default: 1).

#     Returns:
#         None, but leaves behind breadcrumbs of insights about the website's role (hub or authority).
#     """

#     visited_paths = set()  # Places we've already stepped foot in.
#     internal_trails = set()  # Paths within the same domain's walls.
#     external_avenues = set()  # Roads leading to other digital lands.
#     unique_domain_maps = set()  # A collection of visited continents (external domains).

#     def explore_page(path, current_depth):
#         if path in visited_paths or current_depth > depth:
#             return

#         visited_paths.add(path)

#         try:
#             page_content = requests.get(path).text
#             for hyperlink in re.findall(r'href="([^"]+)"', page_content):
#                 full_path = urljoin(path, hyperlink)
#                 parsed_path = urlparse(full_path)

#                 if parsed_path.netloc == urlparse(start_url).netloc:
#                     internal_trails.add(full_path)
#                     explore_page(full_path, current_depth + 1)
#                 else:
#                     external_avenues.add(full_path)
#                     unique_domain_maps.add(parsed_path.netloc)

#         except requests.exceptions.RequestException as e:
#             print(f"Encountered a roadblock while fetching {path}: {e}")

#     explore_page(start_url, 1)

#     # Unveil the website's true nature based on its link tapestry.
#     if len(unique_domain_maps) >= len(internal_trails):
#         print("This website appears to be a bustling HUB, connecting you to diverse digital landscapes")
#     else:
#         print("Its focus on internal trails suggests it's an authoritative figure, a stronghold of knowledge within its domain")

#     # Further analysis and interpretation of the website's link structure can be performed here.

# # Set sail on your news adventure!
# start_url = "https://www.onlinekhabar.com/"  # Replace with your desired Nepali news website
# navigate_news_web(start_url, depth=2)  # Adjust the depth of your exploration as needed.
