In [5]:
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import ssl
import json

#workaround for ssl issues
ssl._create_default_https_context = ssl._create_unverified_context

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):

    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get('Content-Type').startswith("text/html"):
                return []

            # Decode the HTML
            html = response.read().decode('utf-8')
    except Exception as e:
        #print(url, e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))

def crawl(url, max_urls=None):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # empty list to store all URLs retrieved
    urls_list = []

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        if max_urls and len(urls_list) >= max_urls:
            break
        urls_list.append({"url": url})

        print(url) # for debugging and to see the progress

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

    return urls_list
url = "https://tuoitre.vn/"
all_url = crawl(url, max_urls=10)







https://tuoitre.vn/
https://tuoitre.vn/the-thao/binh-luan.htm
https://tuoitre.vn/boi-sai-de-tay-chu-s-hay-chu-l-2024090117541202.htm
https://tuoitre.vn/3-mon-phoi-hop.html
https://tuoitre.vn/boi.html
https://tuoitre.vn/le-tien-dat-do-thanh-hai-hut-huy-chuong-paralympic-2024-20240902011526044.htm
https://tuoitre.vn/print/le-tien-dat-do-thanh-hai-hut-huy-chuong-paralympic-2024-20240902011526044.htm
https://tuoitre.vn/the-thao-nguoi-khuyet-tat.html
https://tuoitre.vn/le-khai-mac-paralympic-paris-2024-co-gi-dac-biet-20240827100558904.htm
https://tuoitre.vn/tong-dao-dien-le-khai-mac-olympic-paris-2024-la-ai-20240726170625549.htm


In [6]:
url_tn = "https://thanhnien.vn/"
all_url_tn = crawl(url_tn, max_urls=10)
all_url.extend(all_url_tn)
print(all_url)

output_file_path = "urls.json"

try:
    with open(output_file_path, "r", encoding="utf-8") as f:
        existing_data = json.load(f)
except FileNotFoundError:
    # Nếu file không tồn tại, khởi tạo danh sách trống
    existing_data = []

combined_QA = existing_data + all_url
# Xuất kết quả ra file JSON
with open(output_file_path, "w", encoding="utf-8") as f:
    json.dump(combined_QA, f, ensure_ascii=False, indent=4)

https://thanhnien.vn/
https://thanhnien.vn/volvo-xc90-2025-tan-trang-ngoai-hinh-cai-tien-dong-co-hybrid-185240906104231572.htm
https://thanhnien.vn/volvo-xc90-recharge-ultimate-chay-dong-co-hybrid-cam-sac-gia-465-ti-tai-viet-nam-1851515889.htm
https://thanhnien.vn/volvo-trinh-lang-dan-xe-the-he-moi-dung-dong-co-hybrid-tai-viet-nam-1851410160.htm
https://thanhnien.vn/cong-nghe/kinh-nghiem
https://thanhnien.vn/174-dan-so-tai-viet-nam-so-huu-tai-san-ma-hoa-185240828193155653.htm
https://thanhnien.vn/rua-tien-tags495406.html
https://thanhnien.vn/vu-an-truong-my-lan-van-thinh-phat-tranh-cai-ve-thiet-hai-cua-scb-va-ai-chiu-trach-nhiem-boi-thuong-185240411001334223.htm
https://thanhnien.vn/van-thinh-phat-tags1171780.html
https://thanhnien.vn/ong-le-thanh-hai-bi-cach-chuc-nguyen-bi-thu-thanh-uy-tphcm-nhiem-ky-2010-2015-185937379.htm
[{'url': 'https://tuoitre.vn/'}, {'url': 'https://tuoitre.vn/the-thao/binh-luan.htm'}, {'url': 'https://tuoitre.vn/boi-sai-de-tay-chu-s-hay-chu-l-2024090117541202.