In [1]:
import requests
import json
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# base URL
base_url = "https://www.haberturk.com"

# load URLs
with open('urls.json', 'r') as file:
    data = json.load(file)
    urls = data['unique_urls']

# list to hold the results
results = []

# loop through the URLs and get their heading tags
total_urls = len(urls)
for i, url in enumerate(urls, start=1):
    full_url = urljoin(base_url, url)
    try:
        # get URL content
        response = requests.get(full_url)
        
        # parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # extract the heading tags
        headings = {}
        for level in range(1, 7):
            tag = f'h{level}'
            headings[tag] = [heading.get_text(strip=True) for heading in soup.find_all(tag)]

        # progress print
        print(f"{i}/{total_urls} - {full_url}: Heading tags fetched")
        
        # append the result to the list
        results.append({'url': full_url, 'headings': headings})
    except requests.RequestException as e:
        # for request errors
        print(f"{i}/{total_urls} - {full_url}: Failed to retrieve heading tags ({e})")
        results.append({'url': full_url, 'headings': None, 'error': str(e)})

# save the results to JSON file
with open('url_headings.json', 'w', encoding='utf-8') as json_file:
    json.dump(results, json_file, indent=4, ensure_ascii=False)

# save the results to HTML file
html_content = "<html><head><title>URL Heading Tags</title></head><body><table border='1'>"
html_content += "<tr><th>URL</th><th>Heading Tags</th></tr>"

for result in results:
    if result['headings'] is not None:
        headings_str = ""
        for level, texts in result['headings'].items():
            headings_str += f"<b>{level}:</b><br>" + "<br>".join(texts) + "<br>"
    else:
        headings_str = "N/A"
    html_content += f"<tr><td>{result['url']}</td><td>{headings_str}</td></tr>"

html_content += "</table></body></html>"

with open('url_headings.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html_content)

print("Data saved to url_headings.json and url_headings.html")


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1/536 - https://www.haberturk.com/images/common/manifest/180x180.png: Heading tags fetched


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2/536 - https://www.haberturk.com/images/common/favicon/32x32.png: Heading tags fetched


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3/536 - https://www.haberturk.com/images/common/favicon/16x16.png: Heading tags fetched


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4/536 - https://www.haberturk.com/images/common/favicon/favicon.ico?v=001: Heading tags fetched
5/536 - https://www.haberturk.com/manifest.webmanifest: Heading tags fetched
6/536 - https://im.haberturk.com: Heading tags fetched
7/536 - https://adsp.haberturk.com: Heading tags fetched
8/536 - https://im.hthayat.com: Failed to retrieve heading tags (HTTPSConnectionPool(host='im.hthayat.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)'))))
9/536 - https://geoim.bloomberght.com: Heading tags fetched
10/536 - https://im.showtv.com.tr: Heading tags fetched
11/536 - https://mo.ciner.com.tr: Heading tags fetched
12/536 - https://static.criteo.net: Heading tags fetched
13/536 - https://pagead2.googlesyndication.com: Heading tags fetched
14/536 - https://www.google.com: Heading tags fetched
15/536 - https://www.google-analytics.com: Heading tags fetc

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


22/536 - https://im.haberturk.com/l/2024/06/19/ver1718799771/3696380/jpg/320x640: Heading tags fetched


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


23/536 - https://im.haberturk.com/l/2024/06/20/ver1718831744/3696460/jpg/320x640: Heading tags fetched


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


24/536 - https://im.haberturk.com/l/2024/06/20/ver1718863122/3696489_manset/jpg/640x640: Heading tags fetched


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


25/536 - https://im.haberturk.com/l/2024/06/20/ver1718858199/3696483_manset/jpg/640x640: Heading tags fetched
26/536 - https://www.haberturk.com/adscbg/cbglout.js: Heading tags fetched
27/536 - https://www.haberturk.com/ekonomi: Heading tags fetched
28/536 - https://www.haberturk.com/ekonomi/altin: Heading tags fetched
29/536 - https://www.haberturk.com/ekonomi/piyasa/88-dolar: Heading tags fetched
30/536 - https://www.haberturk.com/ekonomi/piyasa/89-euro: Heading tags fetched
31/536 - https://www.haberturk.com/ekonomi/altin/610-gram-altin: Heading tags fetched
32/536 - https://www.haberturk.com/ekonomi/piyasa/1947-faiz: Heading tags fetched
33/536 - https://www.haberturk.com/ekonomi/piyasa/2211-bitcoin: Heading tags fetched
34/536 - https://www.haberturk.com/ekonomi/borsa: Heading tags fetched
35/536 - https://www.haberturk.com/canliyayin: Heading tags fetched
36/536 - https://www.haberturk.com/: Heading tags fetched
37/536 - https://www.haberturk.com/havadurumu/Turkiye-TR/Adana/LTAG: