In [3]:
import html
import json
import math
from urllib.parse import urljoin

import requests

# base URL that entries from urls.json are joined onto before fetching
base_url = "https://www.haberturk.com"

# load URLs
# JSON text is read as UTF-8 explicitly so the result does not depend on the
# platform's locale encoding.
with open('urls.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# the file is expected to hold an object with a 'unique_urls' entry
urls = data['unique_urls']

# list to hold the results (one dict per URL, filled in by the fetch loop)
results = []

# function to convert bytes to kb mb etc
def convert_size(size_bytes):
    """Return a human-readable string for a byte count.

    Args:
        size_bytes: Non-negative number of bytes, or None when the size is
            unknown.

    Returns:
        "N/A" for None, "0 B" for zero, otherwise the value scaled by powers
        of 1024, rounded to two decimals and followed by its unit
        (e.g. "3.45 KB").

    Raises:
        ValueError: If size_bytes is negative.
    """
    if size_bytes is None:
        return "N/A"
    if size_bytes == 0:
        return "0 B"
    if size_bytes < 0:
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes!r}")

    size_name = ("B", "KB", "MB", "GB", "TB")
    value = float(size_bytes)
    unit_index = 0
    # Divide step by step instead of using a float logarithm: division by
    # 1024 is exact, and stopping at the last unit keeps very large values in
    # "TB" rather than indexing past the end of size_name.
    while value >= 1024 and unit_index < len(size_name) - 1:
        value /= 1024
        unit_index += 1
    return f"{round(value, 2)} {size_name[unit_index]}"

# loop over the URLs and get their sizes
#
# Each entry is joined onto base_url, downloaded, and the length of the
# response body in bytes is appended to `results`. Request failures are
# recorded with size None and the error text instead of aborting the run.
# HTTP status codes are not checked: whatever body comes back is measured.

# seconds to wait for a server before giving up; without a timeout a single
# unresponsive host can block the whole run indefinitely
REQUEST_TIMEOUT = 30

total_urls = len(urls)
for i, url in enumerate(urls, start=1):
    full_url = urljoin(base_url, url)
    try:
        # the context manager releases the streamed connection even if
        # reading the body fails part-way
        with requests.get(full_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            # get size of the content in bytes
            page_size = len(response.content)
    except requests.RequestException as e:
        # for request errors (connection, SSL, timeout, ...)
        print(f"{i}/{total_urls} - {full_url}: Failed to retrieve ({e})")
        results.append({'url': full_url, 'size': None, 'error': str(e)})
    else:
        # progress print
        print(f"{i}/{total_urls} - {full_url}: {page_size} bytes")

        # append result to the list
        results.append({'url': full_url, 'size': page_size})

# order the results in place from smallest to largest body size;
# entries without a size (failed requests) are ranked as if they were 0 bytes
results.sort(key=lambda entry: 0 if entry['size'] is None else entry['size'])

# serialize the ordered results and write them to the JSON output file
serialized_results = json.dumps(results, indent=4)
with open('url_sizes.json', 'w') as json_file:
    json_file.write(serialized_results)

# save results to HTML file
#
# Builds a two-column table (URL, human-readable size) from `results`.
# URLs come from an external file, so they are HTML-escaped before being
# placed in the markup; otherwise characters such as '&' or '<' would corrupt
# the table or inject markup.
table_rows = [
    f"<tr><td>{html.escape(str(result['url']))}</td>"
    f"<td>{convert_size(result['size'])}</td></tr>"
    for result in results
]

# assemble the document in one join instead of repeated string concatenation;
# the charset declaration matches the encoding used to write the file below
html_content = "".join([
    "<html><head><meta charset='utf-8'><title>URL Sizes</title></head><body><table border='1'>",
    "<tr><th>URL</th><th>Size</th></tr>",
    *table_rows,
    "</table></body></html>",
])

# write as UTF-8 explicitly so non-ASCII URLs do not depend on the locale
with open('url_sizes.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html_content)

print("Data saved to url_sizes.json and url_sizes.html")


1/536 - https://www.haberturk.com/images/common/manifest/180x180.png: 3530 bytes
2/536 - https://www.haberturk.com/images/common/favicon/32x32.png: 913 bytes
3/536 - https://www.haberturk.com/images/common/favicon/16x16.png: 529 bytes
4/536 - https://www.haberturk.com/images/common/favicon/favicon.ico?v=001: 3530 bytes
5/536 - https://www.haberturk.com/manifest.webmanifest: 744 bytes
6/536 - https://im.haberturk.com: 18 bytes
7/536 - https://adsp.haberturk.com: 145 bytes
8/536 - https://im.hthayat.com: Failed to retrieve (HTTPSConnectionPool(host='im.hthayat.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)'))))
9/536 - https://geoim.bloomberght.com: 162 bytes
10/536 - https://im.showtv.com.tr: 162 bytes
11/536 - https://mo.ciner.com.tr: 162 bytes
12/536 - https://static.criteo.net: 43 bytes
13/536 - https://pagead2.googlesyndication.com: 15