In [1]:
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# fetch HTML and CSS content
def fetch_content(base_url, relative_url):
    full_url = urljoin(base_url, relative_url)
    try:
        response = requests.get(full_url)
        response.raise_for_status()  # for request errors
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {full_url}: {e}")
        return None, None

    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    css_content = {}
    # external CSS
    for link in soup.find_all('link', rel='stylesheet', href=True):
        css_url = urljoin(full_url, link['href'])
        try:
            css_response = requests.get(css_url)
            css_response.raise_for_status()
            css_content[f"external_{css_url}"] = css_response.text
            print(f"Fetched external CSS from {css_url}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to retrieve external CSS file {css_url}: {e}")

    # internal CSS
    for i, style in enumerate(soup.find_all('style'), 1):
        internal_css = style.string
        if internal_css:
            css_content[f'internal_style_{i}'] = internal_css
            print(f"Found internal CSS within <style> tags {i}")

    # inline CSS
    for tag in soup.find_all(style=True):
        inline_css = tag['style']
        if inline_css:
            css_content[f'inline_style_{tag.name}_{tag.get("class", "")}'] = inline_css
            print(f"Found inline CSS in <{tag.name}> tag")

    return html_content, css_content

# base URL
base_url = 'https://www.haberturk.com'

# load URLs
with open('urls.json', 'r') as file:
    data = json.load(file)

unique_urls = data.get('unique_urls', [])
total_urls = len(unique_urls)

# dictionary to store the CSS files
css_files = {}

# process all URLs
for index, relative_url in enumerate(unique_urls):
    progress = f"({index + 1}/{total_urls})"
    print(f"{progress} Analyzing {urljoin(base_url, relative_url)}...")
    
    _, css_content_dict = fetch_content(base_url, relative_url)
    if css_content_dict:
        for css_key, css_content in css_content_dict.items():
            css_files[css_key] = css_content
        print(f"{progress} Successfully fetched CSS from {urljoin(base_url, relative_url)}")
    else:
        print(f"{progress} No CSS content found for {urljoin(base_url, relative_url)}")

# save CSS data to a JSON file
with open('css_files.json', 'w', encoding='utf-8') as file:
    json.dump(css_files, file, indent=4)

# generate HTML report
html_report = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>CSS File Log</title>
</head>
<body>
    <h1>CSS File Log</h1>
"""
for css_key in css_files:
    css_type = "Internal" if "internal" in css_key else "External" if "external" in css_key else "Inline"
    html_report += f"<h2>{css_type} CSS: {css_key}</h2>\n"
html_report += """
</body>
</html>
"""

# save HTML report to a file
with open('css_file_log.html', 'w', encoding='utf-8') as file:
    file.write(html_report)

print("Logging complete. HTML and JSON reports have been saved.")


(1/536) Analyzing https://www.haberturk.com/images/common/manifest/180x180.png...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(1/536) No CSS content found for https://www.haberturk.com/images/common/manifest/180x180.png
(2/536) Analyzing https://www.haberturk.com/images/common/favicon/32x32.png...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(2/536) No CSS content found for https://www.haberturk.com/images/common/favicon/32x32.png
(3/536) Analyzing https://www.haberturk.com/images/common/favicon/16x16.png...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(3/536) No CSS content found for https://www.haberturk.com/images/common/favicon/16x16.png
(4/536) Analyzing https://www.haberturk.com/images/common/favicon/favicon.ico?v=001...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(4/536) No CSS content found for https://www.haberturk.com/images/common/favicon/favicon.ico?v=001
(5/536) Analyzing https://www.haberturk.com/manifest.webmanifest...
(5/536) No CSS content found for https://www.haberturk.com/manifest.webmanifest
(6/536) Analyzing https://im.haberturk.com...
(6/536) No CSS content found for https://im.haberturk.com
(7/536) Analyzing https://adsp.haberturk.com...
(7/536) No CSS content found for https://adsp.haberturk.com
(8/536) Analyzing https://im.hthayat.com...
Failed to retrieve https://im.hthayat.com: HTTPSConnectionPool(host='im.hthayat.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)')))
(8/536) No CSS content found for https://im.hthayat.com
(9/536) Analyzing https://geoim.bloomberght.com...
Failed to retrieve https://geoim.bloomberght.com: 403 Client Error: Forbidden for url: https://geoim.bloomber

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(22/536) No CSS content found for https://im.haberturk.com/l/2024/06/19/ver1718799771/3696380/jpg/320x640
(23/536) Analyzing https://im.haberturk.com/l/2024/06/20/ver1718831744/3696460/jpg/320x640...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(23/536) No CSS content found for https://im.haberturk.com/l/2024/06/20/ver1718831744/3696460/jpg/320x640
(24/536) Analyzing https://im.haberturk.com/l/2024/06/20/ver1718863122/3696489_manset/jpg/640x640...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(24/536) No CSS content found for https://im.haberturk.com/l/2024/06/20/ver1718863122/3696489_manset/jpg/640x640
(25/536) Analyzing https://im.haberturk.com/l/2024/06/20/ver1718858199/3696483_manset/jpg/640x640...


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(25/536) No CSS content found for https://im.haberturk.com/l/2024/06/20/ver1718858199/3696483_manset/jpg/640x640
(26/536) Analyzing https://www.haberturk.com/adscbg/cbglout.js...
Found inline CSS in <a> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <d[r];u++){var> tag
(26/536) Successfully fetched CSS from https://www.haberturk.com/adscbg/cbglout.js
(27/536) Analyzing https://www.haberturk.com/ekonomi...
Fetched external CSS from https://www.haberturk.com/css/desktop/style.css?v=258
Found internal CSS within <style> tags 1
Found inline CSS in <div> tag
Found inline CSS in <div> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <span> tag
Found inline CSS in <div> tag
Found inline CSS in <iframe> ta