In [1]:
import json
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# base URL
base_url = 'https://www.haberturk.com'

# get scripts from a URL
def get_scripts_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    scripts = [script.get('src') or script.string for script in soup.find_all('script')]
    return scripts

# load URLs
with open('urls.json', 'r') as file:
    data = json.load(file)

unique_urls = data.get('unique_urls', [])

# dictionary to hold scripts from each URL
scripts_data = {}

# loop each URL to get scripts
for url in unique_urls:
    full_url = urljoin(base_url, url)
    scripts = get_scripts_from_url(full_url)
    scripts_data[full_url] = scripts
    print(f"Successfully analyzed {full_url}")

# save scripts data to scripts.json
with open('scripts.json', 'w', encoding='utf-8') as file:
    json.dump(scripts_data, file, indent=4, ensure_ascii=False)

# create an HTML file to display scripts
html_content = '<html><body>'
for url, scripts in scripts_data.items():
    html_content += f'<h2>Scripts from {url}</h2><ul>'
    for script in scripts:
        html_content += f'<li>{script}</li>'
    html_content += '</ul>'
html_content += '</body></html>'

# save HTML content to scripts.html
with open('scripts.html', 'w', encoding='utf-8') as file:
    file.write(html_content)

print("Scripts have been saved to scripts.json and scripts.html")


Successfully analyzed https://www.haberturk.com/images/common/manifest/180x180.png
Successfully analyzed https://www.haberturk.com/images/common/favicon/32x32.png
Successfully analyzed https://www.haberturk.com/images/common/favicon/16x16.png
Successfully analyzed https://www.haberturk.com/images/common/favicon/favicon.ico?v=001
Successfully analyzed https://www.haberturk.com/manifest.webmanifest
Successfully analyzed https://im.haberturk.com
Successfully analyzed https://adsp.haberturk.com
Failed to retrieve https://im.hthayat.com: HTTPSConnectionPool(host='im.hthayat.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)')))
Successfully analyzed https://im.hthayat.com
Failed to retrieve https://geoim.bloomberght.com: 403 Client Error: Forbidden for url: https://geoim.bloomberght.com/
Successfully analyzed https://geoim.bloomberght.com
Failed t