In [2]:
import json
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
import time

json_file_path = 'urls.json'
base_url = 'https://www.haberturk.com'

# Load the URLs from the JSON file.
# The encoding is passed explicitly: without it open() decodes the file with
# the platform's locale encoding, which is not guaranteed to be UTF-8.
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# The loaded document must be an object with a 'unique_urls' key; a missing
# key raises KeyError here rather than failing later in the crawl.
unique_urls = data['unique_urls']

# set selenium webdriver
def setup_driver():
    """Create and return a Chrome WebDriver configured for this scraper.

    Chrome is started with the ``--headless``, ``--disable-gpu`` and
    ``--no-sandbox`` arguments, using the driver path returned by
    ``ChromeDriverManager().install()``. The page-load timeout is set to
    10 seconds before the driver is returned.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--disable-gpu', '--no-sandbox'):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)
    browser.set_page_load_timeout(10)  # Set page load timeout
    return browser

# validate URL
def is_valid_url(url):
    """Return True when *url* has both a scheme and a network location.

    Relative paths (``/foo``), scheme-relative references (``//host``) and
    strings that ``urlparse`` rejects outright are all reported as invalid.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for a malformed authority, e.g. an
        # unbalanced '[' in the host part; a validator should answer False
        # instead of propagating the error.
        return False
    return bool(parsed.scheme) and bool(parsed.netloc)

# get image data from a single URL
def get_image_data(driver, url, exclude_url_contains, wait_timeout=10):
    """Collect the ``src`` and ``alt`` of every ``<img>`` found on one page.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Absolute URL of the page to load.
        exclude_url_contains: Substring; images whose ``src`` contains it are
            skipped. An empty or ``None`` value disables the filter.
        wait_timeout: Seconds to wait for at least one ``<img>`` element to
            be present after the page is requested.

    Returns:
        A list of ``{"url": ..., "alt": ...}`` dicts, where ``url`` is the
        image ``src`` resolved against *url*. The list is empty when the URL
        is invalid, when no image appears within *wait_timeout*, or when the
        page fails to load.
    """
    if not is_valid_url(url):
        print(f"Invalid URL: {url}")
        return []

    images_data = []

    try:
        driver.get(url)

        # A single explicit wait replaces the former hand-rolled loop of
        # 1-second waits; WebDriverWait already polls until the deadline.
        try:
            WebDriverWait(driver, wait_timeout).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, 'img'))
            )
        except TimeoutException:
            print(f"Skipping page due to long load time: {url}")
            return images_data

        for img in driver.find_elements(By.TAG_NAME, 'img'):
            try:
                src = img.get_attribute('src')
                alt = img.get_attribute('alt')
            except StaleElementReferenceException:
                # The element was detached between lookup and read; skip it
                # and keep collecting the remaining images on the page.
                continue

            if not src:
                continue
            # Guard against an empty filter: '' is a substring of every
            # string and would otherwise exclude all images.
            if exclude_url_contains and exclude_url_contains in src:
                continue

            images_data.append({"url": urljoin(url, src), "alt": alt})

    except WebDriverException as e:
        # Covers page-load timeouts raised by driver.get() and any other
        # driver failure; whatever was collected so far is still returned.
        print(f"Error for {url}: {e}")

    return images_data

# start WebDriver
driver = setup_driver()

all_images_data = {}

# loop through every URL to get images; the try/finally guarantees the
# browser process is shut down even if the crawl is interrupted or fails
try:
    for index, url in enumerate(unique_urls):
        print(f"Analyzing URL {index + 1}/{len(unique_urls)}: {url}")
        # Relative and scheme-relative entries are resolved against base_url
        full_url = urljoin(base_url, url)
        images_data = get_image_data(driver, full_url, 'placeholder-image.gif')
        # Only pages that yielded at least one image are recorded
        if images_data:
            all_images_data[full_url] = images_data
finally:
    # clean and close webdriver
    driver.quit()

# save image data to JSON
def save_image_data_to_json(image_data, filename='image_data.json'):
    """Write *image_data* to *filename* as indented, UTF-8 encoded JSON.

    Args:
        image_data: JSON-serializable object to write.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        # ensure_ascii=False keeps non-ASCII text (e.g. Turkish alt text)
        # readable in the file instead of emitting \uXXXX escapes; the file
        # is already opened as UTF-8, and the result parses identically.
        json.dump(image_data, file, indent=4, ensure_ascii=False)

# save image data to HTML
def save_image_data_to_html(image_data, filename='image_data.html'):
    """Write *image_data* to *filename* as a simple HTML report.

    One heading and one two-column table (image URL, alt text) is emitted
    per page.

    Args:
        image_data: Mapping of page URL to a list of dicts with ``"url"``
            and ``"alt"`` keys.
        filename: Destination path; an existing file is overwritten.
    """
    from html import escape

    # The charset declaration matches the encoding the file is written in,
    # so non-ASCII text renders correctly in a browser.
    parts = [
        '<html><head><meta charset="utf-8"><title>Image Data</title></head>'
        '<body><h1>Collected Image Data</h1>'
    ]
    for page_url, images in image_data.items():
        # All values originate from scraped pages and are escaped so they
        # cannot inject markup into the report.
        parts.append(
            f'<h2>URL: {escape(str(page_url))}</h2>'
            '<table border="1"><tr><th>Image URL</th><th>Alt Text</th></tr>'
        )
        for img in images:
            img_url = escape(str(img["url"]))
            # A missing alt attribute is shown as an empty cell rather than
            # the literal text "None".
            alt_text = '' if img["alt"] is None else escape(str(img["alt"]))
            parts.append(f'<tr><td>{img_url}</td><td>{alt_text}</td></tr>')
        parts.append('</table>')
    parts.append('</body></html>')

    with open(filename, 'w', encoding='utf-8') as file:
        file.write(''.join(parts))

save_image_data_to_json(all_images_data)
save_image_data_to_html(all_images_data)

# print the number of pages that yielded images, then the total and unique
# image URL counts across all of those pages
print(f"Total Image Data Entries: {len(all_images_data)}")
collected_image_urls = [
    img["url"] for images in all_images_data.values() for img in images
]
print(f"Total Image URLs: {len(collected_image_urls)}")
print(f"Unique Image URLs: {len(set(collected_image_urls))}")
print("Sample Image Data:", list(all_images_data.items())[:1])


Analyzing URL 1/536: /images/common/manifest/180x180.png
Analyzing URL 2/536: https://www.haberturk.com/images/common/favicon/32x32.png
Analyzing URL 3/536: https://www.haberturk.com/images/common/favicon/16x16.png
Analyzing URL 4/536: https://www.haberturk.com/images/common/favicon/favicon.ico?v=001
Analyzing URL 5/536: /manifest.webmanifest
Analyzing URL 6/536: //im.haberturk.com
Skipping page due to long load time: https://im.haberturk.com
Analyzing URL 7/536: //adsp.haberturk.com
Analyzing URL 8/536: //im.hthayat.com
Skipping page due to long load time: https://im.hthayat.com
Analyzing URL 9/536: //geoim.bloomberght.com
Skipping page due to long load time: https://geoim.bloomberght.com
Analyzing URL 10/536: //im.showtv.com.tr
Skipping page due to long load time: https://im.showtv.com.tr
Analyzing URL 11/536: //mo.ciner.com.tr
Skipping page due to long load time: https://mo.ciner.com.tr
Analyzing URL 12/536: https://static.criteo.net
Analyzing URL 13/536: https://pagead2.googlesyndi