In [15]:
import json
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
import time

json_file_path = 'urls.json'
base_url = 'https://www.haberturk.com'

# Load the unique URLs from the JSON file
with open(json_file_path, 'r') as file:
    data = json.load(file)

unique_urls = data['unique_urls']

# set up selenium webdriver
def setup_driver():
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.set_page_load_timeout(10)  # Set page load timeout
    return driver

# validate URL
def is_valid_url(url):
    parsed = urlparse(url)
    return bool(parsed.scheme) and bool(parsed.netloc)

# function to get image data from a single URL
def get_image_data(driver, url, exclude_url_contains, start_index):
    if not is_valid_url(url):
        print(f"Invalid URL: {url}")
        return [], start_index

    images_data = []

    try:
        driver.get(url)
        start_time = time.time()
        while True:
            elapsed_time = time.time() - start_time
            if elapsed_time > 10:
                print(f"Skipping page due to long load time: {url}")
                return images_data, start_index
            try:
                WebDriverWait(driver, 1).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'img')))
                break
            except TimeoutException:
                pass

        images = driver.find_elements(By.TAG_NAME, 'img')
        for img in images:
            src = img.get_attribute('src')
            alt = img.get_attribute('alt')
            if src and exclude_url_contains not in src:
                full_url = urljoin(url, src)
                images_data.append({"index": start_index, "url": full_url, "alt": alt})
                start_index += 1

    except (StaleElementReferenceException, WebDriverException) as e:
        print(f"Error for {url}: {e}")

    return images_data, start_index

# start webdriver
driver = setup_driver()

# collect all images data
all_images_data = []
current_index = 0

# loop through every URL to get images
for url in unique_urls:
    full_url = urljoin(base_url, url)
    images_data, current_index = get_image_data(driver, full_url, 'placeholder-image.gif', current_index)
    all_images_data.extend(images_data)

# clean and close webdriver
driver.quit()

# save image data to a JSON file
def save_image_data_to_json(image_data, filename='image_data.json'):
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(image_data, file, indent=4)

# save image data to an HTML file
def save_image_data_to_html(image_data, filename='image_data.html'):
    html_content = '<html><head><title>Image Data</title></head><body><h1>Collected Image Data</h1>'
    html_content += '<table border="1"><tr><th>Index</th><th>URL</th><th>Alt Text</th></tr>'
    for item in image_data:
        html_content += f'<tr><td>{item["index"]}</td><td>{item["url"]}</td><td>{item["alt"]}</td></tr>'
    html_content += '</table></body></html>'
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(html_content)

save_image_data_to_json(all_images_data)

save_image_data_to_html(all_images_data)

# print the count of unique and total image URLs
print(f"Total Image Data Entries: {len(all_images_data)}")
print("Sample Image Data:", all_images_data[:50])


Skipping page due to long load time: https://im.haberturk.com
Skipping page due to long load time: https://im.hthayat.com
Skipping page due to long load time: https://geoim.bloomberght.com
Skipping page due to long load time: https://im.showtv.com.tr
Skipping page due to long load time: https://mo.ciner.com.tr
Skipping page due to long load time: https://pagead2.googlesyndication.com
Skipping page due to long load time: https://www.googletagmanager.com
Skipping page due to long load time: https://www.googletagservices.com
Skipping page due to long load time: https://securepubads.g.doubleclick.net
Error for https://www.haberturk.com: Message: stale element reference: stale element not found in the current frame
  (Session info: chrome=126.0.6478.57); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x00A60F03+27667]
	(No symbol) [0x009F6C04]
	(No symbol