In [21]:
import os
import requests
import json
import socket
import ipaddress
import base64
import glob
import replicate
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import display, Markdown, update_display
from openai import OpenAI
from urllib.parse import urlparse
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

In [26]:

def is_safe_url(url):
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ["http", "https"] or parsed.netloc == "":
            return False
        host = parsed.hostname
        ip = ipaddress.ip_address(socket.gethostbyname(host))
        if ip.is_private or ip.is_loopback or ip.is_reserved or ip.is_link_local:
            return False
    except Exception:
        return False
    return True

class Image_Scrapper:
    def __init__(self, url):
        if not is_safe_url(url):
            raise ValueError("Invalid or unsafe URL")

        self.images = []

        # Setup Selenium
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-blink-features=AutomationControlled")

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        try:
            driver.set_page_load_timeout(20)
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            self.title = soup.title.string.strip() if soup.title else "No title found"

            img_tags = soup.find_all('img')
            # self.images = [img['src'] for img in img_tags if not img.get('alt')]
            self.images = [img['src'] for img in img_tags]
            # self.images = [urljoin(url, img['src']) for img in img_tags if img.get('src')]

        except Exception as e:
            print(f"Error loading page: {e}")
        finally:
            driver.quit()


In [27]:
website_Images = Image_Scrapper("https://edwarddonner.com")
website_Images.images

['https://i0.wp.com/edwarddonner.com/wp-content/uploads/2023/12/cropped-edworkprofile2.png?fit=1128%2C1128&ssl=1',
 'https://edwarddonner.com/wp-content/uploads/2025/05/exec-copy2.jpeg',
 'https://edwarddonner.com/wp-content/uploads/2025/05/exec.png',
 'https://edwarddonner.com/wp-content/uploads/2025/04/autonomy.png',
 'https://edwarddonner.com/wp-content/uploads/2025/01/handson.jpg',
 'https://pixel.wp.com/g.gif?v=ext&blog=227378255&post=57&tz=-4&srv=edwarddonner.com&hp=atomic&ac=2&amp=0&j=1%3A14.9-a.1&host=edwarddonner.com&ref=&fcp=320&rand=0.7381890454109881']

In [10]:
# website_Images = Image_Scrapper("https://gopalinfo.com")
# website_Images.images

In [11]:
load_dotenv(override=True)
API_KEY = os.getenv("REPLICATE_API_KEY")

if not API_KEY:
    print("No API_Key found, Please set the API_KEY.")
    exit(1)
elif API_KEY.strip() != API_KEY:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


client = replicate.Client(api_token=API_KEY)
model = client.models.get("salesforce/blip")
version = model.versions.get("2e1dddc8621f72155f24cf2e0adbde548458d3cab9f00c0139eea840d0ac4746")


API key found and looks good so far!


In [17]:
# STEP 2: Initialize replicate client and model


for img_url in website_Images.images:
    if is_safe_url(img_url):
        try:
            caption = version.predict(image=img_url, task="caption")
            print(f"{img_url} → {caption}")
        except Exception as e:
            print(f"❌ Failed to caption {img_url}: {e}")
    else:
        print(f"⚠️ Skipping unsafe URL: {img_url}")

❌ Failed to caption https://i0.wp.com/edwarddonner.com/wp-content/uploads/2023/12/cropped-edworkprofile2.png?fit=1128%2C1128&ssl=1: 'Version' object has no attribute 'predict'
❌ Failed to caption https://edwarddonner.com/wp-content/uploads/2025/05/exec-copy2.jpeg: 'Version' object has no attribute 'predict'
❌ Failed to caption https://edwarddonner.com/wp-content/uploads/2025/05/exec.png: 'Version' object has no attribute 'predict'
❌ Failed to caption https://edwarddonner.com/wp-content/uploads/2025/04/autonomy.png: 'Version' object has no attribute 'predict'
❌ Failed to caption https://edwarddonner.com/wp-content/uploads/2025/01/handson.jpg: 'Version' object has no attribute 'predict'
❌ Failed to caption https://pixel.wp.com/g.gif?v=ext&blog=227378255&post=57&tz=-4&srv=edwarddonner.com&hp=atomic&ac=2&amp=0&j=1%3A14.9-a.1&host=edwarddonner.com&ref=&fcp=292&rand=0.24834006810901665: 'Version' object has no attribute 'predict'
