In [5]:
import requests
from bs4 import BeautifulSoup
import os
import re

In [2]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install requests beautifulsoup4



In [3]:

def scrape_google_images(query, num_images=10, folder_name='images'):
    """Download images found on a single Google Images results page.

    The results page for ``query`` is fetched with one HTTP request (no
    scrolling), every ``<img>`` tag whose ``src`` contains ``https://`` is
    collected, and the first ``num_images`` of them are saved as
    ``image_1.jpg``, ``image_2.jpg``, ... inside ``folder_name``.

    Args:
        query: Search text; it is URL-encoded by ``requests``.
        num_images: Maximum number of images to save. Zero or a negative
            value saves nothing.
        folder_name: Target directory; created if it does not exist.

    Returns:
        None. Progress and per-image failures are reported with ``print``.

    Note:
        Files always get a ``.jpg`` extension, whatever the server returns.
    """
    timeout = 10  # seconds; without it a stalled connection blocks the cell forever

    # Set headers to mimic a browser visit
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}

    # Pass the query through ``params`` so that characters such as '&', '#',
    # '+' or non-ASCII text are percent-encoded instead of corrupting the URL.
    response = requests.get(
        "https://www.google.com/search",
        params={"q": query, "tbm": "isch"},
        headers=headers,
        timeout=timeout,
    )

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to retrieve the webpage.")
        return

    # Parse the content and keep only <img> tags with an https source
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img', {'src': re.compile('https://')})

    # Create folder to store images (no error if it is already there)
    os.makedirs(folder_name, exist_ok=True)

    # Download images; max(..., 0) keeps a negative limit from being read as
    # "all but the last n" by the slice.
    for position, img in enumerate(img_tags[:max(num_images, 0)], start=1):
        img_url = img['src']
        img_name = os.path.join(folder_name, f"image_{position}.jpg")
        try:
            img_response = requests.get(img_url, timeout=timeout)
            # Do not store an HTTP error page under a .jpg name.
            img_response.raise_for_status()
            with open(img_name, 'wb') as f:
                f.write(img_response.content)
            print(f"Downloaded {img_name}")
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(f"Could not download {img_url}. Reason: {e}")



In [7]:
# Example: save up to 100 results for the query 'pisang' into ./pisang_images
scrape_google_images(query='pisang', folder_name='pisang_images', num_images=100)

Downloaded pisang_images/image_1.jpg
Downloaded pisang_images/image_2.jpg
Downloaded pisang_images/image_3.jpg
Downloaded pisang_images/image_4.jpg
Downloaded pisang_images/image_5.jpg
Downloaded pisang_images/image_6.jpg
Downloaded pisang_images/image_7.jpg
Downloaded pisang_images/image_8.jpg
Downloaded pisang_images/image_9.jpg
Downloaded pisang_images/image_10.jpg
Downloaded pisang_images/image_11.jpg
Downloaded pisang_images/image_12.jpg
Downloaded pisang_images/image_13.jpg
Downloaded pisang_images/image_14.jpg
Downloaded pisang_images/image_15.jpg
Downloaded pisang_images/image_16.jpg
Downloaded pisang_images/image_17.jpg
Downloaded pisang_images/image_18.jpg
Downloaded pisang_images/image_19.jpg
Downloaded pisang_images/image_20.jpg


In [8]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install selenium



In [9]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import requests
import os

In [42]:
# Module-level counter used to number saved files. It keeps growing across
# calls and goes back to 0 only when this cell is executed again.
index = 0


def scroll_and_scrape_images(query, num_images=10, folder_name='images'):
    """Scroll a Google Images results page in Chrome and download its images.

    A Selenium-driven Chrome window opens the results page for ``query`` and
    scrolls to the bottom repeatedly until the page height stops changing.
    The final page source is parsed and the first ``num_images`` matching
    ``<img>`` tags are saved as ``<query>_<n>.jpg`` inside ``folder_name``,
    where ``n`` comes from the module-level ``index`` counter.

    An ``<img>`` tag matches when its class list is exactly ``['YQ4gaf']``,
    its ``src`` starts with ``http`` and its ``width`` attribute is an
    integer greater than 12.

    Args:
        query: Search text. It is URL-encoded for the request; characters
            that are not allowed in file names are replaced by ``_`` in the
            saved file names.
        num_images: Maximum number of images to save. Zero or a negative
            value saves nothing.
        folder_name: Target directory; created if it does not exist.

    Returns:
        None. Progress and per-image failures are reported with ``print``.

    Side effects:
        Increments the global ``index`` once per attempted download, so a
        failed download still consumes a number.
    """
    global index
    # Local import so this cell does not depend on imports made in other cells.
    from urllib.parse import quote_plus

    timeout = 10  # seconds; without it a stalled download blocks the cell forever

    # quote_plus encodes spaces as '+' and escapes '&', '#', '/' and the like.
    url = f"https://www.google.com/search?q={quote_plus(query)}&tbm=isch"

    # Set up the Selenium WebDriver (ChromeDriver must be installed and on PATH)
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Scroll down until the page height stops growing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # Wait for new images to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        page_source = driver.page_source
    finally:
        # Always close the browser, even if loading or scrolling failed.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    def _is_result_image(tag):
        """Return True for the <img> tags that should be downloaded."""
        if tag.name != 'img' or tag.get('class') != ['YQ4gaf']:
            return False
        if not tag.get('src', '').startswith('http'):
            return False
        try:
            return int(tag.get('width', 0)) > 12
        except (TypeError, ValueError):
            # A non-numeric width (e.g. "100%") is handled like a missing one.
            return False

    img_tags = soup.find_all(_is_result_image)

    # Create folder to store images (no error if it is already there)
    os.makedirs(folder_name, exist_ok=True)

    # Path separators and other reserved characters in the query would make
    # every write fail, so replace them in the file-name prefix.
    safe_query = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in query)

    # max(..., 0) keeps a negative limit from being read as "all but the
    # last n" by the slice.
    for img in img_tags[:max(num_images, 0)]:
        img_url = img['src']
        index += 1
        img_name = os.path.join(folder_name, f"{safe_query}_{index}.jpg")
        try:
            img_response = requests.get(img_url, timeout=timeout)
            # Do not store an HTTP error page under a .jpg name.
            img_response.raise_for_status()
            with open(img_name, 'wb') as f:
                f.write(img_response.content)
            print(f"Downloaded {img_name}")
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(f"Could not download {img_url}. Reason: {e}")

In [45]:
# Example: scroll through the results for the query and save up to 1000 of them
query, folder_name = "aple food", "apel"
scroll_and_scrape_images(query, folder_name=folder_name, num_images=1000)

Downloaded apel/aple food_1.jpg
Downloaded apel/aple food_2.jpg
Downloaded apel/aple food_3.jpg
Downloaded apel/aple food_4.jpg
Downloaded apel/aple food_5.jpg
Downloaded apel/aple food_6.jpg
Downloaded apel/aple food_7.jpg
Downloaded apel/aple food_8.jpg
Downloaded apel/aple food_9.jpg
Downloaded apel/aple food_10.jpg
Downloaded apel/aple food_11.jpg
Downloaded apel/aple food_12.jpg
Downloaded apel/aple food_13.jpg
Downloaded apel/aple food_14.jpg
Downloaded apel/aple food_15.jpg
Downloaded apel/aple food_16.jpg
Downloaded apel/aple food_17.jpg
Downloaded apel/aple food_18.jpg
Downloaded apel/aple food_19.jpg
Downloaded apel/aple food_20.jpg
Downloaded apel/aple food_21.jpg
Downloaded apel/aple food_22.jpg
Downloaded apel/aple food_23.jpg
Downloaded apel/aple food_24.jpg
Downloaded apel/aple food_25.jpg
Downloaded apel/aple food_26.jpg
Downloaded apel/aple food_27.jpg
Downloaded apel/aple food_28.jpg
Downloaded apel/aple food_29.jpg
Downloaded apel/aple food_30.jpg
Downloaded apel/apl