In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Download the Wikipedia page that lists countries with their continent codes.
wikipage = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_continent_(data_file)"
# A timeout keeps the cell from hanging forever on a stalled connection.
result = requests.get(wikipage, timeout=30)

# Fail loudly on an HTTP error. Previously a non-200 response left `soup`
# undefined and the next statement died with an unrelated NameError.
result.raise_for_status()

# Parse the download into a BeautifulSoup object, which allows easy manipulation.
soup = BeautifulSoup(result.content, "html.parser")

# Find the table carrying the HTML classes "wikitable sortable".
table = soup.find('table',{'class':'wikitable sortable'})
if table is None:
    raise ValueError(f"No 'wikitable sortable' table found at {wikipage}")

# Skip the header row, then pull the text of every <td> cell, one list per row.
new_table = [
    [column.get_text() for column in row.find_all('td')]
    for row in table.find_all('tr')[1:]
]

df = pd.DataFrame(new_table, columns=['ContinentCode','Alpha2','Alpha3','PhoneCode','Name'])
# Strip the newline characters from the Name column.
df['Name'] = df['Name'].str.replace('\n','')
df

Unnamed: 0,ContinentCode,Alpha2,Alpha3,PhoneCode,Name
0,AS,AF,AFG,004,"Afghanistan, Islamic Republic of"
1,EU,AL,ALB,008,"Albania, Republic of"
2,AN,AQ,ATA,010,Antarctica (the territory South of 60 deg S)
3,AF,DZ,DZA,012,"Algeria, People's Democratic Republic of"
4,OC,AS,ASM,016,American Samoa
...,...,...,...,...,...
257,AS,YE,YEM,887,Yemen
258,AF,ZM,ZMB,894,"Zambia, Republic of"
259,AS,XD,,,United Nations Neutral Zone
260,AS,XS,,,Spratly Islands


In [74]:

from selenium import webdriver
import os
import time
# Start a Chrome browser session controlled through the local chromedriver binary.
wd = webdriver.Chrome(
    executable_path=r'E:/Scraping/chromedriver.exe'
)

In [75]:
# Echo the driver object so the notebook displays its repr (including the session id).
wd

<selenium.webdriver.chrome.webdriver.WebDriver (session="6f78a09cd9372021a2dc22fbb246e35f")>

In [76]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver.Remote", sleep_between_interactions:float=1):
    """Collect image URLs from a Google Images search driven through Selenium.

    Loads the image-search page for ``query`` in ``wd``, then repeatedly scrolls
    to the bottom of the page, clicks every thumbnail (``img.Q4LuWd``) that has
    not been visited yet and records the ``src`` of each ``img.n3VNCb`` element
    whose ``src`` contains ``"http"``.

    Args:
        query: Search term. It is URL-encoded before being put in the query string.
        max_links_to_fetch: Stop as soon as at least this many distinct URLs
            have been collected.
        wd: An already started Selenium driver exposing ``get``,
            ``execute_script`` and ``find_elements_by_css_selector``.
        sleep_between_interactions: Seconds to pause after each scroll and click.

    Returns:
        A set of distinct image URLs. It can hold fewer than
        ``max_links_to_fetch`` entries when the results run out, and slightly
        more when a single click exposes several images. A set is always
        returned, never ``None``.
    """
    # Local import: the notebook's import cells do not provide urllib.
    from urllib.parse import quote_plus

    # Give up after this many consecutive scrolls that reveal no new thumbnail,
    # otherwise an exhausted result page would keep the loop spinning forever.
    max_stalled_rounds = 3

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page (quote the term so '&', '#', spaces etc. cannot break the URL)
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    stalled_rounds = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # best effort: a thumbnail that cannot be clicked is simply skipped
                continue

            # extract image urls (read the attribute once per element)
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Every currently loaded thumbnail was visited without reaching the target.
            print("Found:", len(image_urls), "image links, looking for more ...")

            if number_results == results_start:
                stalled_rounds += 1
                if stalled_rounds >= max_stalled_rounds:
                    print(f"No more results available, stopping with {len(image_urls)} image links.")
                    break
            else:
                stalled_rounds = 0

            # find_elements_* returns an empty list when the button is absent,
            # whereas find_element_* would raise NoSuchElementException.
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = number_results

    return image_urls

In [77]:
# Point the interactive browser session at the Google home page.
wd.get('https://google.com')

In [78]:
# Locate the input element matching the CSS selector 'input.gLFyf'
# (presumably Google's search field - confirm against the live page) and type a sample query.
search_box = wd.find_element_by_css_selector('input.gLFyf')
search_box.send_keys('Dogs')

In [92]:
import io
from PIL import Image 
import hashlib


In [93]:
def persist_image(folder_path:str,url:str):
    """Download one image and store it in ``folder_path`` as a JPEG.

    The file name is the first 10 hex characters of the SHA-1 digest of the
    downloaded bytes plus ``.jpg``, so identical downloads map to the same file.
    The image is converted to RGB and written with JPEG quality 85.

    Failures are reported with a printed ``ERROR`` line instead of being raised,
    so one bad URL does not abort a batch.

    Args:
        folder_path: Existing directory the JPEG is written into.
        url: Address of the image to download.

    Returns:
        None.
    """
    try:
        # The timeout stops a stalled server from blocking the whole batch.
        response = requests.get(url, timeout=30)
        # Treat HTTP errors as download failures rather than feeding an error page to PIL.
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded; without this return the save step ran with
        # `image_content` unbound and printed a second, misleading error.
        return

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

In [94]:
def search_and_download(search_term:str,driver_path:str,target_path='./images',number_images=5):
    """Search Google Images for ``search_term`` and save the hits to disk.

    Creates ``<target_path>/<search_term lower-cased, words joined by '_'>``,
    opens a Chrome session via the chromedriver at ``driver_path``, collects
    image URLs with ``fetch_image_urls`` and stores each one with
    ``persist_image``. The browser is closed before the downloads start.

    Args:
        search_term: Query text; also determines the sub-folder name.
        driver_path: Path to the chromedriver executable.
        target_path: Parent directory for the per-query image folder.
        number_images: Number of image links to request from ``fetch_image_urls``.

    Returns:
        None.
    """
    # split() without an argument collapses repeated/leading/trailing whitespace,
    # so "fire  extinguisher" does not become "fire__extinguisher".
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split()))

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(target_folder, exist_ok=True)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    # Tolerate a fetcher that yields nothing (None or an empty collection)
    # instead of failing with "TypeError: 'NoneType' object is not iterable".
    for elem in res or ():
        persist_image(target_folder,elem)

In [97]:
# Request 50 "fire extinguisher" image links and save them under E:/Scraping/images.
search_and_download(
    search_term='fire extinguisher',
    driver_path='E:/Scraping/chromedriver.exe',
    target_path='E:/Scraping/images',
    number_images=50,
)

Found: 100 search results. Extracting links from 0:100
Found: 50 image links, done!
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcS8PW6SEJgg0vHls3-__O3PdTva0VLJ4Pt9Mg&usqp=CAU - as E:/Scraping/images\fire_extinguisher\f55945e084.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcRdykaKTs2xzlBp4Sfuzw2fxCSc2SR-I6TZTw&usqp=CAU - as E:/Scraping/images\fire_extinguisher\682a4cc810.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQbDTLJ2FGhzgeh_yfIdk30htNvwkV38vNgkg&usqp=CAU - as E:/Scraping/images\fire_extinguisher\dd205b8d76.jpg


  "Palette images with Transparency expressed in bytes should be "


SUCCESS - saved https://i0.wp.com/variex.in/wp-content/uploads/2018/09/modular-2k.png - as E:/Scraping/images\fire_extinguisher\e6f21f689b.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTuuSvnH9Qrntkm6DQj8FZu9EQbYTA2XuQFNg&usqp=CAU - as E:/Scraping/images\fire_extinguisher\258ac1a43a.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcSior05aIFrpih3wwdmV8-ozVGlBZjwvOOr6w&usqp=CAU - as E:/Scraping/images\fire_extinguisher\e7010fad74.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQBd3VxQ8ctQ6TChsZTqmGK8137dabcXsCa9A&usqp=CAU - as E:/Scraping/images\fire_extinguisher\5e7009b4f5.jpg
SUCCESS - saved https://images-na.ssl-images-amazon.com/images/I/61%2BxTPonQvL._SL1000_.jpg - as E:/Scraping/images\fire_extinguisher\edd14bf113.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQi54Vkl7dWmCZYG_KORkCUzitjXS4gZ9zhyg&usqp=CAU - as E:/Scraping/images\fire_extinguisher\3d17048cd3.jpg
SUCCESS 



ERROR - Could not save https://static.wixstatic.com/media/cf66d3_38546f18e3904838aea738054a46b14d~mv2.png/v1/fill/w_480,h_480,al_c,q_90,usm_0.66_1.00_0.01/cf66d3_38546f18e3904838aea738054a46b14d~mv2.webp - cannot identify image file <_io.BytesIO object at 0x000000A2ADB3A7C8>
SUCCESS - saved https://rukminim1.flixcart.com/image/352/352/jyafukw0/fire-extinguisher-mount/5/f/m/abc-powder-type-6-kg-fire-cylinder-rainx-original-imafgk5zmhqmgvys.jpeg?q=70 - as E:/Scraping/images\fire_extinguisher\895a1892e8.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQVdItMVaUbXoleqe0DnSMkw4eLCTzI5oLJCw&usqp=CAU - as E:/Scraping/images\fire_extinguisher\d959bfe799.jpg
SUCCESS - saved https://www.thespruce.com/thmb/HOkJ9QiswRy1LFbHLlahMSrPi3Y=/640x480/smart/filters:no_upscale()/AmerexB5005lbABCDryChemicalClassABCFireExtinguisher-591ccc8a5f9b58f4c01fa953.jpg - as E:/Scraping/images\fire_extinguisher\77e39b28f8.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3A