# Imports

Link: https://serpapi.com/blog/scrape-google-images-with-python/

- os:             to return environment variable (SerpApi API key) value.
- requests:       to make a request to the website.
- lxml:   	    to process XML/HTML documents fast.
- json:   	    to convert extracted data to a JSON object.
- re:     	    to extract parts of the data via regular expression.
- urllib.request:	to save images locally with `urllib.request.urlretrieve`
- BeautifulSoup:	an XML/HTML scraping library. It's used together with `lxml` because `lxml` is faster than `html.parser`.

In [1]:
import os, requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup
from serpapi import GoogleSearch

# Create URL parameter and request headers
- `params`:	a prettier way of passing URL parameters to a request.
- `user-agent`:	to act as a "real" user request from the browser by passing it in the request headers. The default `requests` user-agent is `python-requests`, so websites can tell that the request comes from a bot or a script and may block it. Check what your `user-agent` is.

In [2]:
# Browser-like User-Agent sent with the search request, so the request is
# not identified by the default `requests` user-agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/103.0.5060.114 Safari/537.36"
    ),
}

# URL query parameters for the Google search request.
# NOTE(review): "mincraft" looks like a typo for "minecraft" - confirm
# before changing, since it alters the query that is sent.
params = dict(
    q="mincraft wallpaper 4k",  # search query
    tbm="isch",                 # image results
    hl="en",                    # language of the search
    gl="us",                    # country where search comes from
    ijn="0",                    # page number
)

# Request
Make a request, pass created request parameters and headers. Pass returned HTML to `BeautifulSoup`

- `timeout=30`:	to stop waiting for response after 30 seconds.
- `BeautifulSoup(html.text, "lxml")`:	`html.text` returns the HTML as text, and `"lxml"` is set as the XML/HTML parser instead of the default `html.parser`.

In [3]:
# Fetch the Google search results page with the prepared query parameters
# and headers, giving up after 30 seconds without a response.
html = requests.get(
    "https://www.google.com/search",
    headers=headers,
    params=params,
    timeout=30,
)

# Parse the response body using the lxml parser.
soup = BeautifulSoup(html.text, features="lxml")

# Extract Data 
Only with request headers, no regular expressions

The reason why it's handy is because when you try to parse data directly from the `img` tag's `src` attribute, you'll get a base64-encoded URL which is a 1x1 image placeholder. Not a particularly useful image resolution.

- `params["content-type"]`:	will create a new dict key "content-type" and assign it the value "image/png", which will return images.
- `[img["src"] for img in soup.select("img")]`:	will iterate over all `img` tags and extract the value of each `src` attribute.

In [4]:
def get_images_with_request_headers():
    """Return the `src` value of every `<img>` tag in the parsed page.

    Reads the module-level `soup` and, as a side effect, sets
    `params["content-type"]` on the module-level `params` dict.

    Returns:
        A list with the `src` attribute of each `<img>` tag that has one,
        in document order.
    """
    # Kept for backward compatibility. NOTE: this only mutates the
    # module-level `params` dict; it does not change the already-parsed
    # `soup`, so it can only influence a request made with `params` afterwards.
    params["content-type"] = "image/png"  # parameter that indicates the original media type

    # `img[src]` skips <img> tags without a `src` attribute, which would
    # otherwise make `img["src"]` raise KeyError.
    return [img["src"] for img in soup.select("img[src]")]

# Suggested search results
- Suggested search results are the things shown above the actual images.
- `suggested_searches`:	a temporary `list` where extracted data will be appended at the end of the function.
- `all_script_tags`:	a variable which will hold all extracted `<script>` HTML tags from `soup.select("script")` where `select()` will return a list of matched `<script>` tags.
- `matched_images`:	will hold all extracted matched images data from `re.findall()`, which returns a list of matches (joined into a single string here). This variable is needed to extract suggested search thumbnails, image thumbnails and full-resolution images.
- `suggested_search_thumbnails` and `suggested_search_thumbnail_encoded`:	parses part of inline JSON where `suggested_search_thumbnail_encoded` parses actual thumbnails from partly parsed inline JSON data.
- `zip()`:	to iterate over multiple iterables in parallel. Keep in mind that `zip()` is used on purpose: `zip()` ends with the shortest iterator, while `zip_longest()` iterates up to the length of the longest iterator.
- `suggested_searches.append({})`:	to `append` extracted images data to a `list` as a dictionary.
- `select_one()`:	to return one (instead of all) matched element in a loop.
- `["href"]`:	is a shortcut of accessing and extracting HTML attributes with `BeautifulSoup`. Alternative is `get(<attribute>)`.
- `"".join()`:	to join all items from an iterable into a string.
- `bytes(<variable>, "ascii").decode("unicode-escape")`:	to decode parsed image data.

In [5]:
def get_suggested_search_data():
    """Extract the suggested searches from the parsed page.

    Reads the module-level `soup`. Thumbnail URLs are pulled out of the
    inline `AF_initDataCallback(...)` script data with regular expressions,
    then paired with the suggested-search elements selected from the HTML.

    Returns:
        A list of dicts with the keys "name", "link", "chips" and
        "thumbnail", one per suggested search. Pairing uses `zip()`, so the
        list is as long as the shorter of the two extracted sequences.
    """
    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/6
    matched_images = "".join(re.findall(r"AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>", str(all_script_tags)))

    # NOTE: the former json.dumps() -> json.loads() round trip was removed.
    # `matched_images` is a plain str, and dumping then loading a str gives
    # back the very same str, so the round trip did nothing.

    # search for only suggested search thumbnails related
    # https://regex101.com/r/ITluak/2
    suggested_search_thumbnails = ",".join(re.findall(r'{key(.*?)\[null,\"Size\"', matched_images))

    # https://regex101.com/r/MyNLUk/1
    suggested_search_thumbnail_encoded = re.findall(r'\"(https:\/\/encrypted.*?)\"', suggested_search_thumbnails)

    suggested_searches = []

    # zip() is used on purpose over zip_longest() as number of results would be identical
    for suggested_search, encoded_thumbnail in zip(soup.select(".PKhmud.sc-it.tzVsfd"), suggested_search_thumbnail_encoded):
        href = suggested_search.a["href"]

        suggested_searches.append({
            "name": suggested_search.select_one(".VlHyHc").text,
            "link": f"https://www.google.com{href}",
            # https://regex101.com/r/y51ZoC/1
            "chips": "".join(re.findall(r"&chips=(.*?)&", href)),
            # turn escape sequences in the extracted URL into real characters
            # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
            "thumbnail": bytes(encoded_thumbnail, "ascii").decode("unicode-escape")
        })

    return suggested_searches

# Extracting original resolution images

Almost identical to extracting suggested search results except for different regular expressions:
1. Create a temporary `list` `google_images` where extracted data will be appended.
2. Extracting `all_script_tags`.
3. Extracting `matched_images_data` to extract thumbnails and original resolution images.

In [6]:
def _decode_unicode_escape_twice(value):
    """Apply the ascii -> "unicode-escape" decoding to `value` two times."""
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    once = bytes(value, "ascii").decode("unicode-escape")
    return bytes(once, "ascii").decode("unicode-escape")


def get_original_images():
    """Extract image results from the parsed page and download the originals.

    Reads the module-level `soup`. Thumbnail and full-resolution URLs are
    pulled out of the inline `AF_initDataCallback(...)` script data with
    regular expressions and paired, via `zip()`, with the result elements
    selected from the HTML.

    Side effects:
        - creates the "Bs4_Images" directory if it does not exist;
        - installs a global `urllib.request` opener that sends a browser
          User-Agent;
        - downloads every full-resolution image to
          "Bs4_Images/original_size_img_<index>.jpg" and prints progress.

    Returns:
        A list of dicts with the keys "title", "link", "source",
        "thumbnail" and "original".
    """
    google_images = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # NOTE: the former json.dumps() -> json.loads() round trip was removed;
    # applied to a str it returns the same str.

    # https://regex101.com/r/VPz7f2/1
    matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data)

    # The patterns below are matched against str(<list>), i.e. the list's
    # repr, and the full-resolution pattern relies on that form. Presumably
    # the repr's extra backslash escaping is why the URLs are decoded twice.
    grid_state_text = str(matched_google_image_data)

    # https://regex101.com/r/NnRg27/1
    thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'

    # Use the findall() result directly: joining it with ", " and splitting
    # again produced [""] when nothing matched and would break any URL that
    # itself contains ", ".
    thumbnails = [
        _decode_unicode_escape_twice(thumbnail)
        for thumbnail in re.findall(thumbnail_pattern, grid_state_text)
    ]

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, "", grid_state_text)

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

    full_res_images = [
        _decode_unicode_escape_twice(img) for img in matched_google_full_resolution_images
    ]

    # urlretrieve() does not create missing directories.
    os.makedirs("Bs4_Images", exist_ok=True)

    # The opener is global, so build and install it once rather than on
    # every loop iteration.
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
    urllib.request.install_opener(opener)

    results = zip(soup.select(".isv-r.PNCib.MSM1fd.BUooTd"), thumbnails, full_res_images)
    for index, (metadata, thumbnail, original) in enumerate(results, start=1):
        # the same element carries both the title and the link
        title_link = metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")

        google_images.append({
            "title": title_link["title"],
            "link": title_link["href"],
            "source": metadata.select_one(".fxgdke").text,
            "thumbnail": thumbnail,
            "original": original
        })

        # Download original images
        print(f"Downloading {index} image...")
        urllib.request.urlretrieve(original, f"Bs4_Images/original_size_img_{index}.jpg")

    return google_images

4. Decode extracted encoded `thumbnails`:

In [None]:
# Illustrative fragment: `matched_google_images_thumbnails` is the list
# built inside get_original_images().
# Decode every matched thumbnail URL twice with "unicode-escape".
thumbnails = [
    bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails
]

# equivalent decoding, written out step by step (note: this loop only
# rebinds `google_image_thumbnail` on each pass; it does not build a list)
for fixed_google_image_thumbnail in matched_google_images_thumbnails:
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
    # after the first decoding, escaped Unicode characters are still present; the second decoding pass resolves them.
    google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')

5. Decode extracted encoded `full_res_images`:

In [None]:
# Illustrative fragment: `matched_google_full_resolution_images` is the list
# built inside get_original_images().
# Decode every matched full-resolution URL twice with "unicode-escape".
full_res_images = [
      bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
  ]

# equivalent decoding, written out step by step (note: this loop only
# rebinds `original_size_img` on each pass and never uses `index`)
for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
    # second decoding pass over the result of the first one
    original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')


Save full resolution images locally:

- `urllib.request.build_opener()`:	manages the chaining of handlers and will automatically add headers on each request (row below).
- `opener.addheaders[()]`:	to add headers to the request.
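- `urllib.request.install_opener()`:	sets the opener as the default global opener, so that later `urllib.request.urlopen()` / `urllib.request.urlretrieve()` calls use it (and therefore send the headers added above).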
- `urllib.request.urlretrieve()`:	to save images locally.

In [None]:
# Illustrative fragment: `original` and `index` come from the loop inside
# get_original_images().
# Build an opener and give it a browser User-Agent header.
opener=urllib.request.build_opener()
opener.addheaders=[("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
# Make it the global default opener for urllib.request.
urllib.request.install_opener(opener)

# Save the image at URL `original` into the Bs4_Images/ directory.
urllib.request.urlretrieve(original, f"Bs4_Images/original_size_img_{index}.jpg")

# Using Google Images API
- No need to figure out regular expressions, create a parser and maintain it over time, or how to scale the number of requests without being blocked

In [None]:
def serpapi_get_google_images():
    """Collect original image URLs via SerpApi and download them.

    For each hard-coded query, result pages are requested one after another
    until the response contains an "error" key. The "original" URL of every
    image is collected once, in first-seen order.

    Side effects:
        - reads the SerpApi key from the `API_KEY` environment variable;
        - prints the "error" message that ends each query's pagination;
        - creates the "SerpApi_Images" directory if it does not exist;
        - installs a global `urllib.request` opener that sends a browser
          User-Agent;
        - downloads every collected URL to
          "SerpApi_Images/original_size_img_<index>.jpg";
        - prints the collected URLs as JSON, followed by their count.

    Returns:
        None.
    """
    image_results = []
    seen_urls = set()  # O(1) duplicate check; `image_results` keeps the order

    for query in ["Coffee", "boat", "skyrim", "minecraft"]:
        # search query parameters
        params = {
            "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
            "q": query,                       # search query
            "tbm": "isch",                    # image results
            "num": "100",                     # number of images per page
            "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
            "api_key": os.getenv("API_KEY")   # your serpapi api key
            # other query parameters: hl (lang), gl (country), etc
        }

        search = GoogleSearch(params)         # where data extraction happens

        while True:
            results = search.get_dict()       # JSON -> Python dictionary

            # checks for "Google hasn't returned any results for this query."
            if "error" in results:
                print(results["error"])
                break

            for image in results["images_results"]:
                original_url = image["original"]
                if original_url not in seen_urls:
                    seen_urls.add(original_url)
                    image_results.append(original_url)

            # update to the next page
            # NOTE(review): relies on `search` reading this same `params`
            # dict again on the next get_dict() call - confirm.
            params["ijn"] += 1

    # -----------------------
    # Downloading images

    # urlretrieve() does not create missing directories.
    os.makedirs("SerpApi_Images", exist_ok=True)

    # The opener is global, so build and install it once, not per image.
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
    urllib.request.install_opener(opener)

    # Iterate the collected URLs. `results` is unusable here: it holds only
    # the last response, which is the "error" one that ended pagination.
    for index, image_url in enumerate(image_results, start=1):
        print(f"Downloading {index} image...")
        urllib.request.urlretrieve(image_url, f"SerpApi_Images/original_size_img_{index}.jpg")

    print(json.dumps(image_results, indent=2))
    print(len(image_results))