In [None]:
import signal
from timeit import default_timer as timer
import requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup

In [None]:
class timeout:
    """Context manager that aborts the enclosed block after a time limit.

    On entry a ``SIGALRM`` handler is installed and a real-time timer is armed.
    If the block is still running when the timer fires, ``TimeoutError`` is
    raised inside it. On exit the timer is cancelled and the ``SIGALRM``
    handler that was active before entry is put back, so the timeout does not
    leak into code that runs after the ``with`` block.

    Built on ``signal.SIGALRM``, so it is usable only where that signal exists
    (Unix) and only from the main thread.

    Args:
        seconds: Time limit in seconds; fractional values are accepted and
            ``0`` arms no timer.
        error_message: Message carried by the raised ``TimeoutError``.
    """

    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message
        # SIGALRM handler that was active before __enter__; restored by __exit__.
        self._previous_handler = None

    def handle_timeout(self, signum, frame):
        """Signal handler: raise ``TimeoutError`` with the configured message."""
        raise TimeoutError(self.error_message)

    def __enter__(self):
        # signal.signal() returns the handler it replaces; keep it for __exit__.
        self._previous_handler = signal.signal(signal.SIGALRM, self.handle_timeout)
        # setitimer (unlike signal.alarm) accepts fractional seconds.
        signal.setitimer(signal.ITIMER_REAL, self.seconds)
        return self

    def __exit__(self, type, value, traceback):
        # Cancel the timer before swapping handlers so a late alarm cannot be
        # delivered to the restored handler.
        signal.setitimer(signal.ITIMER_REAL, 0)
        # getsignal/signal report None for handlers not installed from Python;
        # those cannot be re-installed, so they are left alone.
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)
            self._previous_handler = None


In [None]:
# Wall-clock reference point; get_images_data() prints the time elapsed since this value.
start = timer()

# Browser-like User-Agent sent with the search request.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Query-string parameters for the search request.
params = {
    "q": "bmw",  # search term; changed after 100 images have been collected for each query
    "tbm": "isch",  # presumably selects Google Images results — confirm
    "hl": "en",  # presumably the interface language — confirm
    "ijn": "0",  # presumably the zero-based results page — confirm
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error instead of silently parsing an error page.
html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
html.raise_for_status()
soup = BeautifulSoup(html.text, 'lxml')


def _unescape_twice(escaped_text):
    """Decode backslash escape sequences in ``escaped_text`` two times over.

    The URLs handled here are cut out of the ``str()`` of a list, which doubles
    every backslash, so a single decode still leaves escape sequences behind.
    """
    for _ in range(2):
        # 'backslashreplace' turns a non-ASCII character into an escape that the
        # decode step converts straight back, so such URLs no longer raise
        # UnicodeEncodeError. https://stackoverflow.com/a/4004439/15164646
        escaped_text = escaped_text.encode('ascii', 'backslashreplace').decode('unicode-escape')
    return escaped_text


def get_images_data(output_dir='/content/drive/MyDrive/cars', download_timeout=2):
    """Print image metadata and URLs found in the page and download the images.

    Reads the module-level ``soup`` (parsed search results page) and prints, in
    order: title/source/link for each result card, the thumbnail URLs, and the
    full-resolution URLs. Each full-resolution image is then downloaded to
    ``<output_dir>/original_size_img_<index>.jpg``; a download that fails or
    exceeds the time limit is reported and skipped. Finally the time elapsed
    since the module-level ``start`` is printed.

    Args:
        output_dir: Directory that receives the downloaded files. It is not
            created here, so it must already exist.
        download_timeout: Seconds allowed per download, passed to the
            ``timeout`` context manager.
    """
    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        try:
            title_link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')
            title = title_link['title']
            source = google_image.select_one('.fxgdke').text
            link = title_link['href']
        except (TypeError, KeyError, AttributeError):
            # A card lacking one of the expected elements or attributes is skipped.
            continue
        print(f'{title}\n{source}\n{link}\n')

    all_script_tags = soup.select('script')

    # https://regex101.com/r/48UZhY/4
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data)

    # The URL patterns below are applied to the str() form of the match list;
    # that form doubles backslashes, which is why URLs are unescaped twice.
    image_data_text = str(matched_google_image_data)

    # https://regex101.com/r/NnRg27/1
    thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'
    matched_google_images_thumbnails = re.findall(thumbnail_pattern, image_data_text)

    print('Google Image Thumbnails:')  # in order
    for escaped_thumbnail in matched_google_images_thumbnails:
        print(_unescape_twice(escaped_thumbnail))

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, '', image_data_text)

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)

    print('\nFull Resolution Images:')  # in order
    for index, escaped_full_res_image in enumerate(matched_google_full_resolution_images):
        original_size_img = _unescape_twice(escaped_full_res_image)
        print(original_size_img)

        # ------------------------------------------------
        # Download original images
        print(f'Downloading {index} image...')
        try:
            # Optional (left disabled): send a browser User-Agent with the download.
            #opener=urllib.request.build_opener()
            #opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
            #urllib.request.install_opener(opener)
            with timeout(download_timeout):
                urllib.request.urlretrieve(original_size_img, f'{output_dir}/original_size_img_{index}.jpg')
        except Exception as error:
            # Best-effort download: report and move on. Catching Exception
            # (not a bare except) leaves KeyboardInterrupt able to stop the loop.
            print(f'Skipped image {index}: {error!r}')

    print(timer() - start)
# Scrape the page already parsed into `soup` and download the images it references.
get_images_data()



Google Images Metadata:
BMW Group
bmwgroup.com
https://www.bmwgroup.com/en.html

BMW - Wikipedia
en.wikipedia.org
https://en.wikipedia.org/wiki/BMW

Luxury SUVs, Sedans, Coupes, Convertibles & Crossovers | BMW USA
bmwusa.com
https://www.bmwusa.com/

Luxury SUVs, Sedans, Coupes, Convertibles & Crossovers | BMW USA
bmwusa.com
https://www.bmwusa.com/

BMW M8 Competition Review 2021 | Top Gear
topgear.com
https://www.topgear.com/car-reviews/bmw/m8-competition

2022 BMW 7-Series Review, Pricing, and Specs
caranddriver.com
https://www.caranddriver.com/bmw/7-series

Luxury SUVs, Sedans, Coupes, Convertibles & Crossovers | BMW USA
bmwusa.com
https://www.bmwusa.com/

Build Your Own – Start Here – Choose A Series – BMW USA
bmwusa.com
https://www.bmwusa.com/build-your-own.html

2022 BMW M3 Buyer's Guide: Reviews, Specs, Comparisons
motortrend.com
https://www.motortrend.com/cars/bmw/m3/

2022 BMW X1 Review, Pricing, and Specs
caranddriver.com
https://www.caranddriver.com/bmw/x1

New BMW 5 Series 