# Google Images Web Crawler
This program takes a string and divvies it up into words, then searches the web for each word, extracting as many image URLs as it can find. The URLs are used to download images. Each image is then scaled to 640x640 px and labeled with the search term used to find it.

Please use your own Project Key (console.cloud.google.com) and Search Engine (cse.google.com). They're both free :p

#### First, import dependent libraries.

In [3]:
import os
import sys
import time
import urllib3 as urllib3
import urllib.request
import simplejson
from io import StringIO
import pprint
from googleapiclient.discovery import build
from PIL import Image, ImageFont, ImageDraw
from random import randint

### Create a search string. Each word is searched independently.

In [4]:
# Lyric to crawl with (a favorite Wilco line); swap in any whitespace-separated phrase.
searchString = 'im the boy with the poetry power im the boy who smells like flowers'

# split() with no argument breaks on any run of whitespace, so every word
# becomes its own search query.
searchTerms = searchString.split()
termCount = len(searchTerms)
print(termCount)

14


### Use a custom search engine generated in cse.google.com on the project generated in console.cloud.google.com
Search the web for each word and append the dict of the result to a list of dicts (results).

In [13]:
# Credentials for the Custom Search JSON API. Set CSE_PROJECT_KEY and
# CSE_ENGINE_ID in the environment to use your own project key and search
# engine; the literals below are only used as a fallback.
# NOTE(review): an API key committed to source should be treated as leaked --
# rotate it in console.cloud.google.com and remove the fallback.
projectKey = os.environ.get("CSE_PROJECT_KEY", "AIzaSyC2j8I9Qsoiop80HT6nsm4pxYF5BG6uORM")
searchEngineID = os.environ.get("CSE_ENGINE_ID", '008351460169727747582:0ryy6yqws-o')

# instantiate the service
service = build("customsearch", "v1",
        developerKey=projectKey)

# avoid gifs
# NOTE(review): this operator is defined but never added to the query. It is
# left out of `q` on purpose here: the parsing cell reads the search term back
# from the response and uses it for labels and file names.
gifExclude = '-filetype:gif'

# Highest result index a search may start from. Lower this if you're getting
# errors: they are caused by google not finding enough results to start at
# the given point.
maxStart = 10

results = []
for term in searchTerms:
    # Random starting index for each term. The API's `start` parameter is
    # 1-based, so the lower bound is 1 (0 is not a valid index).
    startPoint = randint(1, maxStart)

    request = service.cse().list(
        start=startPoint,
        # max # of results to return per query
        num=10,
        q=term,
        cx=searchEngineID,
        #imgtype = 'face'
    )
    # one response dict per search term, in searchTerms order
    results.append(request.execute())

print(str(len(results)) + '\n')
pprint.pprint(results)

14

[{'context': {'title': 'Image Crawler'},
  'items': [{'cacheId': 'nnq_ZsLqlFEJ',
             'displayLink': 'play.google.com',
             'formattedUrl': 'https://play.google.com/store/apps/details?id=de.shapeservices...',
             'htmlFormattedUrl': 'https://play.google.com/store/apps/details?id=de.shapeservices...',
             'htmlSnippet': 'IM+ supports simultaneously major <b>IM</b> '
                            'services, including Google Talk, Twitter <br>\n'
                            'DMs, Yahoo!, AIM/iChat, ICQ, Jabber (and Slack '
                            'through Jabber), and many&nbsp;...',
             'htmlTitle': 'IM+ - Android Apps on Google Play',
             'kind': 'customsearch#result',
             'link': 'https://play.google.com/store/apps/details?id=de.shapeservices.impluslite',
             'pagemap': {'aggregaterating': [{'ratingcount': '90514',
                                              'ratingvalue': '4.170305252075195'}],
            

### Parse the results to find direct links to images
Add the URLs, search terms, and titles to three parallel lists (`urls`, `terms`, `titles`), so that index *i* in each list describes the same image.

In [15]:
# Walk every search response and collect one (url, term, title) triple per
# result that exposes an image. The three lists are parallel: index i in each
# describes the same image, so they are always appended to together.
urls = []
terms = []
titles = []

for res in results:
    # The search term is read back from the response. 'request' describes the
    # query that was run; 'nextPage' is only present when more results exist,
    # so it is used as a fallback rather than the primary source.
    queries = res.get('queries') or {}
    queryInfo = queries.get('request') or queries.get('nextPage') or [{}]
    searchTerm = queryInfo[0].get('searchTerms') or ''

    # A response with no hits has no 'items' key; skip it instead of aborting
    # the whole crawl.
    for item in res.get('items') or []:
        pagemap = item.get('pagemap') or {}
        metatags = (pagemap.get('metatags') or [{}])[0] or {}
        cseEntry = (pagemap.get('cse_image') or [{}])[0] or {}

        # Looked up fresh for every item so a URL from a previous item can
        # never leak into this one.
        ogImage = metatags.get('og:image')
        cseImage = cseEntry.get('src')

        # Prefer the page's own og:image, fall back to the thumbnail the
        # search engine extracted, and skip results with no image at all.
        if ogImage:
            imageUrl = ogImage
            source = 'og:image'
            title = metatags.get('og:title') or item.get('title') or ''
        elif cseImage:
            imageUrl = cseImage
            source = 'cse_image'
            title = item.get('title') or ''
        else:
            continue

        urls.append(imageUrl)
        titles.append(title)
        terms.append(searchTerm)

        print(source + ' result: ' + imageUrl)
        print('title: ' + title)
        print('search term: ' + searchTerm + '\n')

print(len(urls))

cse_image result: https://lh4.ggpht.com/7WFBFNz6ym0PrH_yHG-VMYjudkRAAJuo4G36LdbDhOoXMIKEXZut57PSOL5iLlweKAUN=w300
title: IM+ - Android Apps on Google Play
search term: im

cse_image result: https://i.ytimg.com/vi/5DKGe_oEajs/hqdefault.jpg
title: Skype - free IM & video calls - Android Apps on Google Play
search term: im

cse_image result: https://lh3.googleusercontent.com/tm_N1osJGfifuRlMfEip4kZFD5QCtd42CYYEhUsxABzIoEn6Nb9UXjRVmjoeKvMFUCzF=w300
title: imo free video calls and chat - Android Apps on Google Play
search term: im

og:image result: https://ssl.gstatic.com/chrome/webstore/images/chrome_web_store-128.png
title: Chrome Web Store
search term: im

og:image result: https://www.gstatic.com/images/icons/material/apps/fonts/1x/opengraph_color_1200dp.png
title: Google Fonts
search term: im

og:image result: https://0.soompi.io/wp-content/uploads/2017/04/21093159/YoonA-Hong-Jong-Hyun-Im-Siwan.jpg
title: Im Siwan, Hong Jong Hyun, And YoonA Are A Lovely Trio In Upcoming MBC Drama
search

### Download and label each image.

A folder named after the `proj` variable is created in the working directory (where you're running this code from). Its subfolders hold the original images and the labeled/resized ones. In addition, a file called urls.txt is written that includes the URL, search term, and title for each source.

In [144]:
# Create the project folder tree under the current working directory:
#   <proj>/originals  - images exactly as downloaded
#   <proj>/labeled    - 640x640 copies with the search term drawn on top
#   <proj>/urls.txt   - url, search term and title for every source
# Paths are built with os.path.join so the cell works on any OS, and the
# working directory is never changed, so a failure cannot leave the notebook
# stranded inside the project folder.
proj = 'flowers1'
cwd = os.path.abspath(proj)
originalsDir = os.path.join(cwd, 'originals')
labeledDir = os.path.join(cwd, 'labeled')
# exist_ok lets the cell be re-run without crashing on the existing folders
os.makedirs(originalsDir, exist_ok=True)
os.makedirs(labeledDir, exist_ok=True)

# write urls to txt file (utf-8, since page titles are arbitrary web text)
with open(os.path.join(cwd, 'urls.txt'), 'w', encoding='utf-8') as urlsFile:
    for url, term, title in zip(urls, terms, titles):
        urlsFile.write('%s\n%s\n%s\n\n' % (url, term, title))

# set font for labels; fall back to PIL's built-in font when the TrueType
# file is not installed, rather than aborting the whole cell
try:
    font = ImageFont.truetype("couri.ttf", 64)
except OSError:
    print('couri.ttf not found, using the default PIL font for labels')
    font = ImageFont.load_default()

# File names are numbered from 1; a failed download still consumes its number
# so the numbering stays aligned with the order of entries in urls.txt.
for i, (url, term) in enumerate(zip(urls, terms), start=1):
    originalPath = os.path.join(originalsDir, str(i) + '_' + term + '.jpg')
    labeledPath = os.path.join(labeledDir, str(i) + '_' + term + '_labeled.jpg')
    try:
        urllib.request.urlretrieve(url, originalPath)

        # open and resize to 640x640; convert to RGB first because JPEG
        # cannot store the alpha/palette modes that PNG sources arrive in
        with Image.open(originalPath) as original:
            image = original.convert('RGB').resize((640, 640), Image.BILINEAR)

        # overlay search term on a black strip sized to the word's length
        draw = ImageDraw.Draw(image)
        draw.rectangle(((0, 100), (40*len(term), 170)), fill='black', outline=None)
        draw.text((0, 100), term, (255, 255, 255), font=font)

        image.save(labeledPath)
    except Exception as err:
        # Best effort: one bad URL or unreadable image must not stop the
        # rest, but report it instead of failing silently.
        print('skipped %s (%s): %s' % (url, term, err))