## Who Are They?

### This project scrapes images of actors and actresses from Google Images to create a dataset that will then be used in a facial recognition project to identify actors and actresses in movies. This notebook only deals with the collection and labeling of faces and the building of the dataset. The data will be stored in an S3 bucket.


### *By: Connor Secen*

### Setup:
#### This is the initial setup of the notebook, importing all the packages needed and setting main variables.

In [None]:
# all required imports
from bs4 import BeautifulSoup
from selenium import webdriver
from PIL import Image
import shutil
import hashlib
import multiprocessing
import requests
import cv2
import config
import boto3
import time
import os
import io

In [None]:
# ---- preset variables ----

# number of images collected
max_results = 350

# exclusive upper bound of the IMDb list pages that get scraped:
# pages 1 .. actor_num - 1 are fetched, i.e. roughly (actor_num - 1) * 100 actors
actor_num = 4

# number of actors whose images are collected at once
group_size = 5

# IMDb top 1000 actors webpage; ``{0}`` is replaced by the page number
imdb_url = "https://www.imdb.com/list/ls058011111/?page={0}"

# ---- AWS ----

# S3 client instance used to upload the collected images
s3 = boto3.client("s3")

### Collect actors names
#### Because we are building a dataset of actors' images, we first need to collect the names of a fair number of actors. To do this, I will scrape the first few pages of IMDb's list of the top 1000 actors.

In [None]:
# scrapes specified page of imdbs list of top 1000 actors
def collect_actors(url, actors):
    """Scrape one page of an IMDb list and add the actor names found to *actors*.

    Every ``<h3 class="lister-item-header">`` element on the page is inspected
    and the text of its first ``<a>`` child is taken as the actor's name. The
    names are added exactly as they appear in the markup (surrounding
    whitespace is NOT removed here).

    Args:
        url: Address of the list page to download.
        actors: List-like collection (a plain list or a
            ``multiprocessing.Manager`` list) that is extended in place.

    Returns:
        The ``actors`` collection after the names of this page were added.

    Raises:
        requests.RequestException: If the page cannot be downloaded, including
            when the server does not answer within the timeout.
    """
    # a timeout keeps a stalled connection from blocking the worker forever
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')

    headers = soup.find_all('h3', attrs={'class': 'lister-item-header'})
    # ``find('a')`` returns None for a header without a link; skip those
    # instead of failing the whole page with an AttributeError
    links = (h3.find('a') for h3 in headers)
    curr_actors = [link.text for link in links if link is not None]

    actors += curr_actors
    return actors

In [None]:
manager = multiprocessing.Manager()
actors = manager.list()   # list of actor names shared between the worker processes
jobs = []
# start one process per page (pages 1 .. actor_num - 1) of the IMDb top 1000 actors list
for page in range(1, actor_num):
    p = multiprocessing.Process(target=collect_actors, args=(imdb_url.format(page), actors))
    jobs.append(p)
    p.start()

# wait for each process to complete
for proc in jobs:
    proc.join()

# Clean the names BEFORE removing duplicates so that two spellings differing
# only in surrounding whitespace collapse into one entry. ``strip()`` removes
# the leading space / trailing newline without cutting real characters off
# names that do not carry that whitespace. Empty names are dropped, and the
# result is sorted so the batches processed later are the same on every run.
cleaned_names = (name.strip() for name in actors)
actors = sorted({name for name in cleaned_names if name})

### Collect images
#### Now that we have collected the names of the top 300 actors from IMDb's top 1000 actors list, we can begin collecting images by scraping Google Images for each actor in the list in order to build the dataset that will later be used to train a facial recognition CNN.

In [None]:
def fetch_image_urls(query, max_links_to_fetch, wd, sleep_between_interactions:float=1):
    """Collect full-size image URLs from a Google Images search.

    The result page is scrolled, every new thumbnail (``img.rg_ic``) is
    clicked and the ``src`` of the enlarged image(s) (``img.irc_mi``) is
    recorded. When a pass over the thumbnails does not yield enough links, the
    "load more" button (``.ksb``) is clicked if it is present.

    Args:
        query: Search term; it is URL-encoded before being put in the address.
        max_links_to_fetch: Number of distinct image URLs to stop at.
        wd: Selenium WebDriver instance used to drive the browser.
        sleep_between_interactions: Seconds to wait after each scroll and each
            thumbnail click.

    Returns:
        A set of image URL strings. It can hold fewer than
        ``max_links_to_fetch`` entries when the search runs out of results,
        and slightly more when a single click exposes several images.
    """
    from urllib.parse import quote_plus   # local import keeps this block self-contained

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page; encoding keeps spaces, '&', '#', ... inside the search
    # term from being interpreted as part of the URL structure
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    stalled_passes = 0        # consecutive passes without any new thumbnail
    max_stalled_passes = 3    # give up after this many, instead of looping forever

    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.rg_ic")
        number_results = len(thumbnail_results)

        if number_results > results_start:
            stalled_passes = 0
        else:
            stalled_passes += 1
            if stalled_passes >= max_stalled_passes:
                print(f"Found: {len(image_urls)} image links, no more results available.")
                break

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it;
            # a thumbnail that cannot be clicked is deliberately skipped (best effort)
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.irc_mi'):
                src = actual_image.get_attribute('src')
                if src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # reached only when the pass above did not hit the target
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(1)
            # the plural lookup returns an empty list when the button is absent,
            # whereas the singular one raises NoSuchElementException
            if wd.find_elements_by_css_selector(".ksb"):
                wd.execute_script("document.querySelector('.ksb').click();")

        # move the result startpoint further down
        results_start = number_results

    return image_urls

In [None]:
def persist_image(folder_path:str,url:str):
    """Download one image and store it as a JPEG file inside *folder_path*.

    The downloaded bytes are decoded with Pillow, converted to RGB and written
    with JPEG quality 85. The file name is the first 10 hex digits of the
    SHA-1 of the downloaded bytes plus ``.jpg``, so identical downloads map to
    the same file.

    Failures are reported on stdout and never raised: the function is used in
    a best-effort loop over many URLs.

    Args:
        folder_path: Existing directory the image is written to.
        url: Address of the image to download.

    Returns:
        None.
    """
    try:
        # a timeout keeps one unresponsive host from blocking the whole run
        image_content = requests.get(url, timeout=30).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # nothing was downloaded: stop here instead of falling through to the
        # save step, which would only fail on the undefined ``image_content``
        # and print a second, misleading error
        return

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_name = hashlib.sha1(image_content).hexdigest()[:10] + '.jpg'
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")

    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

In [None]:
def search_and_download(search_term, number_images):
    """Collect images for one search term and move them to the S3 bucket.

    Steps: create a local folder named after the search term, gather image
    URLs with a Chrome WebDriver, save every image locally, upload each saved
    file to the ``actors-images`` bucket under the same relative path, then
    delete the local folder.

    Args:
        search_term: Text to search for; also determines the folder name.
        number_images: Number of image links requested from the search.
    """
    # folder name: lower-cased term with every space replaced by an underscore
    target_folder = search_term.lower().replace(' ', '_') + '/'

    # create the folder on first use
    folder_missing = not os.path.exists(target_folder)
    if folder_missing:
        os.makedirs(target_folder)

    # the browser is only needed while the links are being gathered
    with webdriver.Chrome('/usr/bin/chromedriver') as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    # store each collected image on disk
    for image_url in res:
        persist_image(target_folder, image_url)

    # upload everything that was saved; the S3 key mirrors the local path
    bucket_name = 'actors-images'
    for saved_name in os.listdir(target_folder):
        relative_path = target_folder + saved_name
        s3.upload_file(relative_path, bucket_name, relative_path)

    # the local copies are no longer needed once uploaded
    shutil.rmtree(target_folder)

In [None]:
# process the actors in chunks of ``group_size``, one download process per actor
# NOTE(review): the hard-coded 20 (actors) and 10 (images per actor) look like
# trial-run limits; presumably the full run uses len(actors) and max_results - confirm.
for start in range(0, 20, group_size):
    # slice with group_size (not a literal 5) so the chunk width always matches
    # the loop step and no actor is skipped or processed twice
    sub_actors = actors[start:start + group_size]
    jobs = []

    # create a process for each search term in the sublist
    for actor in sub_actors:
        p = multiprocessing.Process(target=search_and_download, args=(actor, 10))
        jobs.append(p)
        p.start()

    # wait for each process to complete before starting the next chunk
    for proc in jobs:
        proc.join()