In [33]:
from concurrent.futures import ThreadPoolExecutor
import os
from PIL import Image
import requests
import threading

import cv2
import pandas as pd
from tqdm import tqdm

In [2]:
data_dir = '../data/'
img_dir = os.path.join(data_dir, 'google_captions/imgs/')
data_fp = os.path.join(data_dir, 'google_captions/gcc_train_data.tsv')
animal_names_fp = os.path.join(data_dir, 'animal_names.txt')

In [3]:
df = pd.read_csv(data_fp, delimiter='\t', header=None, names=['caption', 'url'])
df = df.sample(frac=1)
df.head()

Unnamed: 0,caption,url
3262086,interpretation of a dream in which you sawfood,https://drawinglics.com/view/660453/interpreta...
2374623,person dances on a cliff,https://communications.iu.edu/images/team/pers...
2390527,"cyclists pass , in the early evening light",http://l7.alamy.com/zooms/2ca14e7754b84e08b045...
807452,woman painting a heart on the window,http://l7.alamy.com/zooms/baeab7b2ad514fa3af88...
788135,if your dreams end with you releasing an arrow...,https://i.pinimg.com/736x/71/2c/c6/712cc625230...


In [4]:
with open(animal_names_fp, 'r') as f:
    animal_names = f.readlines()
animal_names = set([an.strip().lower() for an in animal_names if ' ' not in an.strip()])
print(len(animal_names), 'animals')

355 animals


In [5]:
def contains_word(sentence, target_words):
    for token in sentence.split():
        if token in target_words:
            return True
    return False

In [6]:
animal_entries = df.apply(lambda row: contains_word(row['caption'], animal_names), axis=1)
animal_df = df[animal_entries]
print(len(animal_df))

In [37]:
def get_img_from_url(url):
    try:
        return Image.open(requests.get(url, stream=True, timeout=8).raw)
    except Exception as e:
        print('ERROR!' + e)
        print('Couldn\'t get img from url "{}" because of `{}`, skipping.'.format(url, type(e)))

def format_img(img, target_size=256):
    width, height = img.size
    smaller_dim = min(width, height)
    scale_factor = target_size / smaller_dim
    width = int(scale_factor * width)
    height = int(scale_factor * height)
    img = img.resize((width, height))

    half_size = target_size // 2
    left = (width // 2) - half_size
    right = (width // 2) + half_size
    top = (height // 2) - half_size
    bottom = (height // 2) + half_size
    img = img.crop((left, top, right, bottom))
    
    return img

def process_and_save_img(url, name):
    img = get_img_from_url(url)
    img = format_img(img)
    
    save_path = os.path.join(img_dir, name)
    img.save('{}.png'.format(save_path), 'PNG')

In [38]:
def download_dataset(df, n_threads=16):
    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        for i, row in tqdm(df.iterrows(), total=df.shape[0]):
            name = str(row.name)
            url = row['url']
            executor.submit(process_and_save_img, url, name)

In [None]:
download_dataset(animal_df)

100%|████████████████████████████████████████████████████████████████████████| 203578/203578 [00:23<00:00, 8795.28it/s]
