In [None]:
# Mount Google Drive at /content/gdrive (Colab only); the cells below read
# their inputs from, and write their results to, paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pickle
from keras.models import load_model

# Read photo ids: a dict indexed by keyword; each entry maps a photo id to a
# dict of download URLs keyed by a pair of size strings.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load this file from a Drive folder you trust.
with open('/content/gdrive/My Drive/images/keyword_photoid_urls.pkl', 'rb') as urls_file:
    keyword_photoid_urls = pickle.load(urls_file)

# Load classification model
model = load_model('/content/gdrive/My Drive/images/classification_model_religion.h5')

In [None]:
import numpy as np

def resize_image(image, resolutions, uniform_background=True):
    """
    Pad an image to a square with white pixels and resize it to each wanted resolution.

    The shorter side is padded on both ends (when the missing size is odd, the
    extra row/column goes after the image), so the same scale factor is
    applied along the x and y axes when resizing.

    Parameters
    ----------
    image : np.ndarray
        Image of shape (h, w) or (h, w, channels).
    resolutions : iterable of int
        Side length, in pixels, of each square output image.
    uniform_background : bool
        If True, every pixel of a resized image whose channels are all 255 or
        all 0 is set to 255.

    Returns
    -------
    list of np.ndarray
        One resized image per entry of `resolutions`, in the same order.
        The input array is not modified.
    """
    # `cv2` is looked up in the notebook globals when the function is called;
    # it is imported in the cell that runs the download/prediction loop.

    h, w = image.shape[:2]
    # () for a grayscale image, (channels,) otherwise: lets the padding follow
    # the input instead of assuming exactly 3 channels.
    channel_shape = tuple(image.shape[2:])

    # Transform into a square image
    if h > w:
        nb_before = (h - w) // 2
        nb_after = (h - w) - nb_before
        before = np.full((h, nb_before) + channel_shape, 255, dtype=np.uint8)
        after = np.full((h, nb_after) + channel_shape, 255, dtype=np.uint8)
        image_t = np.concatenate((before, image, after), axis=1).astype(np.uint8)
    elif h < w:
        nb_before = (w - h) // 2
        nb_after = (w - h) - nb_before
        before = np.full((nb_before, w) + channel_shape, 255, dtype=np.uint8)
        after = np.full((nb_after, w) + channel_shape, 255, dtype=np.uint8)
        image_t = np.concatenate((before, image, after), axis=0).astype(np.uint8)
    else:
        image_t = image

    # Resize image
    resized_images = []
    for resolution in resolutions:
        image_resized = cv2.resize(image_t, (resolution, resolution))

        # Set uniform white background (255, 255, 255)
        # The background may be black or white; we set white everywhere to be consistent
        if uniform_background:
            is_white = image_resized == 255
            is_black = image_resized == 0
            if image_resized.ndim == 3:
                # A pixel counts only when all of its channels match
                is_white = is_white.all(axis=2)
                is_black = is_black.all(axis=2)
            # A 2-D boolean mask selects whole pixels, whatever the channel count
            image_resized[is_white | is_black] = 255

        resized_images.append(image_resized)

    return resized_images

In [None]:
import urllib

def download_image(photo_id_urls, photo_id, path):
    """
    Download one photo to `path`, choosing among the sizes available for it.

    Parameters
    ----------
    photo_id_urls : dict
        Maps a photo id to a dict of URLs keyed by a pair of size strings.
        The key ('', '') is ignored.
    photo_id : hashable
        Id of the photo to download.
    path : str
        Local file the image is written to.

    Returns
    -------
    bool
        True when the file was retrieved; False when the id is unknown, no
        usable size exists, or the download failed.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded
    import urllib.request

    if photo_id not in photo_id_urls:
        print(f'{photo_id} not in dict.')
        return False

    urls = photo_id_urls[photo_id]
    if len(urls) == 0:
        return False

    # Keep the URL next to its parsed size so that no key has to be rebuilt
    # from integers afterwards (str(int(k)) is not guaranteed to equal k).
    candidates = sorted(
        ((int(key[0]), int(key[1]), url) for key, url in urls.items() if key != ('', '')),
        key=lambda c: c[0] * c[1],
    )
    # Discard very large versions (1E7 pixels or more)
    candidates = [c for c in candidates if c[0] * c[1] < 1E7]
    if len(candidates) == 0:
        print(f'No usable size for {photo_id}.')
        return False

    # Select the size with lower resolution but higher than 1E6 pixels;
    # fall back to the largest available one when none reaches 1E6 pixels.
    large_enough = [c for c in candidates if c[0] * c[1] >= 1E6]
    chosen = large_enough[0] if large_enough else candidates[-1]

    try:
        urllib.request.urlretrieve(chosen[2], path)
    except Exception as err:
        # Best effort: report and continue, but let KeyboardInterrupt through
        print('Download error', photo_id, err)
        return False

    return True

In [None]:
import os
from tqdm import tqdm
import cv2

# Download every photo of one keyword, classify the images in batches and
# save the scores to Drive.
root = 'data'
os.makedirs(root, exist_ok=True)

keyword = 'protestant_temple'

photoid_urls = keyword_photoid_urls[keyword]
tot = 10000  # number of photo ids handled per run
n = 0        # index of the slice of `tot` photo ids handled by this run
photoids = sorted(photoid_urls.keys())[tot * n : tot * (n+1)]
batch_size = 16

# Scores are indexed by the local image path (not by the bare photo id)
photoid_preds = {}
batch_photoids, batch_images = [], []


def _predict_batch(paths, images, preds):
    """Classify `images` and store one score per entry of `paths` in `preds`."""
    if len(images) == 0:
        return
    # Pixel values are scaled to [0, 1] before prediction.
    # NOTE(review): reshape(-1) assumes one output value per image -- confirm.
    predictions = model.predict(np.array(images) / 255.).reshape(-1)
    for id_, pred in zip(paths, predictions):
        preds[id_] = pred


for e, photo_id in enumerate(tqdm(photoids)):

    # Build image paths
    f_path = f'{e:06d}_{photo_id}.jpg'
    path = f'{root}/{f_path}'
    res = download_image(photoid_urls, photo_id, path)

    if res:
        img = cv2.imread(path)
        # cv2.imread returns None when the file cannot be decoded
        if img is not None:
            # Resize
            img_256 = resize_image(img, [256], uniform_background=False)[0]
            batch_photoids.append(path)
            batch_images.append(img_256)

    if len(batch_images) == batch_size:
        # Prediction
        _predict_batch(batch_photoids, batch_images, photoid_preds)
        batch_photoids, batch_images = [], []

# Predict the last, incomplete batch: without this step the images left over
# after the final full batch would never receive a score.
_predict_batch(batch_photoids, batch_images, photoid_preds)
batch_photoids, batch_images = [], []

with open(f'/content/gdrive/My Drive/images/photoid_preds_{keyword}_{n}.pkl', 'wb') as preds_file:
    pickle.dump(photoid_preds, preds_file)

  0%|          | 0/4601 [00:00<?, ?it/s]

0


 22%|██▏       | 1000/4601 [13:43<1:04:30,  1.07s/it]

1000


 28%|██▊       | 1307/4601 [17:58<34:52,  1.57it/s]

Download error 26609162543


 30%|███       | 1399/4601 [19:15<33:10,  1.61it/s]

Download error 27214737715


 43%|████▎     | 2000/4601 [27:51<37:32,  1.15it/s]

2000


 59%|█████▉    | 2717/4601 [37:51<21:41,  1.45it/s]

Download error 42551630165


 65%|██████▌   | 3000/4601 [41:43<17:05,  1.56it/s]

3000


 87%|████████▋ | 4000/4601 [54:59<07:17,  1.37it/s]

4000


100%|██████████| 4601/4601 [1:02:42<00:00,  1.22it/s]


In [None]:
# Number of images that received a prediction; entries are only added to
# photoid_preds by the batch prediction step of the loop above.
print(len(photoid_preds))
# Leftover manual check: re-try the download of the last photo of the loop.
#download_image(photoid_urls, photo_id, path)

4576
