In [10]:
import header

from multiprocessing.pool import ThreadPool
from pathlib import Path

import pandas as pd
import numpy as np
import os
import requests
import tldextract
from v0.ai import embedding_model
from v0.models import Image


FILE = Path('data/gcc/validation.tsv')

if not FILE.is_file():
    raise FileNotFoundError(f'{FILE} not found')


In [14]:
images = pd.read_csv(FILE, sep='\t', header=None, names=['label', 'url'])
print(images.head())

# filter to ones we havent done yet
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
done_urls = list(Image.objects.all().values_list('url', flat=True))
print(len(done_urls))

images = images.drop(images[images.url.isin(done_urls)].index)
print(len(images))

                                               label  \
0      author : a life in photography -- in pictures   
1            an angler fishes river on a snowy day .   
2  photograph of the sign being repaired by brave...   
3  the player staring intently at a computer scre...   
4  globes : the green 3d person carrying in hands...   

                                                 url  
0  https://i.pinimg.com/736x/66/01/6c/66016c3ba27...  
1  http://www.standard.net/image/2015/02/04/800x_...  
2  http://indianapolis-photos.funcityfinder.com/f...  
3  http://www.abc.net.au/news/image/9066492-3x2-7...  
4  https://www.featurepics.com/StockImage/2009031...  
13810
2030


In [15]:
# embeddings
embeds = np.array(embedding_model.model.encode(images['label'].to_list())).astype(np.float32)
images['embed'] = embeds.tolist()

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

In [16]:

def process_image(i_image):
    i, image = i_image
    try:
        image_model = Image()
        image_model.url = image['url']
        image_model.description = image['label']
        image_model.embedding_all_mpnet_base_v2 = image['embed']
        image_model.domain = tldextract.extract(image['url']).domain
        image_model.provider = Image.providers.GCC_DATASET

        # validate the image is alive
        r = requests.head(image['url'], timeout=1)
        if r.status_code != 200:
            image_model.url_alive = False
        image_model.save()
        return
    except Exception as e:
        print(e)

# for i, image in images.iterrows(): 
#     process_image((i, image))
#     print(i)
pool = ThreadPool(processes=16)
pool.map(process_image, images.iterrows())

HTTPConnectionPool(host='killdevilsdenobx.com', port=80): Max retries exceeded with url: /wp-content/uploads/2017/03/bathroom-683x1024.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x312b230d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
HTTPConnectionPool(host='piquemagazine.uk', port=80): Max retries exceeded with url: /wp-content/uploads/2017/10/LPO-24-Feb-Albrecht-Menzel-%C2%AE-Anne-Hornemann-300dpi.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x312b234f0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
HTTPConnectionPool(host='salonseven.ru', port=80): Max retries exceeded with url: /wp-content/uploads/2015/04/Quote-by-Henry-David-Thoreau-about-our-planet-600x600.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x312b233d0>: Failed to establish a new connection: [Errno 8] nodename n

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

duplicate key value violates unique constraint "v0_image_url_key"
DETAIL:  Key (url)=(http://l7.alamy.com/zooms/efbfd6bc4ed84b47b63c498eb85f8723/the-argentinian-coach-jose-pekerman-c-shouts-instructions-on-the-sideline-d3nnpd.jpg) already exists.

duplicate key value violates unique constraint "v0_image_url_key"
DETAIL:  Key (url)=(http://i2.wp.com/img1.dzmind.com/images/barbie.jpeg?w=600) already exists.

duplicate key value violates unique constraint "v0_image_url_key"
DETAIL:  Key (url)=(http://l7.alamy.com/zooms/b5cf8ed0fed5408fb49bea3b1486f9ef/looking-down-on-a-city-street-and-bus-stop-in-sheffield-yorkshire-dxtja4.jpg) already exists.

duplicate key value violates unique constraint "v0_image_url_key"
DETAIL:  Key (url)=(http://www.thecatdish.com/wp-content/uploads/2015/10/PA200229-600x450.jpg) already exists.

duplicate key value violates unique constraint "v0_image_url_key"
DETAIL:  Key (url)=(https://www.ocregister.com/wp-content/uploads/migration/o6m/o6mkol-b88695764z.12016050