In [None]:
import os
import time
import pickle
import re 
import json
import requests

import numpy as np
import cv2
from matplotlib import pyplot as plt
from PIL import Image
from tqdm import tqdm

### Download missing images from json files and resize all images

In [5]:
data_dir = 'dataset/cogent/data'

all_img_fnames = []

for dirname in os.listdir(data_dir):
    if dirname.endswith('.zip'):
        continue
        
    if 'images' not in dirname:
        continue
        
    dir_path = os.path.join(data_dir, dirname)
    print(dirname, dir_path)

    for fname in os.listdir(dir_path):
        print(fname)
        break
    
    fnames = os.listdir(dir_path)
    print('Total:',len(fnames))
    all_img_fnames.extend(fnames)
    print()
print('Total fnames',len(all_img_fnames))
print('Total unique:',len(set(all_img_fnames)))
all_img_fnames = set(all_img_fnames)

images-archief dataset/cogent/data/images-archief
8aed773f831cf6c92929c1efdb43da89-transcode-MA_ROM_XVI_AF_03556_R.jpg
Total: 1898

images-alijn dataset/cogent/data/images-alijn
35371a245bbb4c6c0bd43f31f812630e-transcode-2004-247-550.jpg
Total: 9682

images-downloaded dataset/cogent/data/images-downloaded
8c506ddc81b9174f6f89d67c8fade9ce-transcode-OA_535_372_015_B_TE.jpg
Total: 7505

images-design dataset/cogent/data/images-design
5399315ee2427f9da3853672539735d8-transcode-0844.jpg
Total: 2186

images-industrie dataset/cogent/data/images-industrie
d0e8fe9b5ab89d454546e082c77699e6-transcode-V24752.jpg
Total: 8431

images-stam dataset/cogent/data/images-stam
c3df5b8acdf4d3cc8f7d24a765e2aefa-transcode-N_00024_051-114.jpg
Total: 9390

Total fnames 39092
Total unique: 39092


In [6]:
data_dir = 'dataset/cogent/data'

all_fnames = []
all_image_locations = []
no_transcode = 0

no_media = []
for dirname in os.listdir(data_dir):
    if dirname.endswith('.zip'):
        continue
        
    if 'collection' not in dirname:
        continue
        
    dir_path = os.path.join(data_dir, dirname)
    print(f'Current collection: {dirname}')
    fnames = os.listdir(dir_path)
    print('Total:',len(fnames))
    all_fnames.extend(fnames)
    for fname in tqdm(os.listdir(dir_path)):
        
        json_path = os.path.join(dir_path,fname)
        with open(json_path) as f_:
            data = json.load(f_)
            loc = None
            if 'primary_transcode_location' in data:
                loc = data['primary_transcode_location']
    
            if loc is None and 'primary_mediafile_location' in data:
                loc = data['primary_mediafile_location']
                no_transcode += 1
            
            if loc is not None:
                all_image_locations.append(loc)
            
            else:
                no_media.append(json_path)
                
            
   
    print()
print('Total fnames',len(all_fnames))
print('Total unique:',len(set(all_fnames)))
print('Total img locations:', len(all_image_locations))
print(f'Total without transcode: {no_transcode}')
print(f'Total without any media: {len(no_media)}')

Current collection: collection-stam
Total: 9396


100%|██████████| 9396/9396 [00:10<00:00, 924.35it/s] 



Current collection: collection-alijn
Total: 10000


100%|██████████| 10000/10000 [00:11<00:00, 842.62it/s]



Current collection: collection-archief
Total: 9402


100%|██████████| 9402/9402 [00:10<00:00, 894.63it/s] 



Current collection: collection-industrie
Total: 8431


100%|██████████| 8431/8431 [00:10<00:00, 805.86it/s] 



Current collection: collection-design
Total: 2186


100%|██████████| 2186/2186 [00:02<00:00, 920.41it/s] 


Total fnames 39415
Total unique: 39415
Total img locations: 39409
Total without transcode: 6
Total without any media: 6





In [9]:
not_collected = 0
not_jpg = 0
extensions = {}
for f in all_image_locations:
    img_fname = f.rsplit('/',1)[-1]
    ext = img_fname.lower().rsplit('.',1)[-1]
    if img_fname == 'v1None':
        continue
    
    
    if img_fname not in all_img_fnames and ext=='jpg':
        not_collected += 1
    
    if ext in extensions:
        extensions[ext] += 1
    else:
        extensions[ext] = 1
    
    
print('Total not collected:',not_collected)
print('Extensions:')
for k,v in sorted(extensions.items(), key=lambda x:-x[1]):
    print(k,v)
    
    

Total not collected: 0
Extensions:
jpg 39091
mp3 147
mp4 122


#### Download missing images to new directory images-downloaded

In [8]:
download_dir = os.path.join(data_dir, 'images-downloaded')

if not os.path.exists(download_dir):
    os.mkdir(download_dir)

downloaded_fnames = set(os.listdir(download_dir))

for img_url in tqdm(all_image_locations):
    img_fname = img_url.rsplit('/',1)[-1]
    ext = img_fname.lower().rsplit('.',1)[-1]
    
    if img_fname not in all_img_fnames and ext == 'jpg':
        download_path = os.path.join(download_dir, img_fname)
        if os.path.exists(download_path):
            continue
        img_data = requests.get(img_url).content
        with open(download_path, 'wb') as handler:
            handler.write(img_data)
        

100%|██████████| 39409/39409 [00:00<00:00, 453586.65it/s]


In [5]:
def resize(img, max_dim=2048):
    
    h,w = img.shape[:2]
    
    if h >= w and h > max_dim:
        new_h = max_dim
        new_w = int(w/h * new_h)
    elif w > h and w > max_dim:
        new_w = max_dim
        new_h = int(h/w*new_w)
        
    else:
        return img
    
    return cv2.resize(img, (new_w,new_h))
    
def read_resize(tup):
    img_path, output_path = tup
    img = resize(cv2.imread(img_path), max_dim=max_dim)
    cv2.imwrite(output_path, img)
     


#### set cv2 threads to 1 for multiprocessing

In [6]:
print(cv2.getNumThreads())
cv2.setNumThreads(1)
print(cv2.getNumThreads())

1
1


#### Image filenames are unique, below code checks the images-resized folder with all other image folders and resizes the images accordingly

Change n_CPUs to the number of cpus available or change to multiprocessing.dummy for multithreaded (untested)

In [15]:
from multiprocessing.pool import Pool
#from multiprocessing.dummy import Pool
data_dir = 'dataset/cogent'
collections_dir = os.path.join(data_dir, 'data')

img_dir = os.path.join(data_dir, 'images_resized')

all_img_fnames = []
max_dim = 2048


n_CPUs = 4
 
if __name__ == '__main__':
    
    for dirname in os.listdir(collections_dir):
        if dirname.endswith('.zip'):
            continue

        if 'images' not in dirname:
            continue

        dir_path = os.path.join(collections_dir, dirname)
        print(dirname, dir_path)

        input_filenames = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if not os.path.exists(os.path.join(img_dir, f))]
        output_filenames = [os.path.join(img_dir, f) for f in os.listdir(dir_path) if not os.path.exists(os.path.join(img_dir, f))]
        print(len(input_filenames), len(output_filenames))
        if len(input_filenames) == 0:
            continue
        inputs = zip(input_filenames, output_filenames)
        # create and configure the process pool
        with Pool(n_CPUs) as pool:
            print(pool)
            pool.imap_unordered(read_resize, inputs, chunksize=32)
            pool.close()
            # wait for all issued task to complete
            pool.join()

        fnames = os.listdir(dir_path)
        print('Total:',len(fnames))
        all_img_fnames.extend(fnames)
        print()
print('Total fnames',len(all_img_fnames))
print('Total unique:',len(set(all_img_fnames)))
all_img_fnames = set(all_img_fnames)


images-archief dataset/cogent/data/images-archief
0 0
images-alijn dataset/cogent/data/images-alijn
0 0
images-downloaded dataset/cogent/data/images-downloaded
0 0
images-design dataset/cogent/data/images-design
0 0
images-industrie dataset/cogent/data/images-industrie
0 0
images-stam dataset/cogent/data/images-stam
0 0
Total fnames 0
Total unique: 0
