In [2]:
import os
from os import listdir,makedirs
from os.path import isfile,join
import shutil

import cv2
import numpy as np

import matplotlib.pyplot as plt

import fnmatch

In [3]:
def crop_center_resize(img, dim):
    """Crop the largest centered square out of an image and resize it.

    Only use this for refined good images.

    Args:
        img: Image array; ``img.shape[0]`` is read as the height and
            ``img.shape[1]`` as the width.
        dim: Target size handed to ``cv2.resize`` as its ``dsize`` argument.

    Returns:
        The array produced by ``cv2.resize`` (``cv2.INTER_AREA`` interpolation).

    Raises:
        ValueError: If ``img`` is None, e.g. because the image file could not
            be read by the caller.
    """
    if img is None:
        raise ValueError("img is None; the image could not be loaded")

    height, width = img.shape[0], img.shape[1]

    # Side length of the square crop: the shorter of the two image sides.
    side = min(width, height)

    # Derive the top-left corner from the leftover margin instead of from
    # half-widths around the midpoint. Halving an odd side with int() would
    # lose one row/column, and a 1-pixel side would give an empty crop.
    left = (width - side) // 2
    top = (height - side) // 2

    crop_img = img[top:top + side, left:left + side]

    return cv2.resize(crop_img, dim, interpolation=cv2.INTER_AREA)
    

In [4]:
def move_images(path, dest_path, pattern='*.jpg'):
    """Move images from general path to batch folder.

    Only regular files located directly in ``path`` are considered;
    sub-directories are left untouched.

    Args:
        path: Directory to take the images from.
        dest_path: Directory to move the images into. It is created
            (including missing parents) when it does not exist yet.
        pattern: ``fnmatch`` pattern a file name must match to be moved.
            Defaults to ``'*.jpg'``.

    Raises:
        FileExistsError: If ``dest_path`` exists but is not a directory.
        shutil.Error: Raised by ``shutil.move`` when a file with the same name
            already exists in ``dest_path``.
    """
    # Snapshot the candidates before touching the file system.
    files = [f for f in listdir(path) if isfile(join(path, f))]

    # exist_ok=True tolerates an existing directory but still raises when
    # dest_path is a regular file. A plain os.path.exists() check would let
    # that case through, and shutil.move would then rename every image onto
    # that one file, overwriting the previous image each time.
    os.makedirs(dest_path, exist_ok=True)

    for image in files:
        if fnmatch.fnmatch(image, pattern):
            shutil.move(join(path, image), dest_path)



In [5]:
def move_new_image(path_with_new_img, refined_img_path, dest_path):
    """Move newly downloaded images to the folder of refined images, e.g. 'soap_refined_batch'.

    A ``*.jpg`` file located directly in ``path_with_new_img`` counts as new
    when no ``*.jpg`` with the same file name exists anywhere below
    ``refined_img_path`` (sub-folders are searched recursively).

    Args:
        path_with_new_img: Directory holding the freshly downloaded images.
            A trailing path separator is optional.
        refined_img_path: Root directory of the already refined images.
        dest_path: Directory the new images are moved into. It is created
            when it does not exist yet.

    Raises:
        FileExistsError: If ``dest_path`` exists but is not a directory.
        shutil.Error: Raised by ``shutil.move`` when a file with the same name
            already exists in ``dest_path``.
    """
    # Candidate set (the larger one): jpg files directly in the download
    # folder. Restricting to '*.jpg' mirrors the refined set below; otherwise
    # unrelated files (e.g. '.DS_Store') would always count as "new" and be
    # moved as well.
    new_names = {
        name
        for name in listdir(path_with_new_img)
        if isfile(join(path_with_new_img, name)) and fnmatch.fnmatch(name, '*.jpg')
    }

    # Reference set (the smaller one): names of every jpg below the refined folder.
    refined_names = set()
    for _root, _dirs, files in os.walk(refined_img_path):
        refined_names.update(fnmatch.filter(files, '*.jpg'))

    # Without an existing destination directory shutil.move would rename each
    # image onto the path itself, so only the last image would survive.
    os.makedirs(dest_path, exist_ok=True)

    # Build source paths with join() so the download path works with or
    # without a trailing slash; sorted() keeps the move order deterministic.
    for name in sorted(new_names - refined_names):
        shutil.move(join(path_with_new_img, name), dest_path)
    

# Move newly downloaded images for one new batch

step 1. Run flickr_scraper.py to download the image by "keyword" searching.
You might see these errors when running this script:
a. requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
This is okay, it still downloads many images.

b. script does not terminate 
You then have to force-quit the script. When this occurs, most images have still been downloaded.

c. You might also see this:
7/10 error...
It's has been downloaded for this keyword

This is because I modified the original code to skip duplicates, as well as images that are already in the folder 'soap_refined_batch' (where I save the good images).

step 2. Run function 'move_new_image' to move the newly downloaded images to folder 'soap_refined_batch'.

In [7]:
###### example
# Adjust the three paths below to your own machine before running this cell.
keyword_folder = "soap_cube"

# Folder the scraper filled for this keyword.
download_path_by_keyword = (
    '/Users/chenxiliao/Dropbox/Projects/Search_Google_img/flickr_scraper/images/'
    + keyword_folder
    + '/'
)
print(download_path_by_keyword)

# Folder searched for images that are already known ...
existing_img_path = '/Users/chenxiliao/Dropbox/Projects/Search_Google_img/flickr_scraper/images/soap_refined_batch/'
# ... and the folder that receives the new ones (the same folder here).
destination_path = '/Users/chenxiliao/Dropbox/Projects/Search_Google_img/flickr_scraper/images/soap_refined_batch/'

move_new_image(
    path_with_new_img=download_path_by_keyword,
    refined_img_path=existing_img_path,
    dest_path=destination_path,
)

/Users/chenxiliao/Dropbox/Projects/Search_Google_img/flickr_scraper/images/soap_cube/


step 3. Clean images in folder 'soap_refined_batch'. Move unwanted images to folder 'bad_sample'

step 4. Move the filtered, newly downloaded images to a folder 'batch<n>' (e.g. 'batch3') inside folder 'soap_refined_batch'.

In [9]:
###### example
# Please change 'refined_img_batch_path' accordingly
new_batch = "batch3"
refined_img_batch_path = (
    '/Users/chenxiliao/Dropbox/Projects/Search_Google_img/flickr_scraper/images/soap_refined_batch/'
    + new_batch
)

# Moves the *.jpg files lying directly in existing_img_path (defined in the
# earlier example cell) into the batch folder.
move_images(path=existing_img_path, dest_path=refined_img_batch_path)

# Crop center and Resize images for one batch
This part is unfinished. Do not worry about cropping and resizing the images in the batch folders at this moment.

# Crop every *.jpg of one batch folder to a centered square, resize it to
# 512x512 and write the result under the same file name into dstpath.
path ='/Users/chenxiliao/Dropbox/Projects/Search_Google_img/flickr_scraper/images/soap_refined_batch/batch1' # Source Folder
files = [f for f in listdir(path) if isfile(join(path, f))]

dstpath = '/Users/chenxiliao/Dropbox/Projects/Search_Google_img/flickr_scraper/images/soap_resize/' # Destination Folder

try:
    makedirs(dstpath)
except FileExistsError:
    # Only "already exists" is expected here; any other failure (permissions,
    # missing drive, ...) must surface instead of being reported as this.
    print ("Directory already exist, images will be written in same folder")

for image in files:
    if not fnmatch.fnmatch(image, '*.jpg'):
        continue

    print(image)
    img = cv2.imread(os.path.join(path,image))
    if img is None:
        # cv2.imread signals an unreadable/corrupt file by returning None;
        # skip it so one bad download does not abort the whole batch.
        print("Skipping unreadable image: " + image)
        continue

    resized = crop_center_resize(img, (512,512))
    dstPath_img = join(dstpath,image)
    cv2.imwrite(dstPath_img,resized)