#### This function downloads image files from BING and stores  them into a directory.
#### The form of the call to the function is
#### download(output_dir, query, snum, new_ext, resize, limit=100, verbose=0, timeout=10)  where
#### output_dir is the full path to the directory where the downloaded files will be stored
#### query is a string defining the subject of the files to download - for example query='bird images'
#### resize is either the string 'same' (images keep their original size) or the target size
#### passed to cv2.resize, for example resize=(224,224)
#### limit is an integer specifying the maximum number of files to download
#### verbose is an integer. If verbose=0 details of the download process are not printed out
#### timeout is an integer specifying the max time in seconds to wait for a query response
#### When downloading images from BING, the file names are typically long and often unconventional
#### and can be difficult to handle if you process the downloaded images
#### This function includes the ability to rename the downloaded images in a numerical sequence
#### starting with the value of the integer snum. If snum=0 no renaming of the downloaded
#### files takes place. Files are renamed in numerical sequence with zero padding. The number of
#### leading zeros in the name is a function of the number of files stored in the output_dir.
#### For example, when numbering starts at 1: with fewer than 10 files no leading zeros are used,
#### with fewer than 100 files single-digit numbers get one leading zero, and with fewer than
#### 1000 files numbers are padded to three digits. Zero padding keeps the files in numerical
#### order when they are listed and processed by typical python functions.
#### If snum is not equal to 0 the files are renumbered and you also have the option to modify
#### the image format to that specified by the string new_ext. If new_ext='same' the original file
#### formats are preserved. If new_ext='jpg', for example, all downloaded images are converted
#### to the jpg format.
####  Note: see the import cell below to determine what modules need to be installed

In [1]:
import os
import shutil
from pathlib import Path
import os
import sys
import urllib.request
import urllib
import imghdr
import posixpath
import re
from tqdm import tqdm
import cv2

#### Function below is the main function

In [2]:
def download(output_dir,query,snum, new_ext,resize, limit=100, verbose=0, timeout=10):
    """Download images for a Bing query into output_dir, then optionally post-process them.

    Parameters:
        output_dir: directory that receives the downloaded files; it is
            validated (and possibly created or emptied) by check_dir.
        query: search text sent to Bing.
        snum: first number of the numeric renaming sequence; 0 disables renumbering.
        new_ext: target image extension handed to rename, or 'same'.
        resize: 'same' to keep image sizes, otherwise the size handed to rename.
        limit: maximum number of images to download.
        verbose: 0 for a quiet run, any other value to print progress.
        timeout: seconds to wait for a response.

    Returns:
        False when the user aborts at the directory check, True otherwise.
    """
    adult = 'off'
    # check_dir may prompt the user; a False result means "quit"
    if check_dir(output_dir) == False:
        return False
    bing = Bing(query, limit, output_dir, adult, timeout) # initializes the bing class
    # record the caller's verbosity choice on the downloader instance
    bing.verbose = verbose
    bing.run()  # initiates the downloading of images
    # post-processing is only requested when renumbering or resizing is asked for
    if snum != 0 or resize != 'same':
        rename(output_dir, snum, new_ext, resize) # renames the downloaded files
    return True
        

#### This function performs checks on the input parameters
####  it looks at the output_dir to determine if it already contains files
#### If files are present the user is given the option to delete the files,
#### move the files from the output_dir to a sub directory called saved files
#### or to continue and append new downloaded files into the output_dir

In [3]:
def check_dir(dir):
    """Validate the output directory and let the user decide what to do with existing files.

    If dir does not exist the user may create it (any answer except Q/q) or quit.
    If dir already holds files the user may continue (C), delete them (D),
    move them into the sub directory 'saved files' (M) or quit (Q).
    Sub directories of dir are never touched.

    Returns:
        True when the download may proceed, False when the user chose to quit.
    """
    if not os.path.isdir(dir):
        msg='\nThe directory you specified {0} does not exist.  Enter C to create it or Q to Quit'.format(dir)
        ans=input(msg)
        if ans in ['Q', 'q']:
            return False
        os.makedirs(dir)
        return True

    # the directory exists - collect the plain files it contains
    f_list=[d for d in os.listdir(dir) if os.path.isfile(os.path.join(dir, d))]
    file_count=len(f_list)
    if file_count==0:
        return True   # there are no files in the directory

    msg='There are {0} files in {1}. Enter C to continue, M to move them, or D to delete them'
    ans=input(msg.format(file_count, dir))
    # keep asking until one of the recognised answers is given
    while ans not in ['C', 'c', 'M', 'm', 'D', 'd', 'Q', 'q']:
        msg=' your response {0} was neither a C, a M or a D, please enter a valid response'.format(ans)
        ans=input(msg)

    if ans in ['Q', 'q']:
        return False
    if ans in ['C', 'c']:
        return True
    if ans in ['D', 'd']:
        for f in f_list:
            dir_file_path=os.path.join(dir,f)
            if os.path.isfile(dir_file_path):
                os.remove(dir_file_path)
        msg='{0} files were removed from {1}'.format(file_count, dir)
        print_in_color(msg, (0,255,255),(0,127,127))
        return True

    # the only remaining answer is M: move the files into the storage sub directory
    store_dir_path=os.path.join (dir, 'saved files')
    if not os.path.isdir(store_dir_path):
        print('the storage directory you specified ', store_dir_path, ' does not exist, creating it for you')
        os.mkdir(store_dir_path)
    for f in f_list:
        dir_file_path=os.path.join(dir,f)
        shutil.copy(dir_file_path, store_dir_path)
        if os.path.isfile(dir_file_path):
            os.remove(dir_file_path)
    msg='{0} files were removed from {1} and stored in {2}'.format(file_count, dir, store_dir_path)
    print_in_color(msg, (0,255,255),(0,127,127))
    return True

#### This function enables printing a message to the terminal with a foreground color
#### specified by the RGB tuple fore_tupple and a background color specified by the
#### RGB tuple back_tupple

In [4]:
def print_in_color(txt_msg,fore_tupple,back_tupple,):
    """Print txt_msg using 24-bit ANSI colour escape sequences.

    Parameters:
        txt_msg: the text to print; it is emitted verbatim, so it may
            contain any characters including '{' and '}'.
        fore_tupple: foreground colour as an (r, g, b) tuple.
        back_tupple: background colour as an (r, g, b) tuple.

    Returns:
        None
    """
    rf,gf,bf=fore_tupple
    rb,gb,bb=back_tupple
    # 38;2;r;g;b selects the foreground colour, 48;2;r;g;b the background colour
    mat='\33[38;2;{0};{1};{2};48;2;{3};{4};{5}m'.format(rf, gf, bf, rb, gb, bb)
    # concatenate rather than format so braces inside txt_msg are not interpreted
    print(mat + txt_msg, flush=True)
    print('\33[0m', flush=True) # resets the terminal colours to their defaults
    return

#### This function renames the files in the output_dir in a numerically sequential fashion starting
#### with the value of the integer snum. It will also change the image format based on the
#### value of the string new_ext. For example, if new_ext='jpg' all files will be converted to
#### jpg images. If new_ext='same' the original extensions are unchanged

In [5]:
def rename (source_dir, snum, new_ext, resize):
    """Renumber, convert and/or resize every image file found directly in source_dir.

    Parameters:
        source_dir: directory whose files (not sub directories) are processed.
        snum: first number of the naming sequence; 0 keeps the original file names.
        new_ext: extension to convert to (for example 'jpg'), or 'same' to keep
            each file's own extension. A leading '.' is tolerated.
        resize: 'same' to keep the image size, otherwise the size passed to cv2.resize.

    Behaviour:
        * Files that cv2.imread cannot decode are reported and deleted.
        * Files that cannot be resized or written are reported and left unchanged.
        * Every processed image is first written under a temporary name and only
          moved to its final name after all files were handled, so a new name can
          never overwrite a file that is still waiting to be processed.
        * Numbers are zero padded to a common width so the names sort numerically.
        * If two files end up with the same final name the later one wins.

    Returns:
        True
    """
    if new_ext != 'same':
        new_ext = new_ext.lower().lstrip('.')
    # sorted so the numbering is reproducible; os.listdir order is arbitrary
    f_list = sorted(f for f in os.listdir(source_dir)
                    if not os.path.isdir(os.path.join(source_dir, f)))
    fc = len(f_list)  # number of files to process
    # pad to the width of the largest number that can be handed out
    width = len(str(max(fc, snum + fc - 1, 1)))

    staged = []   # (temporary path, final path) of every successfully written image
    tmp_id = 0
    for f in tqdm(f_list):
        f_path = os.path.join(source_dir, f)  # full path to the file
        stem, ext = os.path.splitext(f)       # ext includes the period (.)
        if ext.lower() == '.jfif':  # jfif files are the same as jpg files
            ext = '.jpg'
        if snum != 0:
            # number only the images that are kept so the sequence has no gaps
            stem = str(snum + len(staged)).zfill(width)
        if new_ext != 'same':
            ext = '.' + new_ext
        final_path = os.path.join(source_dir, stem + ext)

        try:
            img = cv2.imread(f_path)
        except Exception:
            img = None
        if img is None:
            print ('image ' ,f_path, ' is an invalid image and will be deleted')
            os.remove(f_path)
            continue

        # choose a temporary name that is not used by any file in the directory
        while True:
            temp_path = os.path.join(source_dir, '__rename_tmp_{0}{1}'.format(tmp_id, ext))
            tmp_id += 1
            if not os.path.exists(temp_path):
                break

        try:
            if resize != 'same':
                img = cv2.resize(img, resize)
            written = cv2.imwrite(temp_path, img)
        except Exception as err:
            written = False
            print ('image ' ,f_path, ' could not be processed: ', err)
        if not written:
            print ('image ' ,f_path, ' was left unchanged')
            if os.path.exists(temp_path):
                os.remove(temp_path)  # discard a partially written file
            continue

        # the converted copy is safely on disk, the source is no longer needed
        os.remove(f_path)
        staged.append((temp_path, final_path))

    # second pass: all sources are gone, move the results to their final names
    for temp_path, final_path in staged:
        os.replace(temp_path, final_path)

    done = len(staged)
    if new_ext=='same':
        msg=' {0} files were renamed and saved with the original extensions'.format(done)
    else:
        msg='{0} files were renamed and saved with new extension {1}'.format(done, new_ext)
    print (msg)
    return True


#### Class definition 

In [6]:
class Bing:
    """Downloads the images returned by a Bing image search into a directory.

    Attributes:
        download_count: number of images saved so far.
        page_counter: value sent as the 'first' parameter of the search request;
            incremented after every request.
        verbose: 0 for a quiet run, any other value prints progress messages.
    """

    # file extensions (lower case, without the period) that are accepted
    _IMAGE_TYPES = ("jpe", "jpeg", "jfif", "tiff", "gif", "bmp", "png", "jpg")

    def __init__(self, query, limit, output_dir, adult, timeout, filters='', verbose=None):
        """Store the search settings.

        Parameters:
            query: search text.
            limit: maximum number of images to download (int).
            output_dir: directory the images are written to.
            adult: value of the 'adlt' request parameter.
            timeout: seconds to wait for each HTTP response (int).
            filters: value of the 'qft' request parameter.
            verbose: 0 for quiet, non-zero for progress output. When omitted, a
                module-level variable named 'verbose' is used if one exists
                (the behaviour older callers relied on), otherwise 0.
        """
        self.download_count = 0
        self.query = query
        self.output_dir = output_dir
        self.adult = adult
        self.filters = filters

        assert type(limit) == int, "limit must be integer"
        self.limit = limit
        assert type(timeout) == int, "timeout must be integer"
        self.timeout = timeout

        if verbose is None:
            verbose = globals().get('verbose', 0)
        self.verbose = verbose

        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'}
        self.page_counter = 0

    def save_image(self, link, file_path, out_dir): # saves images to the output_dir
        """Fetch link and write it to file_path.

        Raises RuntimeError when out_dir already holds a file with the same
        name (checked before anything is downloaded) or when the downloaded
        data is not recognised as an image. Network errors propagate unchanged.
        """
        filename=os.path.basename(file_path)
        # reject duplicates regardless of the verbosity setting
        if os.path.exists(os.path.join(out_dir, filename)):
            if self.verbose !=0:
                print (' file ', filename, ' is a duplicate')
            raise RuntimeError('duplicate file name: {0}'.format(filename))
        request = urllib.request.Request(link, None, self.headers)
        with urllib.request.urlopen(request, timeout=self.timeout) as response:
            image = response.read()
        if not imghdr.what(None, image) :
            if self.verbose !=0:
                print('[Error]Invalid image, not saving {}\n'.format(link))
            raise RuntimeError('not an image: {0}'.format(link))
        with open(file_path, 'wb') as f:
            f.write(image)

    def download_image(self, link):
        """Try to save the image behind link; download_count grows only on success."""
        self.download_count += 1
        try:
            # derive the file name and type from the path part of the link
            path = urllib.parse.urlsplit(link).path
            filename = posixpath.basename(path).split('?')[0]
            file_type = filename.split(".")[-1].lower()
            if file_type not in self._IMAGE_TYPES:
                raise ValueError('unsupported file type: {0}'.format(file_type))
            # Download the image
            if self.verbose != 0:
                print('Downloading Image # ', self.download_count)
            dest=os.path.join(self.output_dir, filename)
            self.save_image(link, dest, self.output_dir)
            if self.verbose != 0:
                print( 'Image ', self.download_count, ' saved ', filename, flush=True)
        except Exception:
            # best effort: any failure just means this link is skipped
            if self.verbose !=0:
                print('image file was a duplicate or failed to get download image for Image ', self.download_count, ' Trying next link')
            self.download_count -= 1

    def run(self):
        """Query Bing repeatedly until limit images are saved or no new links are returned."""
        seen = set()  # links already attempted, so overlapping result pages are not retried
        while self.download_count < self.limit:
            request_url = 'https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(self.query) \
                          + '&first=' + str(self.page_counter) + '&count=' + str(self.limit) \
                          + '&adlt=' + self.adult + '&qft=' + self.filters
            request = urllib.request.Request(request_url, None, headers=self.headers)
            with urllib.request.urlopen(request, timeout=self.timeout) as response:
                html = response.read().decode('utf8')
            links = re.findall('murl&quot;:&quot;(.*?)&quot;', html)

            found_new = False
            for link in links:
                if self.download_count >= self.limit:
                    break
                if link in seen:
                    continue
                seen.add(link)
                found_new = True
                self.download_image(link)
            if not found_new:
                # nothing new came back; stop instead of requesting forever
                break

            self.page_counter += 1
        print ('***** Completed downloading of ', self.download_count, 'files ******', flush=True)
            


#### Example of use

In [7]:
# Sample run: fetch 20 salad fork pictures, renumber them starting at 1,
# convert them to jpg and resize them to 224 x 224.
query='salad fork images'
dest_dir=r'C:\Temp\household_goods\storage'
limit, verbose, start_num = 20, 1, 1   # verbose=1 prints the download progress
new_ext='jpg'
image_size=(224,224)
status=download(
    output_dir=dest_dir,
    query=query,
    snum=start_num,
    new_ext=new_ext,
    resize=image_size,
    limit=limit,
    verbose=verbose,
    timeout=10,
)


The directory you specified C:\Temp\household_goods\storage does not exist.  Enter C to create it or Q to Quitc
Downloading Image #  1
Image  1  saved  512541.jpg
Downloading Image #  2
Image  2  saved  melinda-flatware-stainless-steel-salad-fork-12-pack.jpg
Downloading Image #  3
Image  3  saved  Dinner-Fork-Stainless.jpg
Downloading Image #  4
Image  4  saved  511917.jpg
Downloading Image #  5
Image  5  saved  512542.jpg
Downloading Image #  6
Image  6  saved  artaste-salad-fork-set-of-12-salad-fork.jpg
Downloading Image #  7
Image  7  saved  stainless-salad-fork.jpg
Downloading Image #  8
Image  8  saved  Laguna-Salad-Fork1.jpg
Downloading Image #  9
Image  9  saved  606623.jpg
Downloading Image #  10
Image  10  saved  Winco-0005-06-Dots-Salad-Forks-23084_xlarge.jpg
Downloading Image #  11
Image  11  saved  370-rg1206.jpg
Downloading Image #  12
Image  12  saved  Winco-0034-06-Stanford--Salad-Forks-23053_xlarge.jpg
Downloading Image #  13
image file was a duplicate or failed to get

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 55.42it/s]

20 files were renamed and saved with new extension jpg



