# Algorithm for downloading images from iDigBio
###  (all images are verified by experts)

All Diptera occurrences from iDigBio are obtained and cleaned to preserve only those that have an associated image of the frontal view (head) of the specimen.

### NOTE: to download the same images as in our study use this csv file 

    D1_list_of_filtered_images.csv
    
### Here is a walkthrough of how we acquired and filtered the images

**Input**: multimedia.csv - a list of records from iDigBio obtained with query keywords ("hasImage":"true" and "order":"diptera")

**Outputs**: images of frontal habitus sorted by family names

Procedure: 

	Step 1.
		collect:
			- images with keywords
				- 'dorsal'
				- 'habitus_dor', 'Habitus_dor'
				- '_D.', "_had"
			- images from institutions that provide mainly dorsal view 
				- 'Denver Museum of Nature & Science'
	            - 'University of Tennessee at Chattanooga (UTC-UTCI)'
	            - 'United States National Museum, Entomology Collections (USNM-USNMENT)'
		skip: 
			- images with keywords: 
				- "lateral", "frontal", "ventral", 'anterior'
				- "head", 'antenna', "labels", 
				- 'mesosoma', "genitalia"
				- "_L", "_F", "_V", 
				- 'web', 'habitus_lat', 'Habitus_lat' 
				- "hed", "hef", "hal", "hed" (head images) 
            - images from institutions that provided fossil images
		check:
			- among the records that were neither skipped nor collected, look for images of poorly represented families
	Step 2.
    	- download images from families with N+ records
		- manually check all the images (to avoid drawings, images of labels, images where head is destroyed, etc.)
        

We ended up with 11 families and 884 images.

In [1]:
import csv
import os
import urllib

# Split the raw iDigBio multimedia records into three groups:
#   cleaned_list - rows accepted as head (frontal) images,
#   remained     - rows that no rule below decided on (to be reviewed later),
#   everything else is discarded (counted only through total_media).
# The rules are applied in order; the first one that matches a row wins.
# newline='' is what the csv module expects for files handed to csv.reader.
with open('metadata/multimedia_raw.csv', 'rt', newline='') as csvfile:
    reader = csv.reader(csvfile)

    # The first row holds the column titles. It is kept aside so it can be
    # written back as a header later, and it is NOT counted or classified as a
    # media record (previously it was, which inflated total_media by one and
    # could place the header among the data rows).
    first_row = next(reader, [])

    total_media = 0     # number of data rows read
    remained = []       # rows not matched by any rule
    cleaned_list = []   # rows accepted as head images

    # Providers dropped outright: some institutions provided only photos of
    # labels with the insects barely visible, or fossils.
    # Columns 100 and 54 are both compared against institution names.
    skipped_in_col_100 = (
        'University of Hawaii Insect Museum',
        'University of California Museum of Paleontology',
        'CUML',
        'Queensland Museum',
        'something',  # placeholder value kept from the original rule list
    )
    skipped_in_col_54 = (
        'Michigan State University',
        'University of Minnesota',
    )
    colorado = 'Colorado Plateau Museum of Arthropod Biodiversity (NAUF-CPMAB)'
    harvard_mcz = 'Museum of Comparative Zoology, Harvard University'

    # Substrings of column 5 that mark fossils, audio media, slides, drawers,
    # broken links or records without access.
    # NOTE(review): columns 5 and 15 look like media links / file names -
    # confirm against the CSV header.
    excluded_in_col_5 = ('PALE', 'flickr', 'invert', 'utexas', 'osuc', 'yale')

    # Substrings that mark a record as something other than a head image.
    # NOTE(review): column 29 presumably describes the view and column 30 the
    # depicted body part - confirm against the CSV header.
    other_views = ('Later', 'Dors', 'Ventr')
    other_parts = ('Tho', 'Gen', 'Whol')

    for row in reader:
        total_media += 1

        if row[100] in skipped_in_col_100 or row[54] in skipped_in_col_54:
            pass

        # Colorado: keep only the files tagged '_F.', drop the rest.
        elif row[100] == colorado:
            if '_F.' in row[15]:
                cleaned_list.append(row)

        # Harvard: keep the '_hef' files; drop everything else from the MCZ.
        # Other Harvard collections without '_hef' fall through to the rules
        # below.
        elif 'Harvard University' in row[100] and '_hef' in row[5]:
            cleaned_list.append(row)
        elif row[100] == harvard_mcz:
            pass

        elif 'macaulay' in row[15] or \
                any(marker in row[5] for marker in excluded_in_col_5):
            pass

        # Get rid of the images that are reported as showing another view or
        # another body part.
        elif any(view in row[29] for view in other_views) or \
                any(part in row[30] for part in other_parts):
            pass

        elif 'Head' in row[30]:
            cleaned_list.append(row)

        # Not treated by any rule above.
        else:
            remained.append(row)
            

# Summary of the filtering pass: accepted rows, discarded rows (everything that
# was neither accepted nor left for later), rows still to treat, and the total.
print('true\t\t', len(cleaned_list))
print('cleaned\t\t', total_media - len(cleaned_list) - len(remained))
# A bare `print` is a no-op expression in Python 3; call it to emit the
# intended blank line.
print()
print('remained to treat\t', len(remained))
print('total data\t\t', total_media)
print()

#### next, we checked institutions which we still have to treat

# Tally the untreated rows by the value of column 100 (the column the filtering
# rules compare against institution names), then print "count name" per entry.
to_treat = {}
for row in remained:
    institution = row[100]
    to_treat[institution] = to_treat.get(institution, 0) + 1
for key, count in to_treat.items():
    print (count, key, '\n')

#### NOT IN USE

# The cell below shows the links (column 5) of the untreated images whose link
# contains 'arctos', followed by how many there are.
num = 0
for row in remained:
    link = row[5]
    if 'arctos' not in link:
        continue
    num += 1
    print (link)
print (num)

### save the list as csv file and add the titles in the first row

# Write the accepted rows and the still-untreated rows to their own CSV files,
# each starting with the column titles.
# The header is inserted only when it is not already the first entry, so
# re-running this cell does not stack duplicate header rows.
if not cleaned_list or cleaned_list[0] != first_row:
    cleaned_list.insert(0, first_row)

# newline='' lets the csv module control line endings itself; without it every
# row is followed by an empty line on platforms that translate '\n' to '\r\n'.
with open('metadata/newcleaned_head.csv', 'wt', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(cleaned_list)

if not remained or remained[0] != first_row:
    remained.insert(0, first_row)

with open('metadata/newremained.csv', 'wt', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(remained)

# download the images 
from **cleaned_head.csv** and, if you wish, manually add more examples from **remained.csv** for families with fewer examples

In [7]:
def download_all(records, dictionary, save_to, tax_level='family', num=25):
    """Download the images of every sufficiently large group into its own folder.

    For each key of ``dictionary`` whose value holds more than ``num`` entries,
    the folder ``save_to/<key>`` is created if needed and, for each entry, every
    record whose first field equals that entry is downloaded to
    ``save_to/<key>/<record[2]>.jpg``.

    Args:
        records: sequence of rows (lists of strings). Field 0 is matched
            against the entries of ``dictionary``, field 1 is the URL to
            fetch, field 2 is used as the file name, and field 3 / field 4 is
            printed for the 'family' / 'genus' level. Empty rows are ignored.
        dictionary: mapping of group name to a sized iterable of record
            identifiers (values of field 0).
        save_to: directory under which the per-group folders are created.
        tax_level: 'genus' prints field 4 of each downloaded record; any other
            value prints field 3.
        num: only groups with strictly more than ``num`` entries are
            downloaded.

    Returns:
        None. Progress is printed and the images are written to disk.
    """
    # urlretrieve lives in urllib.request on Python 3; `urllib.urlretrieve`
    # only exists on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    level = 4 if tax_level == 'genus' else 3

    # Index the records by identifier once instead of rescanning the whole
    # list for every entry. Several records may share one identifier, so each
    # identifier maps to a list and the original download order is preserved.
    records_by_id = {}
    for record in records:
        if record:  # csv.reader yields [] for blank lines
            records_by_id.setdefault(record[0], []).append(record)

    for key, values in dictionary.items():
        if len(values) <= num:
            continue
        print()
        print(len(values), key)
        print()
        directory = os.path.join(save_to, key)
        if not os.path.exists(directory):
            os.makedirs(directory)
        for value in values:
            for record in records_by_id.get(value, ()):
                print(record[level])
                urlretrieve(record[1], os.path.join(directory, record[2] + ".jpg"))

In [8]:
# Read every row of the provided image list into `records`, in file order.
with open('D1_list_of_filtered_images.csv', 'rt') as csv1:
    dorsal = csv.reader(csv1)
    record = []
    records = list(dorsal)

In [11]:
# Download the images into per-group folders under the 'family' directory,
# restricted to groups with more than 25 entries.
# NOTE(review): `family` is not defined in the visible part of this notebook
# (the cell numbering jumps from In [8] to In [11]). download_all() looks up
# dictionary[key] and compares each contained value with the first field of a
# record, so `family` presumably maps a family name to a list of record
# identifiers - confirm and define it before running this cell.
download_all(records, family, 'family', num=25)

# make sure you examine and clean your dataset manually

### or you can simply download the filtered images from the list we provided

    D1_list_of_filtered_images.csv