In [1]:
from PIL import Image
import glob
import requests


import os


In [2]:
# function for downloading images from the labelme website
def download_images(url, folder, index_count, image_label_long_string):
    """Download the .jpg files linked from one folder listing page.

    The page at `url` is scanned for double-quoted strings that contain
    ".jpg". Each such string is treated as a link target relative to `url`.
    An image is only fetched when its local name occurs in
    `image_label_long_string`.

    Args:
        url: URL of the folder listing page; link targets found in the page
            are appended to it to form the image URLs.
        folder: Local directory the images are written into. It is not
            created here, so it has to exist already.
        index_count: Folder counter. It is written three times in front of
            the ".jpg" extension so that equally named images from different
            folders get distinct local names.
        image_label_long_string: Text searched (plain substring search) for
            the local image name, e.g. all annotation labels joined together.

    Returns:
        None. Matching images are written to `folder` as a side effect.
    """
    extension = ".jpg"

    # Fetch HTML content of the page
    response = requests.get(url)
    html_content = response.text

    # extra identifier to solve problems with overlapping names
    img_identifier = str(index_count) * 3 + extension

    index = 0
    while True:
        index = html_content.find(extension, index)

        # find() returns -1 once no further ".jpg" exists in the page
        if index == -1:
            break

        # A link target needs a closing quote; without one nothing further
        # in the page can be a complete target either.
        end_index = html_content.find('"', index)
        if end_index == -1:
            break

        # No opening quote in front of this hit: skip it and keep scanning.
        start_index = html_content.rfind('"', 0, index)
        if start_index == -1:
            index += len(extension)
            continue

        # Continue after this quoted string on the next iteration.
        index = end_index + 1

        image_url = url + html_content[start_index + 1:end_index]

        # Only the file name gets the identifier; the folder part of the
        # local path is left untouched.
        local_name = os.path.basename(image_url).replace(extension, img_identifier)

        # Quoted strings that merely contain ".jpg" somewhere (such as the
        # visible link text plus trailing markup) do not name an image.
        if not local_name.endswith(extension):
            continue

        # The label text holds bare image names, so the name (not the local
        # path including `folder`) is what has to be looked up.
        if local_name not in image_label_long_string:
            continue

        image_retrieval = requests.get(image_url)
        # quick safeguard: only store the body of a successful retrieval
        if image_retrieval.status_code == 200:
            with open(os.path.join(folder, local_name), 'wb') as file:
                file.write(image_retrieval.content)

In [3]:
# function for generating the list of folder urls from the labelme website
def find_folders(do_annotations_bool):
    """Return the URLs of all folders listed on a LabelMe index page.

    Args:
        do_annotations_bool: If truthy, the Annotations index is read,
            otherwise the Images index.

    Returns:
        A list with one URL per folder link, in page order. Each URL is the
        index URL followed by the link target found in the page.
    """
    base_url = (
        "http://labelme.csail.mit.edu/Annotations/"
        if do_annotations_bool
        else "http://labelme.csail.mit.edu/Images/"
    )

    listing = requests.get(base_url).text

    # every folder link is introduced by this exact markup and its target
    # runs up to the next double quote
    marker = 'alt="[DIR]"></td><td><a href="'

    folders = []
    cursor = 0
    while True:
        hit = listing.find(marker, cursor)
        if hit == -1:
            # no further folder entry in the page
            break

        href_begin = hit + len(marker)
        href_end = listing.find('"', href_begin)
        if href_end == -1:
            # unterminated link target: stop scanning
            break

        folders.append(base_url + listing[href_begin:href_end])
        cursor = href_end + 1

    return folders






In [4]:
# function for getting the associated image labels from the labelme website
def _text_between(document, opening, closing):
    """Return the text between the first `opening` and the next `closing`, or "" if either is missing."""
    begin = document.find(opening)
    if begin == -1:
        return ""
    begin += len(opening)
    end = document.find(closing, begin)
    if end == -1:
        return ""
    return document[begin:end]


def _build_annotation_label(xml_text, index_count):
    """Build the label string for the text of one annotation XML file."""
    # some annotations were added after publication of the unbiasedness paper,
    # and they also have a tendency to be wrong, so these labels are not added
    forbidden_years = tuple(str(year) for year in range(2011, 2025))
    # number of characters after a closing name tag that are searched for a year
    year_window = 250

    # the label starts with the image name (used for sorting purposes later)
    # and the folder name
    label = (
        _text_between(xml_text, '<filename>', '</filename>')
        + "><"
        + _text_between(xml_text, '<folder>', '</folder>')
        + ">>"
    )
    # same identifier scheme as download_images uses for the local file names
    label = label.replace(".jpg", str(index_count) * 3 + ".jpg")

    cursor = 0
    while True:
        name_open = xml_text.find('<name>', cursor)
        if name_open == -1:
            break
        name_begin = name_open + len('<name>')
        name_end = xml_text.find('</name>', name_begin)
        if name_end == -1:
            break

        # The window is taken from the XML text itself; slicing past the end
        # of a string is safe, so no manual clamping is needed.
        trailer = xml_text[name_end:name_end + year_window]
        if not any(year in trailer for year in forbidden_years):
            label = label + " " + xml_text[name_begin:name_end]

        cursor = name_end + 1

    return label


def obtain_labels_labelMe(folder_url, index_count):
    """Collect one label string per annotation XML file linked from a folder page.

    The page at `folder_url` is scanned for double-quoted link targets ending
    in ".xml". Each target is fetched and turned into a string of the form
    "<filename>><<folder>> name1 name2 ...", where ".jpg" is replaced by
    `index_count` written three times followed by ".jpg". An object name is
    left out when one of the years 2011-2024 occurs within the 250 characters
    that follow its closing tag.

    Args:
        folder_url: URL of the annotation folder listing; link targets are
            appended to it to form the XML URLs.
        index_count: Folder counter used for the identifier described above.

    Returns:
        A list of label strings, in page order. XML files whose retrieval
        does not return status 200 are skipped.
    """
    extension = '.xml'
    LabelMe_labels = []

    html_folder_content = requests.get(folder_url).text

    index = 0
    while True:
        index = html_folder_content.find(extension, index)

        # all xml references have been visited
        if index == -1:
            break

        # Without a closing quote nothing further can be a complete link target.
        end_index = html_folder_content.find('"', index)
        if end_index == -1:
            break

        begin_index = html_folder_content.rfind('"', 0, index)

        # Only a quoted string that ends in ".xml" is a link target. The
        # visible link text repeats the file name outside of quotes and must
        # not be turned into a URL, however long the name is.
        if begin_index == -1 or end_index != index + len(extension):
            index += len(extension)
            continue

        # go to the next xml in the folder on the following iteration
        index = end_index + 1

        xml_url = folder_url + html_folder_content[begin_index + 1:end_index]
        xml_response = requests.get(xml_url)

        # An error page is not an annotation; do not build a label from it.
        if xml_response.status_code != 200:
            continue

        LabelMe_labels.append(_build_annotation_label(xml_response.text, index_count))

    return LabelMe_labels

  

In [42]:
# Download the labelme annotations into labelMe_label_list
# (one label string per annotation file).

# retrieve the urls of all annotation folders
url_folders = find_folders(True)

# uncomment next line for testing (here only the first 3 folders are processed)
# url_folders = url_folders[0:3]

labelMe_label_list = []
count = 0

# to resume an interrupted run, set count to the first unprocessed folder:
# count = X
# url_folders = url_folders[count:len(url_folders)]

# Total used for the progress output. It is derived from the folder list
# instead of a fixed number, and includes folders skipped when resuming.
total_folders = count + len(url_folders)

for folder_url in url_folders:
    print(folder_url)

    # count doubles as the per-folder identifier embedded in the image names
    labelMe_label_list.extend(obtain_labels_labelMe(folder_url, count))

    print("Just finished count:", count, ". Percentage done:", 100*(count+1)/total_folders)
    count += 1


http://labelme.csail.mit.edu/Annotations/05june05_static_indoor/
Just finished count: 0 . Percentage done: 0.33783783783783783
http://labelme.csail.mit.edu/Annotations/05june05_static_street_boston/
Just finished count: 1 . Percentage done: 0.6756756756756757
http://labelme.csail.mit.edu/Annotations/05june05_static_street_porter/
Just finished count: 2 . Percentage done: 1.0135135135135136
http://labelme.csail.mit.edu/Annotations/10feb04_static_cars_highland/
Just finished count: 3 . Percentage done: 1.3513513513513513
http://labelme.csail.mit.edu/Annotations/10feb04_static_cars_techsquare_lot/
Just finished count: 4 . Percentage done: 1.6891891891891893
http://labelme.csail.mit.edu/Annotations/10feb04_static_cars_underground/
Just finished count: 5 . Percentage done: 2.027027027027027
http://labelme.csail.mit.edu/Annotations/10feb04_static_techsquare/
Just finished count: 6 . Percentage done: 2.364864864864865
http://labelme.csail.mit.edu/Annotations/30may05_static_street_cambridge/
J

Just finished count: 64 . Percentage done: 21.95945945945946
http://labelme.csail.mit.edu/Annotations/sep1_seq2_bldg400_outdoor/
Just finished count: 65 . Percentage done: 22.2972972972973
http://labelme.csail.mit.edu/Annotations/sep1_seq3_bldg400_outdoor/
Just finished count: 66 . Percentage done: 22.635135135135137
http://labelme.csail.mit.edu/Annotations/sep1_seq4_bldg400_outdoor/
Just finished count: 67 . Percentage done: 22.972972972972972
http://labelme.csail.mit.edu/Annotations/seq_april29_04_stata_fl4_drayfoos_atb_a/
Just finished count: 68 . Percentage done: 23.31081081081081
http://labelme.csail.mit.edu/Annotations/seq_april29_04_stata_fl4_drayfoos_atb_b/
Just finished count: 69 . Percentage done: 23.64864864864865
http://labelme.csail.mit.edu/Annotations/seq_feb19_bldg400_fl6_a/
Just finished count: 70 . Percentage done: 23.986486486486488
http://labelme.csail.mit.edu/Annotations/seq_feb21_bldg400_fl6_b/
Just finished count: 71 . Percentage done: 24.324324324324323
http://la

Just finished count: 125 . Percentage done: 42.567567567567565
http://labelme.csail.mit.edu/Annotations/static_houses_boston_2005/
Just finished count: 126 . Percentage done: 42.9054054054054
http://labelme.csail.mit.edu/Annotations/static_indoor_2005_china/
Just finished count: 127 . Percentage done: 43.24324324324324
http://labelme.csail.mit.edu/Annotations/static_indoor_bathroom/
Just finished count: 128 . Percentage done: 43.58108108108108
http://labelme.csail.mit.edu/Annotations/static_indoor_bedroom_web/
Just finished count: 129 . Percentage done: 43.91891891891892
http://labelme.csail.mit.edu/Annotations/static_indoor_comerce/
Just finished count: 130 . Percentage done: 44.25675675675676
http://labelme.csail.mit.edu/Annotations/static_indoor_corridor/
Just finished count: 131 . Percentage done: 44.5945945945946
http://labelme.csail.mit.edu/Annotations/static_indoor_database_by_aude_oliva/
Just finished count: 132 . Percentage done: 44.932432432432435
http://labelme.csail.mit.edu

Just finished count: 182 . Percentage done: 61.82432432432432
http://labelme.csail.mit.edu/Annotations/static_outdoor_nature_galapagos_photos_by_fredo_durand/
Just finished count: 183 . Percentage done: 62.16216216216216
http://labelme.csail.mit.edu/Annotations/static_outdoor_nature_park_forest_san_jose_california/
Just finished count: 184 . Percentage done: 62.5
http://labelme.csail.mit.edu/Annotations/static_outdoor_nature_squirrel_photos_by_fredo_durand/
Just finished count: 185 . Percentage done: 62.83783783783784
http://labelme.csail.mit.edu/Annotations/static_outdoor_nature_tanzania_photos_by_fredo_durand/
Just finished count: 186 . Percentage done: 63.17567567567568
http://labelme.csail.mit.edu/Annotations/static_outdoor_oxford_submitted_alyosha_efros/
Just finished count: 187 . Percentage done: 63.513513513513516
http://labelme.csail.mit.edu/Annotations/static_outdoor_prague_submitted_alyosha_efros/
Just finished count: 188 . Percentage done: 63.851351351351354
http://labelme.c

Just finished count: 239 . Percentage done: 81.08108108108108
http://labelme.csail.mit.edu/Annotations/static_street_outdoor_palma_mallorca_spain/
Just finished count: 240 . Percentage done: 81.41891891891892
http://labelme.csail.mit.edu/Annotations/static_street_outdoor_palma_mallorca_spain_2007/
Just finished count: 241 . Percentage done: 81.75675675675676
http://labelme.csail.mit.edu/Annotations/static_street_outdoor_pittsburgh_usa/
Just finished count: 242 . Percentage done: 82.0945945945946
http://labelme.csail.mit.edu/Annotations/static_street_statacenter_cambridge_outdoor_2005/
Just finished count: 243 . Percentage done: 82.43243243243244
http://labelme.csail.mit.edu/Annotations/static_street_urban_mineapolis_minesota_usa/
Just finished count: 244 . Percentage done: 82.77027027027027
http://labelme.csail.mit.edu/Annotations/static_street_village_fuensaldana_spain/
Just finished count: 245 . Percentage done: 83.10810810810811
http://labelme.csail.mit.edu/Annotations/static_submit

In [43]:
# Write every collected label to Annotations.txt in the LabelMe data folder.
# The labels are separated by ",;.," so the file can be split again later.

folder_to_store_images = ".../data/LabelMe"
txt_path = f"{folder_to_store_images}/Annotations.txt"

with open(txt_path, "w", encoding="utf-8") as f:
    f.write(",;.,".join(labelMe_label_list))


In [None]:
# Download images from labelme website DELETE LATER DELETE LATER
# (an equivalent download cell exists further down in this notebook)

# Retrieve the urls of all folders which have the images. The images live
# under Images/, which is what find_folders(False) lists; the Annotations/
# folders returned by find_folders(True) only link .xml files.
url_folders = find_folders(False)

url_folders = url_folders[0:2]

# uncomment next line for testing (here only images in the first 3 folders are downloaded)
# url_folders = url_folders[0:3]

# Specify correct path to label me folder
folder_to_store_images = ".../data/LabelMe"

print(len(url_folders))

# counter has to equal the count that was used for the same folder when the
# labels were built, because it is part of the stored image names.
# NOTE(review): this assumes Images/ and Annotations/ list their folders in
# the same order - confirm.
counter = 0

# A list has no join() method; the separator string joins the list.
long_label_string = ",;.,".join(labelMe_label_list)

# Download the images (WARNING: might take a while for the full image set)
for url in url_folders:
    print(url)
    download_images(url, folder_to_store_images, counter, long_label_string)
    print("Just finished count: ", counter)
    counter += 1


In [11]:

folder_to_store_images = ".../data/LabelMe"
# labelMe_label_list = labelMe_label_List[:]
txt_path = folder_to_store_images + "/Annotations.txt"


with open(txt_path, "r", encoding='utf-8') as f:
    labelMe_label_list_unsplit = f.read()
    
    labelMe_label_list = labelMe_label_list_unsplit.split(",;.,")
    
print(len(labelMe_label_list))


idx = 0
for labels in labelMe_label_list:
    
    if ".jpg" in labels:
        
        stop_index = labels.find('><')
        if ()
#         print(labels[0:stop_index])
    else:
        print(labels)
        break
    idx += 1
    
print("in total", sum(".jpg" in s for s in labelMe_label_list), "occurences of .jpg in the annotations")



80545
0abayu5pjghnpeh872w1mo0dq4wwkocc32qseh><static_houses_boston_2005>> window window window window door stairs housetop for sale sign window bushes house window window window house number flower
in total 80383 occurences of .jpg in the annotations


In [None]:
# Download images from labelme website

# Retrieve the urls of all folders which have the images. The images live
# under Images/, which is what find_folders(False) lists; the Annotations/
# folders returned by find_folders(True) only link .xml files.
url_folders = find_folders(False)

url_folders = url_folders[0:2]

# uncomment next line for testing (here only images in the first 3 folders are downloaded)
# url_folders = url_folders[0:3]

# Specify correct path to label me folder
folder_to_store_images = ".../data/LabelMe"

# counter has to equal the count that was used for the same folder when the
# labels were built, because it is part of the stored image names.
# NOTE(review): this assumes Images/ and Annotations/ list their folders in
# the same order - confirm.
counter = 0

# A list has no join() method; the separator string joins the list.
long_label_string = ",;.,".join(labelMe_label_list)

# Download the images (WARNING: might take a while for the full image set)
for url in url_folders:
    print(url)
    download_images(url, folder_to_store_images, counter, long_label_string)
    print("Just finished count: ", counter)
    counter += 1
