In [2]:
import glob
import os
import re

import requests
from PIL import Image


In [3]:
# loading msrc images

# Insert path to data folder containing all image datasets
filepath = ".../data/"

msrcPath = filepath + "msrcorid/"

msrc_image_list = []
msrc_label_list = []
image_paths = []

# Walk the dataset once and keep every folder that really contains jpg files.
# The extension is compared case-insensitively and every file of a folder is
# inspected, so a folder is no longer skipped (or the cell crashed) because of
# whatever entry happens to be listed first.
for dirpath, dirnames, filenames in os.walk(msrcPath):
    dirnames.sort()  # deterministic traversal order
    jpg_names = sorted(name for name in filenames if name.lower().endswith(".jpg"))
    if not jpg_names:
        continue

    image_paths.append(dirpath)
    for jpg_name in jpg_names:
        msrc_image_list.append(Image.open(dirpath + "/" + jpg_name))
        # labels are based on folder paths
        msrc_label_list.append(dirpath)

# Find the point up to which all paths are identical, so that this shared part
# can be removed from the labels. The cut is moved back to the last path
# separator so that folder names sharing their first letters stay intact.
common_prefix = os.path.commonprefix(msrc_label_list)
cutoff_idx = max(common_prefix.rfind("/"), common_prefix.rfind("\\")) + 1

# strip the shared part and replace path separators with blank space
msrc_label_list = [
    label[cutoff_idx:].replace("\\", " ").replace("/", " ")
    for label in msrc_label_list
]

if not msrc_image_list:
    print("Warning: no .jpg images found under " + msrcPath)

    


In [None]:
# function for downloading images from the labelme website
def download_images(url, folder, timeout=30):
    """Download every .jpg image linked from a folder listing page.

    Parameters
    ----------
    url : str
        URL of the folder listing. Each image URL is built by appending the
        link target found in the page to this URL.
    folder : str
        Local directory in which the images are stored. It is created when it
        does not exist yet.
    timeout : float, optional
        Seconds to wait for the server on each request, so that a stalled
        connection cannot block the download forever.

    Returns
    -------
    None
        Images are written to ``folder``; images whose request does not return
        status 200 are skipped.
    """
    # Fetch HTML content of the page
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Could not read folder listing " + url + " (status " + str(response.status_code) + ")")
        return

    if folder:
        os.makedirs(folder, exist_ok=True)

    # Only look at link targets (href="..."): the visible link text repeats the
    # file name and would otherwise be mistaken for a second image.
    # dict.fromkeys removes duplicates while keeping the page order.
    image_names = dict.fromkeys(re.findall(r'href="([^"]*\.jpg)"', response.text))

    for image_name in image_names:
        # Get full image url
        image_url = url + image_name

        # Download image
        image_retrieval = requests.get(image_url, timeout=timeout)
        if image_retrieval.status_code == 200:  # only store successful retrievals
            # Save the image to folder
            filename = os.path.join(folder, os.path.basename(image_url))

            with open(filename, 'wb') as file:
                file.write(image_retrieval.content)

In [None]:
# function for generating the list of folder urls from the labelme website
def find_folders(url="http://labelme.csail.mit.edu/Images/", timeout=30):
    """Return the URLs of all folders linked from a directory listing page.

    Parameters
    ----------
    url : str, optional
        URL of the listing page; defaults to the LabelMe image root.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    list of str
        ``url`` joined with every folder link target, in page order. The list
        is empty when the page contains no folder entries.
    """
    response = requests.get(url, timeout=timeout)

    # all folder hyperlinks start with the alt="[DIR]" marker, and the link
    # target ends at the next double quote.
    folder_names = re.findall(r'alt="\[DIR\]"></td><td><a href="([^"]*)"', response.text)

    return [url + folder_name for folder_name in folder_names]


In [None]:
# helper for obtain_labels: object names of one annotation file
def _annotation_names(annotation_text, forbidden_years):
    """Return the ``<name>`` entries of an annotation, skipping late additions.

    A name is skipped when any of ``forbidden_years`` occurs in the 250
    characters that follow its closing ``</name>`` tag.
    """
    names = []
    for match in re.finditer(r'<name>(.*?)</name>', annotation_text, re.DOTALL):
        tail_start = match.end(1)
        # slicing clamps at the end of the text, so no explicit length is needed
        tail = annotation_text[tail_start:tail_start + 250]
        if not any(year in tail for year in forbidden_years):
            names.append(match.group(1))
    return names


# function for getting the associated image labels from the labelme website
def obtain_labels(url_folders, timeout=30):
    """Collect one label string per image linked from the given folder pages.

    Parameters
    ----------
    url_folders : iterable of str
        URLs of the image folder listings.
    timeout : float, optional
        Seconds to wait for the server on each request.

    Returns
    -------
    list of str
        One entry per linked .jpg. Each entry starts with the image file name
        (used later to match labels with images), followed by the accepted
        annotation names, all separated by single spaces.
    """
    # some annotations were added after publication of the unbiasedness paper,
    # and they also have a tendency to be wrong, so these labels are not added
    forbidden_years = [str(year) for year in range(2011, 2025)]

    LabelMe_labels = []

    for folder_url in url_folders:
        folder_html = requests.get(folder_url, timeout=timeout).text

        # Only link targets are used: the visible link text repeats the file
        # name and must not produce a second (garbage) entry.
        image_names = dict.fromkeys(re.findall(r'href="([^"]*\.jpg)"', folder_html))

        for image_name in image_names:
            # change url to the associated annotation url: swap the first
            # "Images" (the path segment) and the trailing image extension
            image_url = (folder_url + image_name).replace("Images", "Annotations", 1)
            label_url = image_url[:-len(".jpg")] + ".xml"

            annotation_text = requests.get(label_url, timeout=timeout).text

            # the label starts with the name of the .jpg image, followed by
            # every accepted annotation name
            names = _annotation_names(annotation_text, forbidden_years)
            LabelMe_labels.append(" ".join([image_name] + names))

    return LabelMe_labels

  

In [None]:
# Download images from labelme website

# retrieve all folder urls which have the images
url_folders = find_folders()

# Number of folders to use. 3 keeps a test run short;
# set to None to use every folder (full image set).
max_folders = 3
url_folders = url_folders[:max_folders]

# Specify correct path to label me folder
folder_to_store_images = ".../data/LabelMe"

# Download the images (WARNING: might take a while for the full image set)
for url in url_folders:
    download_images(url, folder_to_store_images)

In [None]:
# loading labelme images

# Retrieve labels from website + load images from locally stored labelme folder

# Specify correct path to label me folder
current_path = ".../data/LabelMe"

current_jpgpath = current_path + "/*.jpg"

# Every label string starts with the image file name, so the labels are
# indexed by that name. This pairs each image with its own label explicitly
# instead of relying on the folder listing order matching a sorted label list.
# When the same file name occurs more than once, the last label is kept.
label_by_image_name = {}
for label in obtain_labels(url_folders):
    image_name = os.path.basename(label.split(" ", 1)[0])
    label_by_image_name[image_name] = label

labelMe_image_list = []
labelMe_label_list = []

for image_file in sorted(glob.glob(current_jpgpath)):
    image_name = os.path.basename(image_file)
    if image_name not in label_by_image_name:
        # no label was retrieved for this file, so it cannot be used
        continue
    labelMe_image_list.append(Image.open(image_file))
    labelMe_label_list.append(label_by_image_name[image_name])


In [None]:
# loading caltech images

# Insert path to data folder containing all image datasets
filepath = ".../data/"

caltech_path = filepath + "caltech-101/"

caltech_image_list = []
caltech_label_list = []
image_paths = []

# Walk the dataset once and keep every folder that really contains jpg files.
# The extension is compared case-insensitively and every file of a folder is
# inspected, so a folder is no longer skipped (or the cell crashed) because of
# whatever entry happens to be listed first.
for dirpath, dirnames, filenames in os.walk(caltech_path):
    dirnames.sort()  # deterministic traversal order
    jpg_names = sorted(name for name in filenames if name.lower().endswith(".jpg"))
    if not jpg_names:
        continue

    image_paths.append(dirpath)
    for jpg_name in jpg_names:
        caltech_image_list.append(Image.open(dirpath + "/" + jpg_name))
        # labels are based on folder paths
        caltech_label_list.append(dirpath)

# Find the point up to which all paths are identical, so that this shared part
# can be removed from the labels. The cut is moved back to the last path
# separator so that folder names sharing their first letters stay intact.
common_prefix = os.path.commonprefix(caltech_label_list)
cutoff_idx = max(common_prefix.rfind("/"), common_prefix.rfind("\\")) + 1

# strip the shared part and replace path separators with blank space
caltech_label_list = [
    label[cutoff_idx:].replace("\\", " ").replace("/", " ")
    for label in caltech_label_list
]

if not caltech_image_list:
    print("Warning: no .jpg images found under " + caltech_path)

 