In [1]:
import os
import base64
from tqdm import tqdm
from icrawler.builtin import GoogleImageCrawler
from icrawler import ImageDownloader
from six.moves.urllib.parse import urlparse

class MyImageDownloader(ImageDownloader):
    """Image downloader that names each saved file after the URL path it came from."""

    def get_filename(self, task, default_ext):
        """Build the file name for one download task.

        Args:
            task: mapping whose 'file_url' entry holds the image URL.
            default_ext: extension used when the URL path has no extension
                or one that is not in the accepted list below.

        Returns:
            A string of the form '<base64 of the URL path>.<extension>'.
        """
        # Index 2 of the parsed URL tuple is the path component.
        url_path = urlparse(task['file_url'])[2]
        if '.' in url_path:
            extension = url_path.split('.')[-1]
            if extension.lower() not in ['jpg', 'jpeg', 'png', 'bmp']:
                extension = default_ext
        else:
            extension = default_ext
        # The URL-safe alphabet is required here: standard base64 can emit '/'
        # and '+', and a '/' inside the name would be treated as a directory
        # separator when the file is written.
        # NOTE(review): the encoded name grows with the URL path length and may
        # exceed filesystem name limits for very long paths - confirm if needed.
        filename = base64.urlsafe_b64encode(url_path.encode()).decode()
        return '{}.{}'.format(filename, extension)

def download_images(keyword, output_directory, max_num=100):
    """Run a Google image crawl for ``keyword`` and save results to disk.

    Args:
        keyword: search phrase handed to the crawler.
        output_directory: root directory of the crawler's 'FileSystem' storage.
        max_num: value forwarded as the crawl's ``max_num`` argument.
    """
    storage_config = {'backend': 'FileSystem', 'root_dir': output_directory}
    crawler_options = dict(
        feeder_threads=1,
        parser_threads=2,
        downloader_threads=4,
        storage=storage_config,
    )
    # Optional: add downloader_cls=MyImageDownloader to the options above to
    # use the custom file-naming downloader (currently disabled).
    crawler = GoogleImageCrawler(**crawler_options)

    crawler.crawl(
        keyword=keyword,
        min_size=(200, 200),
        max_size=None,
        max_num=max_num,
    )

def create_dataset(architects: list, all_works: list, output_directory: str, max_num=100) -> None:
    """Download images for every work of every architect.

    The resulting layout is ``output_directory/<architect>/<work>/``.

    Args:
        architects: architect names, used as directory names.
        all_works: one entry per architect, aligned with ``architects``; each
            entry is a newline-separated string of work titles.
        output_directory: root directory of the dataset; created if missing.
        max_num: forwarded to ``download_images`` for each work.
    """
    os.makedirs(output_directory, exist_ok=True)

    # Loop through architects and their works
    for architect, architect_works in zip(architects, all_works):
        architect_directory = os.path.join(output_directory, architect)
        os.makedirs(architect_directory, exist_ok=True)

        # A works entry that is not a string cannot be split (e.g. pandas
        # yields a float NaN for an empty spreadsheet cell); nothing to crawl.
        if not isinstance(architect_works, str):
            continue

        for raw_work in architect_works.split('\n'):
            work = raw_work.strip()
            # Blank lines would otherwise trigger a crawl with an empty keyword
            # and a target path equal to the architect directory itself.
            if not work:
                continue
            download_images(work, os.path.join(architect_directory, work), max_num)

In [3]:
# Root directory for all data files. Set the ARCH_STYLE_DATA_DIR environment
# variable to point at another location; the fallback is the original
# author's local path, so existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    'ARCH_STYLE_DATA_DIR',
    '/home/ebylmz/projects/architectural-style-classification/data',
)

In [9]:
import pandas as pd


# Scrape target folder and the spreadsheet that lists architects with their works
output_dir_path = os.path.join(DATA_DIR, 'scraping')
excel_file_path = os.path.join(DATA_DIR, 'architects-works.xlsx')

# Read the workbook into a DataFrame
excel_data = pd.read_excel(excel_file_path)

# First column -> architect names, second column -> their works
architects, works = (excel_data.iloc[:, column].tolist() for column in (0, 1))

# Only the last row of the spreadsheet is crawled by this call
create_dataset(architects[-1:], works[-1:], output_dir_path, max_num=100)

2023-12-25 23:35:59,340 - INFO - icrawler.crawler - start crawling...
2023-12-25 23:35:59,341 - INFO - icrawler.crawler - starting 1 feeder threads...
2023-12-25 23:35:59,343 - INFO - feeder - thread feeder-001 exit
2023-12-25 23:35:59,344 - INFO - icrawler.crawler - starting 2 parser threads...
2023-12-25 23:35:59,352 - INFO - icrawler.crawler - starting 4 downloader threads...
2023-12-25 23:36:00,206 - INFO - parser - parsing result page https://www.google.com/search?q=Stuttgart+Hauptbahnhof&ijn=0&start=0&tbs=&tbm=isch
2023-12-25 23:36:00,265 - INFO - downloader - skip downloading file 000001.jpg
2023-12-25 23:36:00,268 - INFO - downloader - skip downloading file 000002.jpg
2023-12-25 23:36:00,271 - INFO - downloader - skip downloading file 000003.jpg
2023-12-25 23:36:00,409 - INFO - downloader - image #4	https://www.bahnprojekt-stuttgart-ulm.de/fileadmin/_processed_/6/8/csm__14_221010_487601e06a.jpg
2023-12-25 23:36:00,475 - ERROR - downloader - Response status code 404, file https:

In [14]:
import os
import shutil

# Define paths to the original dataset and the new dataset
original_dataset_path = os.path.join(DATA_DIR, 'scraping-clean')
new_dataset_path = os.path.join(DATA_DIR, 'scraping-clean-combined')

# Architect directories in the original dataset. Listings are sorted because
# os.listdir returns entries in arbitrary order, which would make the
# sequential numbering below differ from run to run.
architect_directories = sorted(os.listdir(original_dataset_path))

count = 0

# Flatten <architect>/<artifact>/<file> into <architect>/<NNNNN.ext>
for architect_dir in architect_directories:
    architect_path = os.path.join(original_dataset_path, architect_dir)

    # Skip stray non-directory entries; listing them below would raise.
    if not os.path.isdir(architect_path):
        continue

    # Create a directory for the architect in the new dataset
    new_architect_path = os.path.join(new_dataset_path, architect_dir)
    os.makedirs(new_architect_path, exist_ok=True)

    # Iterate through each artifact directory of the architect
    artifacts_directories = sorted(os.listdir(architect_path))
    idx = 1  # numbering restarts at 1 for every architect
    for artifact_dir in artifacts_directories:
        artifact_path = os.path.join(architect_path, artifact_dir)
        if not os.path.isdir(artifact_path):
            continue

        # Copy every file of the artifact into the architect's combined directory
        artifact_files = sorted(os.listdir(artifact_path))
        for file_name in artifact_files:
            file_path = os.path.join(artifact_path, file_name)

            # shutil.copy only handles regular files; ignore nested directories.
            if not os.path.isfile(file_path):
                continue

            # Keep the original extension (including the leading dot)
            file_extension = os.path.splitext(file_name)[1]

            # New name is the zero-padded five-digit index, e.g. '00001.jpg'
            new_file_name = f"{idx:05d}{file_extension}"

            # Copy file to the new directory for the architect with the new name
            new_file_path = os.path.join(new_architect_path, new_file_name)
            shutil.copy(file_path, new_file_path)
            count += 1
            idx += 1


print(f"Number of images copied: {count}")

Number of images copied: 2020
