In [2]:
import os
import base64
from tqdm import tqdm
from icrawler.builtin import GoogleImageCrawler
from icrawler import ImageDownloader
from six.moves.urllib.parse import urlparse

class MyImageDownloader(ImageDownloader):
    """Downloader that names each saved image after the path of its source URL.

    The file stem is the URL path encoded with URL-safe Base64, so the same
    URL path always maps to the same file name. Only the path component of
    the URL is used; host and query string do not influence the name.
    """

    # Extensions accepted verbatim from the URL; anything else falls back to
    # the ``default_ext`` handed to ``get_filename``.
    _IMAGE_EXTENSIONS = frozenset(
        ['jpg', 'jpeg', 'png', 'bmp', 'tiff', 'gif', 'ppm', 'pgm'])

    # Longest encoded stem that is used as-is. Common filesystems cap a file
    # name at 255 bytes, so longer stems are replaced by a fixed-length digest.
    _MAX_STEM_LENGTH = 200

    def get_filename(self, task, default_ext):
        """Build the file name under which the image of ``task`` is stored.

        Args:
            task: Mapping holding the image address under the key ``'file_url'``.
            default_ext: Extension to use when the URL path has no extension
                or one that is not in the accepted list.

        Returns:
            A string of the form ``'<stem>.<extension>'`` that contains no
            path separator.
        """
        import hashlib  # local import keeps this definition self-contained

        url_path = urlparse(task['file_url'])[2]
        if '.' in url_path:
            extension = url_path.split('.')[-1]
            if extension.lower() not in self._IMAGE_EXTENSIONS:
                extension = default_ext
        else:
            extension = default_ext

        encoded_path = url_path.encode()
        # The standard Base64 alphabet contains '/', which would be read as a
        # directory separator when the name is joined to the storage root.
        # The URL-safe alphabet substitutes '-' and '_' for '+' and '/'.
        filename = base64.urlsafe_b64encode(encoded_path).decode()
        if len(filename) > self._MAX_STEM_LENGTH:
            # Base64 grows with the URL; fall back to a 64-character digest
            # rather than failing with "File name too long".
            filename = hashlib.sha256(encoded_path).hexdigest()
        return '{}.{}'.format(filename, extension)

def download_images(keyword, output_directory, max_num=100):
    """Run a Google image crawl for ``keyword`` and store the results on disk.

    Args:
        keyword: Search phrase handed to the crawler.
        output_directory: Root directory of the crawler's file-system storage.
        max_num: Forwarded to the crawl as ``max_num``.
    """
    storage_config = {'backend': 'FileSystem', 'root_dir': output_directory}
    crawler_options = dict(
        feeder_threads=1,
        parser_threads=2,
        downloader_threads=4,
        downloader_cls=MyImageDownloader,
        storage=storage_config,
    )
    crawler = GoogleImageCrawler(**crawler_options)

    crawler.crawl(
        keyword=keyword,
        min_size=(200, 200),
        max_size=None,
        max_num=max_num,
    )

def create_dataset(architects: list, all_works: list, output_directory: str, max_num=100) -> None:
    """Download images for every work of every architect into a folder tree.

    The resulting layout is ``<output_directory>/<architect>/<work>/``, where
    both folder names are the lower-cased original names with spaces replaced
    by hyphens.

    Args:
        architects: Architect names, one per entry.
        all_works: For each architect (same order as ``architects``), a single
            string listing that architect's works separated by newlines.
        output_directory: Root directory of the dataset; created if missing.
        max_num: Forwarded to ``download_images`` for every work.

    Entries are paired positionally with ``zip``, so surplus items in the
    longer list are ignored. Rows whose architect is not a non-blank string
    are skipped, and a works entry that is not a string is treated as having
    no works (this covers NaN values produced by empty spreadsheet cells).
    """
    os.makedirs(output_directory, exist_ok=True)

    # Loop through architects and their works
    for architect, architect_works in zip(architects, all_works):
        if not isinstance(architect, str) or not architect.strip():
            continue

        architect_formatted = architect.strip().replace(' ', '-').lower()
        architect_directory = os.path.join(output_directory, architect_formatted)
        os.makedirs(architect_directory, exist_ok=True)

        work_lines = architect_works.split('\n') if isinstance(architect_works, str) else []
        for raw_work in work_lines:
            # strip() also removes a trailing '\r' left by Windows line breaks
            work = raw_work.strip()
            if not work:
                # A blank line would trigger a crawl with an empty keyword and
                # write its results straight into the architect folder.
                continue
            work_formatted = work.replace(' ', '-').lower()
            download_images(work, os.path.join(architect_directory, work_formatted), max_num)

In [None]:
import pandas as pd

# Dataset root. Set the ARCH_DATA_DIR environment variable to run on another
# machine; without it the original location is used.
DATA_DIR = os.environ.get(
    'ARCH_DATA_DIR',
    '/home/ebylmz/projects/architectural-style-classification/data')

excel_file_path = os.path.join(DATA_DIR, 'architects-works.xlsx')
output_dir_path = os.path.join(DATA_DIR, 'scraping')

# Load the Excel file
excel_data = pd.read_excel(excel_file_path)
max_num = 100  # forwarded to create_dataset, which passes it on for every work

# Extract architect names (column index 1) and their works (column index 2).
# Rows with either cell empty are dropped together so the two lists stay
# aligned and no NaN value reaches the string handling in create_dataset.
architect_rows = excel_data.iloc[:, [1, 2]].dropna()
architects = architect_rows.iloc[:, 0].tolist()
works = architect_rows.iloc[:, 1].tolist()

create_dataset(architects, works, output_dir_path, max_num)

2023-12-12 17:20:45,934 - INFO - icrawler.crawler - start crawling...
2023-12-12 17:20:45,937 - INFO - icrawler.crawler - starting 1 feeder threads...
2023-12-12 17:20:45,942 - INFO - feeder - thread feeder-001 exit
2023-12-12 17:20:45,943 - INFO - icrawler.crawler - starting 2 parser threads...
2023-12-12 17:20:45,952 - INFO - icrawler.crawler - starting 4 downloader threads...
2023-12-12 17:20:46,834 - INFO - parser - parsing result page https://www.google.com/search?q=Ankara+Palas&ijn=0&start=0&tbs=&tbm=isch
2023-12-12 17:20:47,530 - ERROR - downloader - Exception caught when downloading file https://panel.millisaraylar.gov.tr//uploads/lokasyonlar/8b605064-2fff-4952-8724-1e2daa943878.jpg, error: HTTPSConnectionPool(host='panel.millisaraylar.gov.tr', port=443): Max retries exceeded with url: //uploads/lokasyonlar/8b605064-2fff-4952-8724-1e2daa943878.jpg (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local iss

In [2]:
import os

# Dataset root. Set the ARCH_DATA_DIR environment variable to run on another
# machine; without it the original location is used.
DATA_DIR = os.environ.get(
    'ARCH_DATA_DIR',
    '/home/ebylmz/projects/architectural-style-classification/data')

In [4]:
import os
import shutil

# Define paths to the original dataset and the new dataset
original_dataset_path = os.path.join(DATA_DIR, 'scraping-clean')
new_dataset_path = os.path.join(DATA_DIR, 'scraping-clean-combined')

# Architect directories in the original dataset. Entries that are not
# directories (stray files at the dataset root) are ignored, and the list is
# sorted so the copy order is the same on every run.
architect_directories = sorted(
    entry for entry in os.listdir(original_dataset_path)
    if os.path.isdir(os.path.join(original_dataset_path, entry))
)

# Flatten <architect>/<artifact>/<file> into <architect>/<file> in the new dataset
for architect_dir in architect_directories:
    architect_path = os.path.join(original_dataset_path, architect_dir)

    # Create a directory for the architect in the new dataset
    new_architect_path = os.path.join(new_dataset_path, architect_dir)
    os.makedirs(new_architect_path, exist_ok=True)

    # Iterate through each artifact directory of the architect
    artifacts_directories = sorted(os.listdir(architect_path))
    for artifact_dir in artifacts_directories:
        artifact_path = os.path.join(architect_path, artifact_dir)
        if not os.path.isdir(artifact_path):
            # A loose file next to the artifact folders cannot be listed
            continue

        artifact_files = sorted(os.listdir(artifact_path))
        for file_name in artifact_files:
            file_path = os.path.join(artifact_path, file_name)
            if not os.path.isfile(file_path):
                # shutil.copy handles regular files only; skip nested folders
                continue

            # Copy file to the new directory for the architect. An existing
            # file with the same name is replaced, so same-named files from
            # different artifact folders end up as a single file.
            shutil.copy(file_path, new_architect_path)
