In [None]:
import requests
import os
import time
import random
from urllib.parse import urlparse, quote
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

class SimpleImageDownloader:
    """Collect image URLs from Pixabay, Unsplash and Pexels and download them.

    A single ``requests.Session`` is shared by every request.  Search methods
    return plain lists of URL strings; download methods write files to disk and
    report success/failure counts.

    Args:
        pixabay_api_key: Optional Pixabay API key.  When omitted, the
            ``PIXABAY_API_KEY`` environment variable is used if set, otherwise
            the key that was originally embedded in this file.
    """

    # Responses/files smaller than this many bytes are rejected as non-images.
    MIN_IMAGE_BYTES = 5000

    # NOTE(review): this key was hard-coded in the original source and is kept
    # only as a backward-compatible fallback.  Prefer passing a key to the
    # constructor or setting PIXABAY_API_KEY, and consider rotating this one.
    pixabay_api_key = '9656065-a4094594c34f9ac14c7fc4c39'

    # Matches JPEG URLs served from the Pexels image CDN inside a search page.
    _PEXELS_IMAGE_RE = re.compile(
        r'https://images\.pexels\.com/photos/\d+/[^"]*\.jpeg\?[^"]*'
    )

    def __init__(self, pixabay_api_key=None):
        self.session = requests.Session()
        override = pixabay_api_key or os.environ.get('PIXABAY_API_KEY')
        if override:
            self.pixabay_api_key = override
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0'
        ]
        self.update_headers()

    def update_headers(self):
        """Reset the session-wide headers, picking a random User-Agent.

        This mutates shared session state, so it should only be called from
        the coordinating thread, never from download workers.
        """
        self.session.headers.update({
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def _collect_paged(self, source_name, num_images, per_page, fetch_page):
        """Gather up to ``num_images`` URLs by calling ``fetch_page`` page by page.

        ``fetch_page(page)`` must return a list of URLs for that page, or
        ``None`` to signal that paging should stop (non-200 response or an
        empty result page).  If a page raises, the error is printed and the
        URLs gathered so far are still returned instead of being discarded.
        """
        collected = []
        if num_images <= 0:
            return collected
        # A few extra pages in case some pages yield fewer URLs than per_page.
        total_pages = (num_images // per_page) + 5
        try:
            for page in range(1, total_pages + 1):
                page_urls = fetch_page(page)
                if page_urls is None:
                    break
                collected.extend(page_urls)
                if len(collected) >= num_images:
                    break
        except Exception as e:
            print(f"{source_name} 검색 오류: {e}")
        return collected[:num_images]

    def search_pixabay_images(self, keyword, num_images=100):
        """Return up to ``num_images`` ``webformatURL`` values from the Pixabay API."""
        print(f"Pixabay에서 '{keyword}' 이미지 검색 중...")
        per_page = 20

        def fetch_page(page):
            params = {
                'key': self.pixabay_api_key,
                'q': keyword,
                'image_type': 'photo',
                'per_page': per_page,
                'page': page,
                'safesearch': 'true'
            }
            response = self.session.get("https://pixabay.com/api/", params=params, timeout=10)
            if response.status_code != 200:
                return None
            hits = response.json().get('hits', [])
            if not hits:
                return None
            return [img.get('webformatURL') for img in hits if img.get('webformatURL')]

        return self._collect_paged("Pixabay", num_images, per_page, fetch_page)

    def search_unsplash_images(self, keyword, num_images=100):
        """Return up to ``num_images`` ``urls.regular`` values from Unsplash search."""
        print(f"Unsplash에서 '{keyword}' 이미지 검색 중...")
        per_page = 30
        search_url = "https://unsplash.com/napi/search/photos"

        def fetch_page(page):
            params = {
                'query': keyword,
                'per_page': per_page,
                'page': page,
                'order_by': 'relevant'
            }
            response = self.session.get(search_url, params=params, timeout=10)
            if response.status_code != 200:
                return None
            results = response.json().get('results', [])
            if not results:
                return None
            # `or {}` guards against an explicit null "urls" field.
            candidates = ((photo.get('urls') or {}).get('regular') for photo in results)
            return [u for u in candidates if u]

        return self._collect_paged("Unsplash", num_images, per_page, fetch_page)

    def search_pexels_images(self, keyword, num_images=100):
        """Return up to ``num_images`` image URLs scraped from Pexels search pages."""
        print(f"Pexels에서 '{keyword}' 이미지 검색 중...")
        per_page = 20  # a page actually carries roughly 20-24 images
        # safe='' so a '/' inside the keyword cannot split the URL path.
        encoded_keyword = quote(keyword, safe='')

        def fetch_page(page):
            search_url = f"https://www.pexels.com/search/{encoded_keyword}/?page={page}"
            response = self.session.get(search_url, timeout=10)
            if response.status_code != 200:
                return None
            matches = self._PEXELS_IMAGE_RE.findall(response.text)
            # De-duplicate within the page while keeping first-seen order.
            unique_page_urls = list(dict.fromkeys(matches))
            return unique_page_urls or None

        return self._collect_paged("Pexels", num_images, per_page, fetch_page)

    def download_image(self, url, filename):
        """Download ``url`` to ``filename``; return True on success, else False.

        The body is streamed into ``<filename>.part`` and only renamed to
        ``filename`` once it is complete and at least ``MIN_IMAGE_BYTES`` long,
        so an interrupted transfer never leaves a truncated file that a later
        run would mistake for a finished download.  Any error is treated as a
        failed download (best effort) rather than raised.
        """
        tmp_path = f"{filename}.part"
        try:
            # Per-request header instead of update_headers(): this runs in
            # worker threads and must not mutate the shared session headers.
            headers = {'User-Agent': random.choice(self.user_agents)}
            with self.session.get(url, timeout=15, stream=True, headers=headers) as response:
                response.raise_for_status()
                content_length = response.headers.get('content-length')
                if content_length and int(content_length) < self.MIN_IMAGE_BYTES:
                    return False
                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            if os.path.getsize(tmp_path) < self.MIN_IMAGE_BYTES:
                return False
            os.replace(tmp_path, filename)
            return True
        except Exception:
            # Deliberate best effort: callers aggregate and report failures.
            return False
        finally:
            # Remove the partial file on every non-success path.
            if os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    def download_images_parallel(self, urls, keyword_dir, safe_keyword, num_images):
        """Download the first ``num_images`` URLs concurrently.

        Files are named ``<safe_keyword>_<NNNN>.<ext>`` inside ``keyword_dir``.
        A file that already exists is skipped and counted as downloaded.

        Returns:
            ``(downloaded_count, failed_count)``.  ``(0, 0)`` when there is
            nothing to download.
        """
        if num_images <= 0:
            return 0, 0
        jobs = list(enumerate(urls[:num_images]))
        if not jobs:
            # ThreadPoolExecutor rejects max_workers=0, so bail out early.
            return 0, 0

        downloaded_count = 0
        failed_count = 0
        max_workers = min(10, len(jobs))

        def worker(idx_url):
            i, url = idx_url
            try:
                path = urlparse(url).path
            except ValueError:
                # Malformed URL: fall back to the default extension.
                path = ''
            extension = 'jpg'
            if '.' in path:
                ext = path.split('.')[-1].lower()
                if ext in ('jpg', 'jpeg', 'png', 'webp'):
                    extension = ext
            filename = os.path.join(keyword_dir, f"{safe_keyword}_{i+1:04d}.{extension}")
            if os.path.exists(filename):
                return 'skip', filename, url
            if self.download_image(url, filename):
                return 'success', filename, url
            return 'fail', filename, url

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(worker, job) for job in jobs]
            for f in as_completed(futures):
                status, fname, url = f.result()
                if status == 'success':
                    downloaded_count += 1
                    # Print the first ten, then every 50th, to limit noise.
                    if downloaded_count % 50 == 0 or downloaded_count <= 10:
                        print(f"✓ 다운로드 완료: {os.path.basename(fname)} ({downloaded_count}/{num_images})")
                elif status == 'fail':
                    failed_count += 1
                    if failed_count % 20 == 0:
                        print(f"✗ 다운로드 실패 누적: {failed_count}개 (예: {url})")
                elif status == 'skip':
                    downloaded_count += 1
        return downloaded_count, failed_count

    def download_images_for_keyword(self, keyword, download_dir, num_images=1000):
        """Search all three sources for ``keyword`` and download the results.

        The target count is split evenly across Pixabay, Unsplash and Pexels
        (the remainder goes to Pixabay first, then Unsplash).  Images are saved
        under ``download_dir/<sanitised keyword>/``.  Returns ``None``.
        """
        print(f"\n=== '{keyword}' 키워드 처리 시작 ===")
        print(f"목표: {num_images}개 이미지")
        # Rotate the session User-Agent once per keyword, from this thread only.
        self.update_headers()
        safe_keyword = keyword.replace(' ', '_').replace('-', '_').replace('/', '_').replace('\\', '_')
        keyword_dir = os.path.join(download_dir, safe_keyword)
        os.makedirs(keyword_dir, exist_ok=True)
        print(f"저장 폴더: {keyword_dir}")

        images_per_source = num_images // 3
        remaining = num_images % 3
        all_image_urls = []

        pixabay_target = images_per_source + (1 if remaining > 0 else 0)
        unsplash_target = images_per_source + (1 if remaining > 1 else 0)
        pexels_target = images_per_source

        pixabay_urls = self.search_pixabay_images(keyword, pixabay_target)
        all_image_urls.extend(pixabay_urls)
        print(f"Pixabay에서 {len(pixabay_urls)}개 URL 수집")

        unsplash_urls = self.search_unsplash_images(keyword, unsplash_target)
        all_image_urls.extend(unsplash_urls)
        print(f"Unsplash에서 {len(unsplash_urls)}개 URL 수집")

        pexels_urls = self.search_pexels_images(keyword, pexels_target)
        all_image_urls.extend(pexels_urls)
        print(f"Pexels에서 {len(pexels_urls)}개 URL 수집")

        # Remove duplicates, keep order
        unique_urls = list(dict.fromkeys(all_image_urls))
        if not unique_urls:
            print(f"'{keyword}'에 대한 이미지를 찾을 수 없습니다.")
            return

        print(f"총 {len(unique_urls)}개의 고유한 이미지 URL을 찾았습니다.")
        if len(unique_urls) < num_images:
            print(f"⚠️  경고: 목표({num_images}개)보다 적은 {len(unique_urls)}개만 찾았습니다.")

        target_count = min(len(unique_urls), num_images)
        downloaded_count, failed_count = self.download_images_parallel(
            unique_urls, keyword_dir, safe_keyword, target_count
        )
        print(f"'{keyword}' 완료: {downloaded_count}개 이미지 다운로드, {failed_count}개 실패 (폴더: {keyword_dir})")

def main():
    """Download up to 1000 images per interior-style keyword from the stock sites.

    Creates the output directory, then processes each keyword in turn with a
    single SimpleImageDownloader.  A failure on one keyword is printed and does
    not stop the remaining keywords.
    """
    output_root = "furniture_images_simple"
    interior_styles = (
        "modern interior",
        "northern european interior",
        "vintage interior",
        "natural interior",
        "romantic interior",
        "antique interior",
        "traditional korean style interior",
    )

    os.makedirs(output_root, exist_ok=True)
    fetcher = SimpleImageDownloader()

    for banner_line in (
        "🖼️  간단한 이미지 다운로더를 시작합니다...",
        "Selenium 없이 Pixabay, Pexels, Unsplash에서 이미지를 수집합니다.\n",
    ):
        print(banner_line)

    for style in interior_styles:
        try:
            # At most 1000 images per keyword.
            fetcher.download_images_for_keyword(
                keyword=style,
                download_dir=output_root,
                num_images=1000,
            )
        except Exception as exc:
            print(f"'{style}' 처리 중 오류: {exc}")
        # Pause between keywords.
        time.sleep(2)

    print("\n🎉 모든 이미지 다운로드가 완료되었습니다!")

# Run the stock-site download only when this cell/file is executed directly.
if __name__ == "__main__":
    main()


In [None]:
from icrawler.builtin import GoogleImageCrawler
import os

def download_google_images(keyword, download_dir, num_images=1000):
    """Crawl Google Images for ``keyword`` into a per-keyword folder.

    The folder is ``download_dir`` joined with the keyword, spaces replaced by
    underscores; it is created if missing.  ``num_images`` is passed to the
    crawler as ``max_num`` and ``(200, 200)`` as ``min_size``.
    """
    folder_name = "_".join(keyword.split(" "))
    target_dir = os.path.join(download_dir, folder_name)
    os.makedirs(target_dir, exist_ok=True)

    crawler = GoogleImageCrawler(storage={'root_dir': target_dir})
    crawler.crawl(
        keyword=keyword,
        max_num=num_images,
        min_size=(200, 200),
        file_idx_offset=0,
    )

def main():
    """Crawl Google Images for each interior-style keyword, 1000 images apiece.

    Results go under ``google_images/``; a header line is printed before each
    keyword and a completion line after it.
    """
    target_root = "google_images"
    os.makedirs(target_root, exist_ok=True)

    search_terms = (
        "modern interior",
        "northern european interior",
        "vintage interior",
        "natural interior",
        "romantic interior",
        "antique interior",
        "traditional korean style interior",
    )
    for term in search_terms:
        print(f"=== {term} ===")
        download_google_images(term, target_root, num_images=1000)
        print(f"{term} 완료")

# Run the Google Images crawl only when this cell/file is executed directly.
if __name__ == "__main__":
    main()


2025-05-29 17:34:19,741 - INFO - icrawler.crawler - start crawling...
2025-05-29 17:34:19,741 - INFO - icrawler.crawler - starting 1 feeder threads...
2025-05-29 17:34:19,741 - INFO - icrawler.crawler - starting 1 parser threads...
2025-05-29 17:34:19,741 - INFO - icrawler.crawler - starting 1 downloader threads...


=== modern interior ===


2025-05-29 17:34:20,713 - INFO - parser - parsing result page https://www.google.com/search?q=modern+interior&ijn=0&start=0&tbs=&tbm=isch
2025-05-29 17:34:21,735 - INFO - downloader - image #1	https://www.decorilla.com/online-decorating/wp-content/uploads/2023/12/Modern-interior-design-ideas-for-a-living-room-by-Decorilla.jpg
2025-05-29 17:34:21,889 - INFO - downloader - image #2	https://www.thespruce.com/thmb/4Mxuujjo8BQ3woAAFhtUqNRslgo=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/DesignbyEmilyHenderson_MountainHouseLivingRoom_PhotobySaraLigorria-TrampforEHD_9-79d20b8810c24403b627c6ee543dd538.jpg
2025-05-29 17:34:22,439 - INFO - downloader - image #3	https://auramodernhome.com/cdn/shop/articles/img-1707349210324.jpg
2025-05-29 17:34:23,702 - INFO - downloader - image #4	https://www.nawy.com/blog/wp-content/uploads/2022/07/Modern-Interior-Design.jpg
2025-05-29 17:34:24,916 - INFO - downloader - image #5	https://blog.buyerselect.com/wp-content/uploads/2024/05/organic-moder

modern interior 완료
=== northern european interior ===


2025-05-29 17:36:32,726 - INFO - parser - parsing result page https://www.google.com/search?q=northern+european+interior&ijn=0&start=0&tbs=&tbm=isch
2025-05-29 17:36:34,379 - INFO - downloader - image #1	https://www.pufikhomes.com/wp-content/uploads/2018/06/yarkaya-kvartira-v-stokgolme-2-1.jpg
2025-05-29 17:36:35,556 - INFO - downloader - image #2	https://www.pufikhomes.com/wp-content/uploads/2018/06/funkcionalnoe-zhile-v-stokgolme-2.jpg
2025-05-29 17:36:35,833 - ERROR - downloader - Response status code 400, file https://media.istockphoto.com/id/1190166068/photo/stylish-scandinavian-living-room-with-design-mint-sofa-furnitures-mock-up-poster-map-plants.jpg
2025-05-29 17:36:36,035 - ERROR - downloader - Response status code 400, file https://media.istockphoto.com/id/1824615178/photo/interior-design-of-modern-apartment-with-colorful-dark-walls-and-orange-sofa-interior-mockup.jpg
2025-05-29 17:36:36,113 - INFO - downloader - image #3	https://www.pufikhomes.com/wp-content/uploads/2018/06/

northern european interior 완료
=== vintage interior ===


2025-05-29 17:37:45,192 - INFO - parser - parsing result page https://www.google.com/search?q=vintage+interior&ijn=0&start=0&tbs=&tbm=isch
2025-05-29 17:37:46,721 - INFO - downloader - image #1	https://www.cyruscrafts.com/img/cms/blog/vintage-interior-design/vintage-interior.jpg
2025-05-29 17:37:47,155 - INFO - downloader - image #2	https://i.pinimg.com/originals/7e/36/1b/7e361bd06877378ed742550cb943c01a.jpg
2025-05-29 17:37:48,166 - INFO - downloader - image #3	https://static.wixstatic.com/media/5a4d0f_2a0c8c6f23024072932d33476506dade~mv2.png
2025-05-29 17:37:49,192 - INFO - downloader - image #4	https://decoholic.org/wp-content/uploads/2022/02/modern-vintage-interior-design-style.jpg
2025-05-29 17:37:50,381 - INFO - downloader - image #5	https://www.huntingforgeorge.com/wp-content/uploads/RL_Georgia_Ezra16368-BLOG-FEATURE.jpg
2025-05-29 17:37:50,634 - INFO - downloader - image #6	https://cdn.imweb.me/upload/S201802175a881c3a9257c/b7e4fb8bbb6b4.jpg
2025-05-29 17:37:51,072 - INFO - dow

vintage interior 완료
=== natural interior ===


2025-05-29 17:40:04,405 - INFO - parser - parsing result page https://www.google.com/search?q=natural+interior&ijn=0&start=0&tbs=&tbm=isch
2025-05-29 17:40:04,885 - INFO - downloader - image #1	https://i.pinimg.com/736x/17/c0/22/17c02234bcdb19bacf4b72fd46e6d13b.jpg
2025-05-29 17:40:05,501 - INFO - downloader - image #2	https://cdn.shopify.com/s/files/1/1917/6601/files/Natural_Interior_Design_1.jpg
2025-05-29 17:40:07,651 - INFO - downloader - image #3	https://sabiinadesign.com/wp-content/uploads/2023/09/Image-11-1-1024x1024.jpg
2025-05-29 17:40:08,726 - INFO - downloader - image #4	https://blog.cort.com/wp-content/uploads/2022/04/CORT-Natural-Interior-Design.jpg
2025-05-29 17:40:09,792 - INFO - downloader - image #5	https://media.designcafe.com/wp-content/uploads/2023/03/27203108/natural-interior-design-ideas-for-your-home.jpg
2025-05-29 17:40:10,792 - INFO - downloader - image #6	https://www.decorilla.com/online-decorating/wp-content/uploads/2022/01/Biophilic-interior-design-by-Wanda-

natural interior 완료
=== romantic interior ===


2025-05-29 17:43:02,848 - INFO - parser - parsing result page https://www.google.com/search?q=romantic+interior&ijn=0&start=0&tbs=&tbm=isch
2025-05-29 17:43:04,828 - INFO - downloader - image #1	http://shoplikha.com/cdn/shop/articles/interior-design-101-romantic-style-122050.png
2025-05-29 17:43:07,342 - INFO - downloader - image #2	https://cdn.shopify.com/s/files/1/2563/0080/files/2_20d8ba3e-b473-4c98-86fa-929431a5f607_600x600.png
2025-05-29 17:43:07,956 - INFO - downloader - image #3	https://cdn.shopify.com/s/files/1/2563/0080/files/Romantic_600x600.jpg
2025-05-29 17:43:08,444 - INFO - downloader - image #4	https://nazmiyalantiquerugs.com/wp-content/uploads/2020/03/romantic-interior-design-living-room.jpg
2025-05-29 17:43:09,446 - INFO - downloader - image #5	https://cdn.shopify.com/s/files/1/0550/1075/4765/files/carlos_garcia.jpg
