In [1]:
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import os
import requests

from boto3 import client
import dotenv
from io import BytesIO

from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Load the project's env file so the settings read below are available through
# the process environment.
dotenv.load_dotenv('../gs-anomaly.env')

# Module-level S3 client shared by the upload helper. Credentials and region
# come from environment variables; a variable that is not set is passed as None.
s3_client = client(
    service_name="s3",
    region_name=os.environ.get("S3_REGION"),
    aws_access_key_id=os.environ.get("CREDENTIALS_ACCESS_KEY"),
    aws_secret_access_key=os.environ.get("CREDENTIALS_SECRET_KEY"),
)

In [3]:
# Directory path where images are to be saved.
# NOTE(review): no cell visible in this notebook reads IMAGE_DIR; the upload
# helper writes to S3 under the "images/" key prefix instead — confirm whether
# this constant is still needed.
IMAGE_DIR = "./images"

In [4]:
def download_and_upload_image_to_s3(url, prd_id, timeout=10):
    """Download one image and upload it to S3 as ``images/<prd_id>.jpg``.

    Args:
        url: Address of the image to fetch.
        prd_id: Product identifier; used to build the S3 object key.
        timeout: Seconds to wait on the HTTP server before giving up, so a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        True when the image was downloaded and uploaded, False otherwise.
        Failures are reported with ``print`` instead of being raised, so one
        bad product does not abort a batch run.
    """
    try:
        # The context manager closes the response on every path. With
        # stream=True an unread (non-200) response would otherwise keep its
        # connection checked out of the pool.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download image for prd_id: {prd_id} (status code: {response.status_code})")
                return False
            image_bytes = response.content

        file_name = f"{prd_id}.jpg"
        s3_client.upload_fileobj(
            BytesIO(image_bytes),
            Bucket=os.getenv("S3_BUCKET"),
            Key=f"images/{file_name}",
            ExtraArgs={'ContentType': 'image/jpeg'}
        )
        return True
    except Exception as e:
        # Deliberately broad: network and S3 errors are logged per product and
        # reported to the caller through the return value.
        print(f"Error to upload image for prd_id: {prd_id}, Error: {e}")
        return False
        

def process_images(data, max_workers=20, report_every=1000):
    """Download and upload the image for every ``prd_id`` in ``data`` concurrently.

    Args:
        data: Object whose ``'prd_id'`` entry is an iterable of product ids
            (for example a pandas DataFrame with a ``prd_id`` column).
        max_workers: Number of worker threads in the pool.
        report_every: Print a progress line each time this many tasks have
            finished; a falsy value disables progress output.

    Returns:
        The number of tasks that finished without reporting a failure. A task
        counts as failed when it raises or returns ``False``; any other return
        value (including ``None``) counts as completed.
    """
    prd_ids = list(data['prd_id'])
    total_count = len(prd_ids)
    finished = 0
    succeeded = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for prd_id in prd_ids:
            prd_id_str = str(prd_id)
            # The URL path is built from the first two and the next two
            # characters of the id, followed by the full id.
            image_url = f"http://image.gsshop.com/image/{prd_id_str[0:2]}/{prd_id_str[2:4]}/{prd_id_str}_L1.jpg"
            futures.append(executor.submit(download_and_upload_image_to_s3, image_url, prd_id))

        # Count progress as tasks actually finish. Submitting is near-instant,
        # so counting at submission time says nothing about work done.
        for future in as_completed(futures):
            finished += 1
            try:
                if future.result() is not False:
                    succeeded += 1
            except Exception as e:
                print(f"Unexpected error in upload task: {e}")

            if report_every and finished % report_every == 0:
                print(f"Processed {finished}/{total_count} prd_ids")

    print(f"Total successful uploads: {succeeded}")
    return succeeded

In [5]:
# Location of the sample dataset CSV (a 1% sample, going by its file name).
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run elsewhere unless the file exists at this exact location.
data_route = r"C:\workspace\project_final\data\dataset_sample_1percent.csv"

# Read the CSV as UTF-8 into the DataFrame that the processing cell consumes.
data = pd.read_csv(filepath_or_buffer=data_route, encoding="utf-8")

In [6]:
# Run the download-and-upload pass over the loaded dataset. The call returns
# only after the thread pool inside process_images has shut down, so the
# message below is printed once all submitted tasks have finished.
process_images(data)
print("Image scraping completed")

Processed 1000/159179 prd_ids
Processed 2000/159179 prd_ids
Processed 3000/159179 prd_ids
Processed 4000/159179 prd_ids
Processed 5000/159179 prd_ids
Processed 6000/159179 prd_ids
Processed 7000/159179 prd_ids
Processed 8000/159179 prd_ids
Processed 9000/159179 prd_ids
Processed 10000/159179 prd_ids
Processed 11000/159179 prd_ids
Processed 12000/159179 prd_ids
Processed 13000/159179 prd_ids
Processed 14000/159179 prd_ids
Processed 15000/159179 prd_ids
Processed 16000/159179 prd_ids
Processed 17000/159179 prd_ids
Processed 18000/159179 prd_ids
Processed 19000/159179 prd_ids
Processed 20000/159179 prd_ids
Processed 21000/159179 prd_ids
Processed 22000/159179 prd_ids
Processed 23000/159179 prd_ids
Processed 24000/159179 prd_ids
Processed 25000/159179 prd_ids
Processed 26000/159179 prd_ids
Processed 27000/159179 prd_ids
Processed 28000/159179 prd_ids
Processed 29000/159179 prd_ids
Processed 30000/159179 prd_ids
Processed 31000/159179 prd_ids
Processed 32000/159179 prd_ids
Processed 33000/1

TypeError: argument of type 'NoneType' is not iterable