In [15]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import os
import requests

from boto3 import client
import dotenv
from io import BytesIO

In [16]:
# Load environment variables from the env file one directory up.
dotenv.load_dotenv('../gs-anomaly.env')

# S3 client built from the CREDENTIALS_* and S3_REGION environment variables.
s3_client = client(
    "s3",
    region_name=os.environ.get("S3_REGION"),
    aws_secret_access_key=os.environ.get("CREDENTIALS_SECRET_KEY"),
    aws_access_key_id=os.environ.get("CREDENTIALS_ACCESS_KEY"),
)

In [17]:
# Directory path where images are to be saved.
# NOTE(review): not referenced by the other cells visible here — confirm it is still needed.
IMAGE_DIR = "./images"

In [18]:
# Local path of the CSV that supplies the product ids.
data_route = "C:\\workspace\\project_final\\data\\dataset_sample_1percent.csv"
# Read the CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_route)

In [19]:
# Selenium WebDriver setup (Chrome driver path).
# NOTE(review): `path` is assigned but not passed to webdriver.Chrome() below —
# confirm whether the explicit driver path is still needed.
path = "C:\\workspace\\project_final\\chromedriver-win64\\chromedriver.exe"
driver = webdriver.Chrome()

In [20]:
def download_and_upload_image_to_s3(url, prd_id, timeout=10):
    """Download the image at ``url`` and upload it to S3 as ``images/<prd_id>.jpg``.

    The bucket name is read from the ``S3_BUCKET`` environment variable and the
    upload goes through the module-level ``s3_client``. Failures are reported
    with ``print`` instead of being raised, so a batch run can keep going.

    Args:
        url: Address of the image to download.
        prd_id: Product id; used as the object name in the bucket.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        True when the image was downloaded and handed to S3 without an error,
        False when the download returned a non-200 status or any exception
        occurred.
    """
    try:
        # No stream=True: the whole body is read below anyway, and a
        # non-streamed request releases its connection on non-200 responses
        # as well. The timeout keeps one stalled request from hanging the run.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download image for prd_id: {prd_id} (status code: {response.status_code})")
            return False

        file_name = f"{prd_id}.jpg"
        print(f"Uploading image for prd_id: {prd_id}")

        s3_client.upload_fileobj(
            Fileobj=BytesIO(response.content),  # downloaded image bytes
            Bucket=os.getenv("S3_BUCKET"),  # target bucket name
            Key=f"images/{file_name}",  # object key inside the bucket
            ExtraArgs={'ContentType': 'image/jpeg'}  # MIME type stored with the object
        )
        return True
    except Exception as e:
        # Best effort: report the problem and let the caller move on.
        print(f"Error to upload image for prd_id: {prd_id}, Error: {e}")
        return False

In [21]:
# Crawl the product image for every entry in the prd_id column.
success = 0
try:
    for index, prd_id in enumerate(data['prd_id']):
        prd_id_str = str(prd_id)
        # Build the image URL: characters 0-1 and 2-3 of the id form the two
        # directory levels, followed by "<id>_L1.jpg".
        image_url = f"http://image.gsshop.com/image/{prd_id_str[0:2]}/{prd_id_str[2:4]}/{prd_id_str}_L1.jpg"

        # Open the URL with Selenium.
        driver.get(image_url)
        time.sleep(1)  # wait for the page to load

        # Pull the image tag out of the loaded page.
        try:
            # find_elements returns an empty list when nothing matches
            # (find_element raises instead), so the "no image" branch below
            # can actually be reached.
            images = driver.find_elements(By.TAG_NAME, 'img')
            if images:
                img_src = images[0].get_attribute('src')
                uploaded = download_and_upload_image_to_s3(img_src, prd_id)
                # Count the item unless the helper explicitly reports failure
                # with False; a helper that returns nothing is still counted.
                if uploaded is not False:
                    success += 1
            else:
                print(f"No image found for prd_id: {prd_id}")
        except Exception as e:
            print(f"Error while scraping image for prd_id: {prd_id}, Error: {e}")

        # Progress report every 1000 ids.
        if (index + 1) % 1000 == 0:
            print(f"Processed {index + 1} prd_ids")
            print(f"Successful: {success}")

    print("Image scraping completed")
finally:
    # Always shut the browser down, including when the run is interrupted
    # (e.g. KeyboardInterrupt) or an unexpected error escapes the loop.
    driver.quit()

Uploading image for prd_id: 1045363955
Image uploaded to S3: https://gs-product-bucket.s3.amazonaws.com/images/1045363955.jpg
Uploading image for prd_id: 1039712480
Image uploaded to S3: https://gs-product-bucket.s3.amazonaws.com/images/1039712480.jpg
Uploading image for prd_id: 50237029
Image uploaded to S3: https://gs-product-bucket.s3.amazonaws.com/images/50237029.jpg
Uploading image for prd_id: 1038767251
Image uploaded to S3: https://gs-product-bucket.s3.amazonaws.com/images/1038767251.jpg
Uploading image for prd_id: 1041353764
Image uploaded to S3: https://gs-product-bucket.s3.amazonaws.com/images/1041353764.jpg
Uploading image for prd_id: 1030787032
Image uploaded to S3: https://gs-product-bucket.s3.amazonaws.com/images/1030787032.jpg
Uploading image for prd_id: 1032314185
Image uploaded to S3: https://gs-product-bucket.s3.amazonaws.com/images/1032314185.jpg
Uploading image for prd_id: 51050263
Image uploaded to S3: https://gs-product-bucket.s3.amazonaws.com/images/51050263.jpg


KeyboardInterrupt: 