# In [None]:  (Jupyter notebook cell marker left over from export)
import csv
import json
import os
import random
import re
import time

import pymysql
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

##### Progress note: done up to 5010 so far...

# ==========================================
# 0. Settings
# ==========================================
# Each run-specific or deployment-specific value can be overridden through an
# environment variable; when the variable is unset the previous hard-coded
# value is used, so existing invocations behave exactly as before.
CSV_PATH = "musinsa_hat_ids.csv"   # CSV containing the product id column
CATEGORY_ID = 101001        # PLP category used for the img_url lookup
START_IDX = int(os.environ.get("MUSINSA_START_IDX", "5010"))   # start index
END_IDX = int(os.environ.get("MUSINSA_END_IDX", "10000"))      # end index

DB_HOST = os.environ.get(
    "MUSINSA_DB_HOST", "musinsa-db.c07kuo6ug98z.us-east-1.rds.amazonaws.com"
)
DB_NAME = os.environ.get("MUSINSA_DB_NAME", "musinsa")
DB_USER = os.environ.get("MUSINSA_DB_USER", "admin")
# NOTE(review): this default is a real credential committed to source.
# Rotate it and supply the password only through MUSINSA_DB_PASSWORD.
DB_PASSWORD = os.environ.get("MUSINSA_DB_PASSWORD", "qkqajrwkrnrnrn9_")
DB_PORT = int(os.environ.get("MUSINSA_DB_PORT", "3306"))

# ==========================================
# 1. Load product ids
# ==========================================
# "utf-8-sig" reads plain UTF-8 unchanged but also strips a leading BOM.
# With plain "utf-8" a BOM-prefixed CSV yields a first header named
# "\ufeffgoodsNo", so no row would match "goodsNo" and zero ids would load.
product_ids = []
with open(CSV_PATH, newline="", encoding="utf-8-sig") as f:
    for row in csv.DictReader(f):
        # Short rows give None for missing columns; blank cells are skipped.
        raw_id = (row.get("goodsNo") or "").strip()
        if raw_id:
            product_ids.append(int(raw_id))

total_count = len(product_ids)

# Clamp the reported range to the ids that actually exist so the banner never
# shows a negative count or an end position past the end of the list.
run_end = min(END_IDX, total_count)
run_count = max(0, run_end - START_IDX)

print("=" * 70)
print(f"총 product_id 수        : {total_count}")
print(f"이번 실행 처리 범위     : {START_IDX + 1} ~ {run_end}")
print(f"이번 실행 처리 개수     : {run_count}")
print("=" * 70)

# ==========================================
# 2. PLP API -> goodsNo : thumbnail mapping
# ==========================================
def fetch_goods_thumbnail_map(category_id, max_pages=200, size=60):
    """Collect a ``goodsNo -> thumbnail`` mapping from the PLP goods API.

    Pages are requested one at a time, starting at page 1. Collection stops
    at the first of: ``max_pages`` reached, a page with no goods, or a failed
    request. A failure (network error, non-200 status, unparsable body) ends
    collection early and returns whatever was gathered so far, the same
    best-effort policy the 403 case has always used.

    Args:
        category_id: Value sent as the ``category`` query parameter.
        max_pages: Maximum number of pages to request.
        size: Value sent as the ``size`` query parameter (items per page).

    Returns:
        dict mapping ``int(goodsNo)`` to the item's ``thumbnail`` value.
    """
    api_url = "https://api.musinsa.com/api2/dp/v1/plp/goods"
    goods_map = {}

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": "application/json, text/plain, */*",
        "Referer": "https://www.musinsa.com/",
    }

    for page in range(1, max_pages + 1):
        params = {
            "gf": "M",
            "sortCode": "SALE_ONE_YEAR_COUNT",
            "category": category_id,
            "size": size,
            "testGroup": "",
            "caller": "CATEGORY",
            "page": page,
            "seen": 0,
            "seenAds": "",
        }

        try:
            res = requests.get(api_url, params=params, headers=headers, timeout=10)
        except requests.RequestException as e:
            # Timeouts / connection errors: keep what we have, stop paging.
            print(f"⛔ PLP API 요청 실패 (page={page}): {e}")
            break

        if res.status_code == 403:
            print(f"⛔ PLP API 403 차단 (page={page})")
            break
        if res.status_code != 200:
            print(f"⛔ PLP API 응답 오류 (status={res.status_code}, page={page})")
            break

        try:
            payload = res.json()
        except ValueError:
            # The JSON decode errors raised by requests subclass ValueError.
            print(f"⛔ PLP API JSON 파싱 실패 (page={page})")
            break

        # Tolerate a missing or null "data" / "list" instead of crashing.
        data = payload.get("data") if isinstance(payload, dict) else None
        goods_list = data.get("list") if isinstance(data, dict) else None
        if not goods_list:
            break

        for g in goods_list:
            goods_no = g.get("goodsNo")
            thumbnail = g.get("thumbnail")
            if goods_no and thumbnail:
                goods_map[int(goods_no)] = thumbnail

        # Short pause between pages.
        time.sleep(0.3)

    print(f"누적 img_url: {len(goods_map)}")
    return goods_map

print("PLP API에서 img_url 수집 중...")
goods_thumbnail_map = fetch_goods_thumbnail_map(CATEGORY_ID)
print(f"img_url 매핑 수: {len(goods_thumbnail_map)}")

# ==========================================
# 3. Selenium setup
# ==========================================
options = Options()
options.add_argument("--headless")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)
# Give up on a page that has not finished loading within 20 seconds.
driver.set_page_load_timeout(20)

# ==========================================
# 4. DB connection
# ==========================================
# The browser is already running at this point, so if the database cannot be
# reached it must be shut down explicitly before the error propagates;
# otherwise the headless Chrome process is left behind.
try:
    conn = pymysql.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASSWORD,
        db=DB_NAME,
        port=DB_PORT,
        charset="utf8mb4"
    )
    cursor = conn.cursor()
except Exception:
    driver.quit()
    raise

# Parameterized INSERT; the 17 placeholders follow the column order listed.
insert_sql = """
INSERT INTO product_new (
    product_id, product_name, brand,
    original_price, sale_price,
    upper_category, lower_category,
    gender, rating,
    wish_count, review_count,
    size_info, discount_rate,
    fit_season, cumulative_sales,
    style, img_url
) VALUES (
    %s, %s, %s, %s, %s,
    %s, %s, %s, %s,
    %s, %s, %s, %s,
    %s, %s, %s, %s
)
"""

# ==========================================
# 5. Shared parsing helpers
# ==========================================
def extract_int(text):
    """Return the integer formed by the ASCII digits in *text*.

    Every non-digit character (currency symbols, commas, "%", whitespace)
    is discarded before conversion.

    Args:
        text: String to parse; may be None or empty.

    Returns:
        The parsed int, or None when *text* is empty/None or contains no
        digits at all (previously the no-digit case raised ValueError).
    """
    if not text:
        return None
    digits = re.sub(r"[^0-9]", "", text)
    return int(digits) if digits else None

def extract_number(text):
    """Parse a count that may use the "만" (x10,000) suffix.

    Args:
        text: String such as "1.2만" or "1,234"; may be None or empty.

    Returns:
        For text containing "만": the number before it multiplied by 10,000,
        rounded to the nearest int, or None when that part is not a valid
        number. Otherwise: the int formed by the digits in *text*, or None
        when there are none. None for empty/None input.
    """
    if not text:
        return None
    if "만" in text:
        # Drop the unit and any thousands separators before converting.
        numeric_part = text.replace("만", "").replace(",", "")
        try:
            # round() rather than int(): the float product can land just
            # below the intended integer (0.57 * 10000 -> 5699.999...), and
            # truncation would then be off by one.
            return round(float(numeric_part) * 10000)
        except (ValueError, OverflowError):
            # Not a number, or inf/nan which cannot be rounded to an int.
            return None
    cleaned = re.sub(r"[^0-9]", "", text)
    return int(cleaned) if cleaned else None

# Matches a hyphen together with everything after it on the same line.
_HYPHEN_TAIL = re.compile(r"-.*")


def clean_product_name(name):
    """Return *name* with each hyphen-and-rest-of-line removed, then trimmed.

    Args:
        name: Raw product name string.

    Returns:
        The name without any "-..." tail, stripped of surrounding whitespace.
    """
    without_tail = _HYPHEN_TAIL.sub("", name)
    return without_tail.strip()

# ==========================================
# 6. Crawl each product page -> INSERT into the DB immediately
# ==========================================
# CSS classes that must all be present on the <span> holding the rating.
_RATING_CLASSES = {"text-body_13px_med", "text-black", "font-pretendard"}
# Page gender label -> stored code; any other label is stored as 0.
_GENDER_CODES = {"남": 1, "여": 2, "공용": 0, "남녀공용": 0}


def _optional_text(soup, selector, strip=False):
    """Return the text of the first match for *selector*, or None if absent."""
    tag = soup.select_one(selector)
    return tag.get_text(strip=strip) if tag is not None else None


def _required_text(soup, selector):
    """Return the text of the first match for *selector*.

    Raises:
        LookupError: when nothing matches, so the product is reported as
            failed with a readable reason instead of an AttributeError.
    """
    text = _optional_text(soup, selector)
    if text is None:
        raise LookupError(f"required element not found: {selector}")
    return text


def _parse_prices(soup):
    """Return ``(discount_rate, original_price, sale_price)`` for the page."""
    discount_rate = extract_int(_optional_text(soup, "span[class*='DiscountRate']"))
    calculated = extract_int(_required_text(soup, "span[class*='CalculatedPrice']"))
    if discount_rate is None:
        # No discount shown: the displayed price is the original price.
        return None, calculated, None
    original_price = extract_int(_required_text(soup, "span.line-through"))
    return discount_rate, original_price, calculated


def _parse_categories(soup):
    """Return ``(upper_category, lower_category)`` from the category links."""
    upper_category = None
    lower_category = None
    cat_wrap = soup.select_one("div[class*='Category__Wrap']")
    if cat_wrap is not None:
        for link in cat_wrap.find_all("a"):
            depth = link.get("data-category-id")
            if depth == "2depth":
                upper_category = link.get_text(strip=True)
            elif depth == "3depth":
                lower_category = link.get_text(strip=True)
    return upper_category, lower_category


def _parse_info_boxes(soup):
    """Return ``(gender, cumulative_sales)`` from the dt/dd info boxes."""
    gender = 0
    cumulative_sales = None
    for box in soup.select("dl[class*='Layout__Wrap'] div[class*='Layout__Box']"):
        dt = box.find("dt")
        dd = box.find("dd")
        if not dt or not dd:
            continue
        label = dt.get_text(strip=True)
        if label == "성별":
            gender = _GENDER_CODES.get(dd.get_text(strip=True), 0)
        elif label == "누적판매":
            cumulative_sales = dd.get_text(strip=True)
    return gender, cumulative_sales


def _parse_rating(soup):
    """Return the first rating <span> that parses as a float, else None."""
    for span in soup.find_all("span"):
        if not _RATING_CLASSES.issubset(span.get("class", [])):
            continue
        try:
            return float(span.get_text(strip=True))
        except ValueError:
            # Same classes but not numeric text; keep looking.
            continue
    return None


def _build_row(product_id, soup):
    """Return the parameter tuple for ``insert_sql`` for one product page."""
    raw_name = _optional_text(soup, "div[class*='GoodsName__Wrap'] span", strip=True)
    product_name = clean_product_name(raw_name) if raw_name is not None else None
    brand = _optional_text(
        soup, "div[class*='Brand__Wrap'] span[class*='BrandName']", strip=True
    )

    discount_rate, original_price, sale_price = _parse_prices(soup)
    upper_category, lower_category = _parse_categories(soup)
    gender, cumulative_sales = _parse_info_boxes(soup)
    rating = _parse_rating(soup)

    # A page without the review element is recorded as 0 reviews.
    review_text = _optional_text(soup, "div[class*='Review__Wrap'] span:nth-of-type(2)")
    review_count = extract_number(review_text) if review_text is not None else 0
    wish_count = extract_number(_optional_text(soup, "div[class*='Like__Container'] span"))

    style = ",".join(
        tag.get_text(strip=True)
        for tag in soup.select("ul[class*='ProductTags__List'] span")
    )

    # img_url comes from the PLP API map; empty string when not found there.
    img_url = goods_thumbnail_map.get(product_id, "")

    return (
        product_id, product_name, brand,
        original_price, sale_price,
        upper_category, lower_category,
        gender, rating,
        wish_count, review_count,
        json.dumps([], ensure_ascii=False),  # size_info placeholder
        discount_rate,
        json.dumps({"핏": [], "계절감": []}, ensure_ascii=False),  # fit_season placeholder
        cumulative_sales,
        style,
        img_url,
    )


print("크롤링 + DB INSERT 시작!")

range_end = min(END_IDX, total_count)
processed_count = 0
total_target = max(0, range_end - START_IDX)
start_all = time.time()

try:
    # Start at START_IDX itself so the number of iterations equals
    # total_target. The previous START_IDX + 1 skipped one product and left
    # the progress counter one short of its denominator.
    for idx in range(START_IDX, range_end):
        product_id = product_ids[idx]
        t0 = time.time()

        try:
            driver.get(f"https://www.musinsa.com/products/{product_id}")
            # Randomized pause before reading the rendered page.
            time.sleep(random.uniform(0.8, 1.5))
            soup = BeautifulSoup(driver.page_source, "html.parser")

            cursor.execute(insert_sql, _build_row(product_id, soup))
            # Commit per product so completed rows survive a later failure.
            conn.commit()
        except Exception as e:
            # Per-product boundary: report the failure and move on.
            conn.rollback()
            processed_count += 1

            print(
                f"[진행 {processed_count}/{total_target}] "
                f"(global idx={idx}) "
                f"product_id={product_id} ❌ 실패: {e}"
            )
        else:
            elapsed = time.time() - t0
            processed_count += 1

            print(
                f"[진행 {processed_count}/{total_target}] "
                f"(global idx={idx}) "
                f"product_id={product_id} 완료 "
                f"⏱ {elapsed:.2f}s"
            )
finally:
    # ==========================================
    # 7. Shutdown
    # ==========================================
    # Runs on normal completion, Ctrl-C, or an unexpected error. The nesting
    # makes each release step run even if the previous one raises.
    try:
        driver.quit()
    finally:
        try:
            cursor.close()
        finally:
            conn.close()

print("✅ 전체 크롤링 + DB INSERT 완료")
