# In [None]:
### 상품 id 크롤링

import requests
from tqdm import tqdm
import time
import csv

# Request headers sent with every API call, mirroring what a browser sends.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/json, text/plain, */*",
    "Referer": "https://www.musinsa.com/category/003?gf=M&sortCode=SALE_ONE_YEAR_COUNT",
}

all_ids = set()  # goodsNo values collected so far; used to skip duplicates
page_size = 60  # value sent as the API's `size` query parameter
# Upper bound on the number of pages to request. Per the original estimate of
# roughly 77,000 items, 77,000 / 60 is about 1,284 pages, so 1350 leaves headroom.
total_pages_estimate = 1350

# Crawl the category listing API page by page and stream every new goodsNo to
# a CSV file, so that a run which stops early still leaves usable output.
with open("musinsa_bottom_ids.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["goodsNo"])  # header row

    # A single Session lets the underlying HTTP connection be reused across pages.
    with requests.Session() as session:
        # tqdm renders a progress bar over the page range.
        for page in tqdm(range(1, total_pages_estimate + 1), desc="상품 ID 수집"):
            url = f"https://api.musinsa.com/api2/dp/v1/plp/goods?gf=M&sortCode=SALE_ONE_YEAR_COUNT&category=003&size={page_size}&caller=CATEGORY&page={page}&seen=0&seenAds="

            # Transport/HTTP failures and JSON decoding failures are handled in
            # separate try blocks: recent requests releases raise a decode error
            # that is both a RequestException and a ValueError, so a combined
            # try would never reach the JSON branch (and its body preview).
            try:
                response = session.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"\nRequest Error at page {page}: {e}")
                break

            try:
                data = response.json()
            except ValueError:
                print(f"\nJSON Decode Error at page {page}")
                print(response.text[:200])
                break

            # "data" or "list" may be missing or null in the payload; treat any
            # such shape as "no items" instead of raising AttributeError.
            payload = data.get("data") if isinstance(data, dict) else None
            items = payload.get("list") if isinstance(payload, dict) else None
            if not items:
                print(f"\n마지막 페이지 도달: page={page}")
                break

            for item in items:
                goods_no = item.get("goodsNo")
                if goods_no and goods_no not in all_ids:
                    all_ids.add(goods_no)
                    writer.writerow([goods_no])

            # Push this page's rows to the OS so progress survives an abrupt stop.
            f.flush()
            time.sleep(0.25)  # throttle requests to limit load on the server

print(f"\n총 수집된 상품 ID: {len(all_ids)}")
print("CSV 파일로 저장 완료: musinsa_bottom_ids.csv")

# In [None]:
import os
import time
from urllib.parse import quote_plus

import numpy as np
from sqlalchemy import create_engine, text

# =========================================
# 0. DB configuration
# =========================================
DB_USER = "admin"
# The password is read from the environment so that no credential lives in the
# source file. NOTE(review): a password was previously committed here in plain
# text; it should be rotated.
DB_PASSWORD = os.environ.get("DB_PASSWORD")
if not DB_PASSWORD:
    raise RuntimeError(
        "DB_PASSWORD environment variable is not set; "
        "export it before running this cell."
    )
DB_HOST = "musinsa-db.c07kuo6ug98z.us-east-1.rds.amazonaws.com"  # bare hostname, no "http://" prefix
DB_PORT = 3306
DB_NAME = "musinsa_db"

t0 = time.time()

# User and password are percent-escaped so characters such as "@" or "/" in a
# credential cannot corrupt the URL.
db_url = (
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
    "?charset=utf8mb4"
)

engine = create_engine(db_url)

# create_engine() does not connect by itself; open (and immediately release)
# one connection so that the message and timing below describe a real
# connection and bad settings fail here rather than at insert time.
with engine.connect():
    pass

print(f"[DB 연결 완료] {time.time() - t0:.3f} 초")

# =========================================
# 1. INSERT / UPDATE statement
# =========================================
# Upsert into product_emb: insert a row, or overwrite both vector columns when
# the row collides with an existing key. The :name placeholders are bound from
# the parameter dictionaries supplied at execution time.
# NOTE(review): VALUES() inside ON DUPLICATE KEY UPDATE is deprecated in newer
# MySQL 8.0 releases in favour of a row alias - confirm the server version
# before changing the statement.
_UPSERT_PRODUCT_EMB = """
INSERT INTO product_emb (
    product_id,
    category_vector,
    description_vector
)
VALUES (
    :product_id,
    :category_vector,
    :description_vector
)
ON DUPLICATE KEY UPDATE
    category_vector = VALUES(category_vector),
    description_vector = VALUES(description_vector)
"""

insert_sql = text(_UPSERT_PRODUCT_EMB)

# =========================================
# 2. DataFrame -> DB parameter conversion
# =========================================
# Build one parameter dict per DataFrame row. Each vector is serialized as raw
# float32 bytes. The columns are walked in lockstep rather than with
# iterrows(), which constructs a Series for every row, and np.asarray accepts
# list-like cells as well as arrays.
# NOTE(review): the source column really is spelled "descrption_vector" in
# this code; confirm against the DataFrame before renaming it.
t0 = time.time()

records = [
    {
        "product_id": int(product_id),
        # per the original note: 200 floats * 4 bytes = 800 bytes (TODO confirm dimension)
        "category_vector": np.asarray(category_vec, dtype=np.float32).tobytes(),
        # per the original note: 768 floats * 4 bytes = 3072 bytes (TODO confirm dimension)
        "description_vector": np.asarray(description_vec, dtype=np.float32).tobytes(),
    }
    for product_id, category_vec, description_vec in zip(
        df_category_emb["product_id"],
        df_category_emb["category_vector"],
        df_category_emb["descrption_vector"],
    )
]

print(
    f"[파라미터 변환 완료] rows={len(records)} | "
    f"{time.time() - t0:.3f} 초"
)

# =========================================
# 3. Store to DB (batch)
# =========================================
# Execute the upsert once with the full list of parameter dicts inside a single
# transaction; engine.begin() commits on success and rolls back on error.
t0 = time.time()

if records:
    with engine.begin() as conn:
        conn.execute(insert_sql, records)

    print(
        f"[DB INSERT / UPDATE 완료] "
        f"{time.time() - t0:.3f} 초"
    )
else:
    # An empty parameter list would be executed as a single statement with no
    # bound values and fail, so skip the round trip entirely.
    print("[DB INSERT / UPDATE 건너뜀] 저장할 레코드가 없습니다")

# =========================================
# 4. Size verification (optional, for debugging)
# =========================================
# Report the serialized byte size of both vectors of the first row so it can be
# compared with the expected column sizes. Skipped for an empty DataFrame,
# where iloc[0] would raise IndexError.
if df_category_emb.empty:
    print("[벡터 크기 검증] 건너뜀: DataFrame이 비어 있습니다")
else:
    row0 = df_category_emb.iloc[0]

    # .nbytes of the float32 array equals the length of its tobytes() output,
    # without building throwaway byte strings.
    category_nbytes = np.asarray(row0["category_vector"], dtype=np.float32).nbytes
    description_nbytes = np.asarray(row0["descrption_vector"], dtype=np.float32).nbytes

    print(
        "[벡터 크기 검증] "
        f"category={category_nbytes} bytes, "
        f"description={description_nbytes} bytes"
    )
