In [1]:
# Always the first cell: make the sibling ../src directory importable.
import sys
import os

# Only append when missing, so re-running this cell does not pile up
# duplicate entries in sys.path.
_src_dir = os.path.abspath("../src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [2]:
from db_client import RDSClient

def main():
    """Create an RDSClient, run a sample SELECT, and print every returned row."""
    # 1. Instantiate the DB client (per the original note, the pool is created here)
    client = RDSClient()

    # 2. Sample SELECT query
    print("--- 사용자 조회 ---")
    rows = client.execute("SELECT * FROM crawl_target_id LIMIT 3;")
    # A falsy result (None / empty) simply prints nothing.
    for row in rows or []:
        print(row)


if __name__ == "__main__":
    main()

DB HOST: musinsa-data.c07kuo6ug98z.us-east-1.rds.amazonaws.com
DB USER: admin
DB PASSWORD: ********
DB NAME: musinsa
DB PORT: 3306
✅ DB Engine (Pool) 생성 완료
--- 사용자 조회 ---
{'product_id': 70061, 'is_crawled': 0, 'created_at': datetime.datetime(2025, 11, 27, 1, 44)}
{'product_id': 70063, 'is_crawled': 0, 'created_at': datetime.datetime(2025, 11, 27, 1, 44)}
{'product_id': 70067, 'is_crawled': 0, 'created_at': datetime.datetime(2025, 11, 27, 1, 44)}


In [None]:
# !pip install tqdm

In [2]:
### 상품 id 크롤링

import requests
from tqdm import tqdm
import time
import csv

In [None]:

# Request headers (mirror what the browser sends)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/json, text/plain, */*",
    "Referer": "https://www.musinsa.com/category/003?gf=M&sortCode=SALE_ONE_YEAR_COUNT"
}

all_ids = set()
page_size = 60  # value sent as the API's `size` parameter
# Upper bound on the number of pages to request. The original estimate was
# roughly 77,000 items; 77,000 / 60 is about 1,284 pages, so 1350 leaves headroom.
# The loop stops earlier as soon as a page comes back empty.
total_pages_estimate = 1350

# Write IDs to the CSV as they are found; the file is flushed after every page
# so partial results are on disk even if the run is aborted.
with open("musinsa_bottom_ids.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["goodsNo"])  # header row

    # tqdm shows progress over the page range
    for page in tqdm(range(1, total_pages_estimate + 1), desc="상품 ID 수집"):
        url = f"https://api.musinsa.com/api2/dp/v1/plp/goods?gf=M&sortCode=SALE_ONE_YEAR_COUNT&category=003&size={page_size}&caller=CATEGORY&page={page}&seen=0&seenAds="
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"\nRequest Error at page {page}: {e}")
            break

        # Decode in its own try block: in recent requests releases the JSON
        # decode error is also a RequestException, so a shared try would route
        # it to the handler above and the body preview below would never print.
        try:
            data = response.json()
        except ValueError:
            print(f"\nJSON Decode Error at page {page}")
            print(response.text[:200])
            break

        # Tolerate a missing/null "data" or "list" entry instead of raising.
        payload = data.get("data") if isinstance(data, dict) else None
        items = (payload.get("list") or []) if isinstance(payload, dict) else []
        if not items:
            print(f"\n마지막 페이지 도달: page={page}")
            break

        for item in items:
            goods_no = item.get("goodsNo")
            if goods_no and goods_no not in all_ids:
                all_ids.add(goods_no)
                writer.writerow([goods_no])

        f.flush()  # persist this page's rows before moving on
        time.sleep(0.25)  # throttle requests to limit server load

print(f"\n총 수집된 상품 ID: {len(all_ids)}")
print("CSV 파일로 저장 완료: musinsa_bottom_ids.csv")
     

In [None]:
# !pip install webdriver-manager
# !pip install pymysql

In [17]:
import re
import csv
import json
import pymysql
from pymysql.err import OperationalError
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import random

In [None]:
# ==========================================
# DB settings
# ==========================================
# Connection settings are read from environment variables so that credentials
# are not stored in the notebook. Non-secret values fall back to the defaults
# this notebook used before; the password has no default on purpose.
import os

DB_HOST = os.environ.get("DB_HOST", "musinsa-data.c07kuo6ug98z.us-east-1.rds.amazonaws.com")
DB_USER = os.environ.get("DB_USER", "admin")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "")
DB_NAME = os.environ.get("DB_NAME", "musinsa")
DB_PORT = int(os.environ.get("DB_PORT", "3306"))

if not DB_PASSWORD:
    # Surface the misconfiguration early instead of at the first query.
    print("DB_PASSWORD environment variable is not set; the database connection is likely to fail.")

# ==========================================
# DB 연결
# ==========================================
def get_connection():
    """Open and return a new pymysql connection using the module-level DB_* settings."""
    connect_kwargs = {
        "host": DB_HOST,
        "user": DB_USER,
        "password": DB_PASSWORD,
        "db": DB_NAME,
        "port": DB_PORT,
        "charset": "utf8mb4",
    }
    return pymysql.connect(**connect_kwargs)

conn = get_connection()
cursor = conn.cursor()

# ==========================================
# Read the product_bottom column list from the database
# ==========================================
cursor.execute("SHOW COLUMNS FROM product_bottom")
db_cols = [row[0] for row in cursor.fetchall()]

print("DB 테이블 컬럼:", db_cols)
print("총 컬럼 수:", len(db_cols))

# One %s placeholder per column; values are bound by the driver at execute time.
placeholders = ", ".join(["%s"] * len(db_cols))
insert_sql = f"""
INSERT INTO product_bottom ({", ".join(db_cols)})
VALUES ({placeholders})
"""

# ==========================================
# Read product IDs from the CSV
# ==========================================
goods_ids = []
with open("musinsa_bottom_ids.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerate an empty file
    for row in reader:
        # Skip blank lines instead of failing with IndexError.
        if row and row[0]:
            goods_ids.append(row[0])

# ==========================================
# Resume support: skip products that are already stored
# ==========================================
# Using the row count as a list index goes wrong as soon as one product fails
# (it is skipped, so the count lags behind the real position and a restart
# re-inserts saved products). Compare by ID instead. The crawl inserts the
# product ID as the first column value, so db_cols[0] is used as the ID column.
cursor.execute(f"SELECT {db_cols[0]} FROM product_bottom")
saved_rows = cursor.fetchall()
saved_count = len(saved_rows)
saved_ids = {str(row[0]) for row in saved_rows}

print(f"이미 저장된 상품 수: {saved_count}개")
print(f"전체 상품 수: {len(goods_ids)}")

# Keep only the IDs that still need crawling; the crawl then starts at index 0.
goods_ids = [pid for pid in goods_ids if pid not in saved_ids]
total_count = len(goods_ids)
print(f"남은 상품 수: {total_count}")

start_index = 0
print(f"이번에 시작할 인덱스: {start_index}")

# ==========================================
# Selenium 설정
# ==========================================
# Headless Chrome: the path returned by ChromeDriverManager().install() is
# handed to the Selenium Service that launches the driver.
options = Options()
options.add_argument("--headless")
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# ==========================================
# 공통 파싱 함수
# ==========================================
def extract_int(text):
    """Return the digits found in ``text`` as an int.

    Every non-digit character (commas, currency marks, percent signs, ...) is
    removed before conversion.

    Returns:
        The parsed int, or None when ``text`` is None/empty or contains no
        digits at all (previously that case raised ValueError from ``int("")``).
    """
    if not text:
        return None
    digits = re.sub(r"[^0-9]", "", text)
    return int(digits) if digits else None

def extract_number(text):
    """Normalize a count label to a string of digits.

    A number followed by "만" (ten thousand) is expanded, e.g. "1.2만" -> "12000".
    Otherwise every non-digit character is stripped, e.g. "1,234" -> "1234".

    Returns:
        None when ``text`` is None; otherwise a string of digits, which is
        empty when ``text`` contains no usable number.
    """
    if text is None:
        return None
    if "만" in text:
        # Pick out just the number in front of "만" so surrounding text such as
        # "1.5만개" or "후기 2.3만+" does not break the conversion.
        match = re.search(r"([0-9]+\.?[0-9]*|\.[0-9]+)\s*만", text.replace(",", ""))
        if match:
            # round() instead of int(): a float product can land a hair below
            # the intended integer, and int() would truncate it.
            return str(round(float(match.group(1)) * 10000))
    return re.sub(r"[^0-9]", "", text)

def clean_product_name(name):
    """Remove everything from the first hyphen to the end of each line, then trim whitespace."""
    hyphen_tail = re.compile(r"-.*")
    without_tail = hyphen_tail.sub("", name)
    return without_tail.strip()

# ==========================================
# 크롤링 시작
# ==========================================
print("크롤링 시작!")

# The whole crawl runs inside try/finally so the browser and the DB connection
# are released even when the run is interrupted (e.g. KeyboardInterrupt).
try:
    for idx in range(start_index, total_count):

        gid = goods_ids[idx]
        print(f"[{idx+1}/{total_count}] 상품 {gid} 처리 중...")

        try:
            url = f"https://www.musinsa.com/products/{gid}"
            driver.get(url)
            # Randomized pause between page loads
            time.sleep(random.uniform(0.8, 1.5))
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # -----------------------------
            # 1. Product name
            # -----------------------------
            name_tag = soup.select_one("div[class*='GoodsName__Wrap'] span")
            product_name = clean_product_name(name_tag.get_text(strip=True)) if name_tag else None

            # 2. Brand (a trailing "<number>만" fragment is stripped from the label)
            brand_tag = soup.select_one("div[class*='Brand__Wrap'] span[class*='BrandName']")
            brand = re.sub(r"[0-9\.]+만", "", brand_tag.get_text(strip=True)).strip() if brand_tag else None

            # 3-5. List price / sale price / discount rate
            normal_price_tag = soup.select_one("span.line-through")
            normal_price = extract_int(normal_price_tag.get_text()) if normal_price_tag else None
            sale_price_tag = soup.select_one("span[class*='CalculatedPrice']")
            sale_price = extract_int(sale_price_tag.get_text()) if sale_price_tag else None
            discount_tag = soup.select_one("span[class*='DiscountRate']")
            discount = extract_int(discount_tag.get_text()) if discount_tag else None

            # 6. Categories (first two breadcrumb links)
            cats = soup.select("div[class*='Category__Wrap'] a")
            upper_category = cats[0].get_text(strip=True) if len(cats) > 0 else None
            lower_category = cats[1].get_text(strip=True) if len(cats) > 1 else None

            # 7. Gender: 1 = male, 2 = female, 0 = unisex/unknown.
            # Assumes the second layout box holds the gender - TODO confirm.
            gender = 0
            layout_boxes = soup.select("dl[class*='Layout__Wrap'] div[class*='Layout__Box']")
            if len(layout_boxes) > 1:
                dd = layout_boxes[1].find("dd")
                if dd:
                    gtext = dd.get_text(strip=True)
                    gender_map = {"남": 1, "여": 2, "공용": 0, "남녀공용": 0}
                    gender = gender_map.get(gtext, 0)

            # Cumulative sales: value of the box whose label is exactly "누적판매"
            cumulative = ""
            for box in layout_boxes:
                dt = box.find("dt")
                dd = box.find("dd")
                if dt and dd and dt.get_text(strip=True) == "누적판매":
                    cumulative = dd.get_text(strip=True)

            # 8. Rating
            rating_tag = soup.select_one("div[class*='Review__Wrap'] span")
            rating = float(rating_tag.get_text(strip=True)) if rating_tag else None

            # 9. Review count
            review_cnt_tag = soup.select_one("div[class*='Review__Wrap'] span:nth-of-type(2)")
            review_cnt = int(extract_number(review_cnt_tag.get_text())) if review_cnt_tag else 0

            # 10. Like (wish) count
            like_tag = soup.select_one("div[class*='Like__Container'] span")
            like_cnt = extract_number(like_tag.get_text()) if like_tag else None

            # 11. Style tags, comma-joined
            tag_items = soup.select("ul[class*='ProductTags__List'] span")
            styles = ",".join([t.get_text(strip=True) for t in tag_items])

            # 12. Measured-size JSON (placeholder: always an empty list here)
            size_json = "[]"

            # 13. Fit / season JSON (placeholder: both lists are empty here)
            fit_json = json.dumps({"핏": [], "계절감": []}, ensure_ascii=False)

            # -----------------------------
            # INSERT values; the order must match the column order of db_cols
            # -----------------------------
            values = (
                gid, product_name, brand, normal_price, sale_price,
                upper_category, lower_category, gender, rating,
                like_cnt, review_cnt, size_json, discount,
                fit_json, cumulative, styles
            )

            cursor.execute(insert_sql, values)
            conn.commit()

        except Exception as e:
            # Best effort: log the failed product and continue with the next one.
            print(f"오류: {gid}, 이유: {e}")
            with open("error_log.txt", "a", encoding="utf-8") as log:
                log.write(f"{gid}, {e}\n")

            # Reconnect if the DB connection was dropped. Catch Exception only,
            # so an interrupt here is not mistaken for a dead connection.
            try:
                conn.ping(reconnect=True)
            except Exception:
                conn = get_connection()
                cursor = conn.cursor()

    print("모든 작업 완료!")

finally:
    # Close the DB connection even if shutting down the browser fails.
    try:
        driver.quit()
    finally:
        conn.close()

DB 테이블 컬럼: ['product_id', 'product_name', 'brand', 'original_price', 'sale_price', 'upper_category', 'lower_category', 'gender', 'rating', 'wish_count', 'review_count', 'size_info', 'discount_rate', 'fit_season', 'cumulative_sales', 'style']
총 컬럼 수: 16
이미 저장된 상품 수: 0개
전체 상품 수: 59914
이번에 시작할 인덱스: 0
크롤링 시작!
[1/59914] 상품 1844582 처리 중...
[2/59914] 상품 1926048 처리 중...
[3/59914] 상품 2574822 처리 중...
[4/59914] 상품 1551840 처리 중...
[5/59914] 상품 3791988 처리 중...
[6/59914] 상품 1926034 처리 중...
[7/59914] 상품 2112061 처리 중...
[8/59914] 상품 1551839 처리 중...
[9/59914] 상품 1149329 처리 중...
[10/59914] 상품 4274049 처리 중...
[11/59914] 상품 750908 처리 중...
[12/59914] 상품 2112059 처리 중...
[13/59914] 상품 4757347 처리 중...
[14/59914] 상품 1168906 처리 중...
[15/59914] 상품 3228764 처리 중...
[16/59914] 상품 3231055 처리 중...
[17/59914] 상품 2578996 처리 중...
[18/59914] 상품 3674341 처리 중...
[19/59914] 상품 3663072 처리 중...
[20/59914] 상품 1735427 처리 중...
[21/59914] 상품 4993414 처리 중...
[22/59914] 상품 2744549 처리 중...
[23/59914] 상품 1924274 처리 중...
[24/59914] 상품

KeyboardInterrupt: 