In [1]:
# Install the libraries this notebook needs: Selenium, webdriver-manager and the MySQL connector.
# NOTE(review): bs4 (BeautifulSoup) is imported further down but is not installed by this cell —
# presumably it is already present in the environment; confirm.
!pip install selenium webdriver-manager
!pip install mysql-connector-python



In [2]:
# Simple crawling example using Selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from bs4 import BeautifulSoup
import os
import json

import mysql.connector
from mysql.connector import Error

def setup_driver():
    """
    Build a Chrome WebDriver whose driver binary is supplied by webdriver-manager.

    The value returned by ``ChromeDriverManager().install()`` is wrapped in a
    ``Service`` and handed to ``webdriver.Chrome``.

    :return: the newly created ``webdriver.Chrome`` instance
    """
    driver_location = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_location))

def login(driver, url, user_id, password):
    """
    Log in to the site through the page currently loaded in the driver.

    Clicks the login link in the header, types the credentials into the
    login form and clicks the submit image. The caller must already have
    navigated to a page that contains the header link.

    :param driver: Selenium WebDriver object
    :param url: login page URL (accepted for the caller's convenience; this
        function never reads it and performs no navigation of its own)
    :param user_id: user ID typed into the ID field
    :param password: password typed into the password field
    """
    # Open the login form from the header menu.
    driver.find_element(By.CSS_SELECTOR, '#hdr > ul > li:nth-child(1) > a').click()

    # Fill in the ID first, then the password.
    driver.find_element(
        By.CSS_SELECTOR, '#login > div > form > fieldset > ul > li.mid > input'
    ).send_keys(user_id)
    driver.find_element(
        By.CSS_SELECTOR, '#login > div > form > fieldset > ul > li.mpasswd > input'
    ).send_keys(password)

    # The submit control is an image inside a link.
    driver.find_element(By.CSS_SELECTOR, '#login > div > form > fieldset > a > img').click()

def get_html_from_urls(driver, url_list):
    """
    Load each URL in the driver and collect the resulting page source.

    A URL whose load (or page-source read) raises is reported with a printed
    message and skipped; the remaining URLs are still processed.

    :param driver: Selenium WebDriver object
    :param url_list: iterable of URLs to load
    :return: list of ``(html, url)`` tuples, in input order, for the URLs
        that loaded without an exception
    """
    pages = []

    for target in url_list:
        try:
            driver.get(target)
            markup = driver.page_source
        except Exception as exc:
            # Best effort: one failing page must not abort the whole crawl.
            print(f"Error loading URL {target}: {exc}")
        else:
            pages.append((markup, target))

    return pages

def extract_product_data(html, product_url, base_url):
    """
    Parse one product detail page into a dictionary.

    Apart from ``url``, which is always set, a key is present in the result
    only when its element was found in the page.

    :param html: HTML source of the product page
    :param product_url: URL the HTML was loaded from; stored under ``url``
    :param base_url: base URL used to resolve the product image ``src``
    :return: dict with ``url`` and, when found, ``image_url``, ``title``,
        ``origin``, ``product_number``, ``shipping_method``, ``shipping_cost``,
        ``price``, ``option`` (list of option texts), ``info_grid`` and
        ``detail_info`` (both prettified HTML strings)
    """
    # Function-scope import so this block needs no new file-level import.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')

    # URL of the page being parsed.
    product_data = {'url': product_url}

    # Product image. urljoin (rather than string concatenation) keeps an
    # absolute src intact and inserts the missing "/" for a relative one.
    img_element = soup.select_one('#lens_img')
    if img_element is not None and 'src' in img_element.attrs:
        product_data['image_url'] = urljoin(base_url, img_element['src'])

    # Plain-text fields, listed in the order they are added to the result.
    # The option-table rows are addressed by position (nth-child), so a page
    # whose rows are ordered differently will yield mislabeled values.
    text_selectors = (
        ('title', '#form1 > div > h3'),
        ('origin', '#form1 > div > div.table-opt > table > tbody > tr:nth-child(1) > td > div'),
        ('product_number', '#form1 > div > div.table-opt > table > tbody > tr:nth-child(2) > td > div'),
        ('shipping_method', '#form1 > div > div.table-opt > table > tbody > tr:nth-child(3) > td > div'),
        ('shipping_cost', '#form1 > div > div.table-opt > table > tbody > tr:nth-child(4) > td > div'),
        ('price', '#form1 > div > div.table-opt > table > tbody > tr:nth-child(5) > td.price > div'),
    )
    for key, selector in text_selectors:
        element = soup.select_one(selector)
        if element is not None:
            product_data[key] = element.get_text(strip=True)

    # Option names: the text of every <option> inside the option <select>.
    option_element = soup.select_one(
        '#form1 > div > div.table-opt > table > tbody > tr:nth-child(6) > td > div > dl > dd > select'
    )
    if option_element is not None:
        product_data['option'] = [
            option.get_text(strip=True) for option in option_element.find_all('option')
        ]

    # Product info grid and detail section are kept as prettified HTML.
    html_selectors = (
        ('info_grid', '#productWrap2'),
        ('detail_info', '.prd-detail'),
    )
    for key, selector in html_selectors:
        element = soup.select_one(selector)
        if element is not None:
            product_data[key] = element.prettify()

    return product_data

def create_connection(host_name, user_name, user_password, db_name):
    """
    Open a connection to a MySQL database.

    A connector ``Error`` raised while connecting is printed, not propagated;
    in that case the function returns ``None``.

    :param host_name: host name
    :param user_name: user name
    :param user_password: user password
    :param db_name: database name
    :return: the connection object, or ``None`` when connecting failed
    """
    settings = {
        "host": host_name,
        "user": user_name,
        "passwd": user_password,
        "database": db_name,
    }
    try:
        connection = mysql.connector.connect(**settings)
    except Error as e:
        print(f"The error '{e}' occurred")
        return None

    print("MySQL Database connection successful")
    return connection

def insert_product(connection, product_data):
    """
    Insert one scraped product into the PRODUCTS_DATA table and commit.

    Connector errors are printed rather than raised, matching the other
    database helper in this file. The cursor is always closed, whether the
    insert succeeds or fails.

    :param connection: open connection object, or ``None`` (as returned by a
        failed ``create_connection``), in which case nothing is inserted
    :param product_data: product dictionary; ``url`` is required, every other
        key is optional and is stored as NULL when missing (``option``
        becomes an empty string)
    :raises KeyError: if ``product_data`` has no ``url`` key
    """
    if connection is None:
        # create_connection() returns None on failure; report instead of
        # crashing with AttributeError on connection.cursor().
        print("No database connection available; product was not inserted")
        return

    insert_query = """
    INSERT INTO PRODUCTS_DATA (product_title, product_url, product_price, product_num, thumbnail_url, detail_page_url, origin, detail_info, prd_options, shipping_info)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    # NOTE(review): 'detail_info' is written to the detail_page_url column and
    # 'info_grid' to the detail_info column. This looks like a column/value
    # mismatch, but the table schema is not visible here, so the original
    # mapping is kept — confirm against the schema.
    values = (
        product_data.get('title'),
        product_data['url'],
        product_data.get('price'),
        product_data.get('product_number'),
        product_data.get('image_url'),
        product_data.get('detail_info'),
        product_data.get('origin'),
        product_data.get('info_grid'),
        ','.join(product_data.get('option', [])),  # flatten the option list into one string
        product_data.get('shipping_cost')
    )

    cursor = None
    try:
        # Acquired inside the try so a connector error here is reported too.
        cursor = connection.cursor()
        cursor.execute(insert_query, values)
        connection.commit()
        print("Product inserted successfully")
    except Error as e:
        print(f"The error '{e}' occurred")
    finally:
        if cursor is not None:
            cursor.close()

def save_data_files(data_list, output_directory):
    """
    Write each item of ``data_list`` to its own JSON file.

    Files are named ``product_1.json``, ``product_2.json``, ... following the
    order of ``data_list``; an existing file with the same name is overwritten.
    The directory (including missing parents) is created when needed, even if
    ``data_list`` is empty.

    :param data_list: sequence of JSON-serializable objects
    :param output_directory: directory the files are written to
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    for number, data in enumerate(data_list, start=1):
        target_path = os.path.join(output_directory, f"product_{number}.json")
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(target_path, "w", encoding="utf-8") as file:
            json.dump(data, file, ensure_ascii=False, indent=4)

In [3]:
if __name__ == "__main__":
    # Entry-point cell: log in, scrape the listed product pages, store the
    # results in MySQL and dump them as JSON files.
    import getpass  # local import: only this cell needs it

    # Secrets are never kept in the notebook. Each value is read from an
    # environment variable and, when that is unset, asked for interactively.
    # Any password that was previously committed in this cell should be
    # treated as compromised and rotated.
    user_id = os.environ.get("SOHOSTAR_USER_ID") or input("sohostar user id: ")
    user_password = os.environ.get("SOHOSTAR_PASSWORD") or getpass.getpass("sohostar password: ")

    # Database settings; the non-secret ones default to the original values.
    host_name = os.environ.get("SELPER_DB_HOST", "localhost")
    host_id = os.environ.get("SELPER_DB_USER", "root")
    host_pw = os.environ.get("SELPER_DB_PASSWORD") or getpass.getpass("MySQL password: ")
    db_name = os.environ.get("SELPER_DB_NAME", "selper")

    base_url = 'http://www.sohostar.co.kr'
    # Defaults to the original machine-specific folder; override via the
    # environment when running elsewhere.
    output_directory = os.environ.get("SELPER_OUTPUT_DIR", 'C:/Users/inyoung/Desktop/check')

    url_list = [
        'http://www.sohostar.co.kr/shop/shopdetail.html?branduid=1974609&search=&xcode=023&mcode=009&scode=&special=3&GfDT=Z213UA%3D%3D',
        'http://www.sohostar.co.kr/shop/shopdetail.html?branduid=12240905&xcode=023&mcode=003&scode=&type=X&sort=regdate&cur_code=023&search=&GfDT=amV9',
        'http://www.sohostar.co.kr/shop/shopdetail.html?branduid=2045334&xcode=024&mcode=008&scode=&type=X&sort=regdate&cur_code=024&search=&GfDT=Z2d3WQ%3D%3D'
    ]

    driver = setup_driver()
    connection = None
    try:
        connection = create_connection(host_name, host_id, host_pw, db_name)

        # login() works on the page already loaded in the driver, so open the
        # site's index page first.
        driver.get('http://www.sohostar.co.kr/index.html')
        login(driver, 'https://www.sohostar.co.kr', user_id, user_password)

        # Collect the HTML of every product page, then parse each one.
        all_html = get_html_from_urls(driver, url_list)
        all_data = [extract_product_data(html, url, base_url) for html, url in all_html]

        # create_connection() returns None on failure; still write the JSON
        # files in that case instead of crashing on the first insert.
        if connection is not None:
            for product in all_data:
                insert_product(connection, product)
        else:
            print("Skipping database insert: no MySQL connection")

        save_data_files(all_data, output_directory)
    finally:
        # Always release the browser and the database connection.
        driver.quit()
        if connection is not None:
            connection.close()

MySQL Database connection successful
Product inserted successfully
Product inserted successfully
Product inserted successfully
