In [1]:
import time
import os
from pathlib import Path
from datetime import datetime

import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from sqlalchemy import text, create_engine
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

In [2]:
## Environment setup
# Database credentials must not be hardcoded in the notebook. They are read
# from the process environment; load_dotenv() first fills in any variables
# found in a local .env file (by default it does not override variables that
# are already set).
# NOTE(review): the password that used to be written in this cell has been
# exposed to anyone with access to the notebook and should be rotated.
load_dotenv()

_required_env = ("DB_HOST", "DB_USER", "DB_PASSWORD", "DB_NAME")
_missing_env = [key for key in _required_env if key not in os.environ]
if _missing_env:
    # Fail early with an actionable message instead of a bare KeyError below.
    raise RuntimeError(
        f"Missing environment variables: {', '.join(_missing_env)}. "
        "Define them in the shell or in a .env file before running this notebook."
    )

db_host = os.environ["DB_HOST"]
db_user = os.environ["DB_USER"]
db_password = os.environ["DB_PASSWORD"]
db_name = os.environ["DB_NAME"]

### 연결 체크

In [3]:
# Build the SQLAlchemy engine for the PostgreSQL database (psycopg2 driver, port 5432).
db_url = f'postgresql+psycopg2://{db_user}:{db_password}@{db_host}:5432/{db_name}'
engine = create_engine(db_url)

# Connectivity check: ask the server for its version string and print it.
with engine.connect() as conn:
    version_row = conn.execute(text("SELECT version();")).fetchone()
    print(version_row[0])

# Dispose of the engine's connection pool now that the check is done.
engine.dispose()

PostgreSQL 16.4 on x86_64-pc-linux-gnu, compiled by Debian clang version 12.0.1, 64-bit


### 스크래핑

In [15]:
def scroll_down_to_bottom(driver, max_scrolls=10, pause=2):
    """Scroll the page to the bottom until its height stops growing.

    After each scroll the function waits ``pause`` seconds and compares
    ``document.body.scrollHeight`` with the value seen before the scroll.
    It stops as soon as the height is unchanged, or after ``max_scrolls``
    scrolls, whichever comes first.

    Args:
        driver: Object exposing ``execute_script(script)`` (e.g. a Selenium
            WebDriver) for the page to scroll.
        max_scrolls: Upper bound on the number of scroll attempts.
        pause: Seconds to wait after each scroll before re-reading the height.

    Returns:
        None.
    """
    height_script = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_script)
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(pause)
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            break
        # Reuse the height just measured rather than querying the page again:
        # it saves a browser round trip and keeps the comparison baseline
        # equal to the value that was actually compared.
        last_height = new_height

**접속**

In [5]:
# Start a Chrome browser session controlled through Selenium.
driver = webdriver.Chrome()

In [6]:
# Stores to scrape, one (place_id, chef_name, store_name) row per restaurant.
# place_id is the identifier placed in the Naver Place review URL.
_store_rows = [
    (1477750254, "정지선", "티엔미미"),
    (1283188906, "히든천재", "포노부오노"),
    (1018077796, "나폴리맛피아", "비아 톨레도 파스타바"),
    (1775308300, "파브리", "파브리키친"),
    (1817207066, "철가방", "도량"),
    (1570882425, "트리플스타", "트리드"),
    (1647309508, "요리하는돌아이", "디핀"),
    (1132840035, "만찢남", "조광"),
    (280965665, "최현석", "쵸이닷"),
]
store_info_list = [
    {"place_id": place_id, "chef_name": chef_name, "store_name": store_name}
    for place_id, chef_name, store_name in _store_rows
]

In [7]:
# Choose the store to scrape in this run: the entry at index 5 of store_info_list.
store_info = store_info_list[5]

In [8]:
# Open the selected store's visitor-review page (m.place.naver.com).
# Original note: if the page is slow or does not load, use the mobile page.
review_url = f"https://m.place.naver.com/restaurant/{store_info['place_id']}/review/visitor"
driver.get(review_url)

# Fixed 5-second wait after navigation before interacting with the page.
time.sleep(5)

In [9]:
## 리뷰 클릭
# driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div/div/div[4]/div/div/div/div/a[4]/span').click()

In [12]:
# Click the link found at a fixed positional XPath under #app-root.
# NOTE(review): the XPath is purely positional, so it presumably breaks when the
# page layout changes — confirm which control this is meant to click.
driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[5]/div[2]/div/div/a').click()


In [16]:
# Alternate between scrolling to the bottom of the page and clicking the link at
# the XPath below, for at most MAX_MORE_CLICKS rounds.
# NOTE(review): the link is presumably the "load more reviews" button — confirm.
MAX_MORE_CLICKS = 100
more_link_xpath = '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a/span'

for _ in range(MAX_MORE_CLICKS):
    scroll_down_to_bottom(driver)
    try:
        driver.find_element(By.XPATH, more_link_xpath).click()
    except WebDriverException:
        # The link is missing or cannot be clicked any more: stop loading.
        # Only Selenium errors end the loop; KeyboardInterrupt and programming
        # errors are no longer swallowed as they were by the bare `except:`.
        break

**데이터 가져오기**

In [11]:
# Snapshot the browser's current DOM and parse it with BeautifulSoup.
# NOTE(review): the recorded execution counts show this cell ran as In [11],
# before the scrolling loop (In [16]); run it after that loop so the snapshot
# contains every review that was loaded — confirm the intended order.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

In [None]:
## Parsing
def _text_or_none(tag):
    """Return the tag's text, or None when the tag was not found."""
    return tag.text if tag is not None else None


# Each row is [user_name, keywords, review_text, review_dt, raw_text].
res = []
# NOTE(review): assumes the third <ul> in the page holds the reviews — confirm.
for li in tqdm(soup.find_all("ul")[2].find_all("li")):
    # find() returns None when an element is absent (e.g. a review without
    # keyword tags), so every lookup is guarded instead of raising
    # AttributeError and aborting the whole parse.
    keyword_link = li.find("a", attrs={"data-pui-click-code": "visitkeywords"})
    review_link = li.find("a", attrs={"data-pui-click-code": "rvshowmore"})

    user_name = _text_or_none(li.find("span"))
    if keyword_link is not None:
        keywords = [key.text for key in keyword_link.find_all("span")]
    else:
        keywords = []
    review_text = _text_or_none(review_link)
    review_dt = _text_or_none(li.find("time"))
    raw_text = li.text

    res.append([user_name, keywords, review_text, review_dt, raw_text])

print(f"리뷰 수: {len(res)}")

In [None]:
# Turn the parsed rows into a DataFrame and tag every row with the store it
# belongs to and the time of this scrape.
review_columns = ["user_name", "keywords", "review_text", "review_dt", "raw_text"]
df = pd.DataFrame(res, columns=review_columns).assign(
    chef_name=store_info["chef_name"],
    store_name=store_info["store_name"],
    scraped_at=datetime.now(),
)

# Preview the first rows, then append everything to the review_raw table.
display(df.head())
df.to_sql('review_raw', engine, if_exists='append', index=False)

In [18]:
# Write the scraped reviews to a Parquet file named temp.parquet (relative path).
df.to_parquet("temp.parquet")