https://movie.douban.com/subject/35196776/reviews

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_reviews_from_page(url, timeout=10):
    """Fetch one review-list page and parse the review entries it contains.

    Args:
        url: Full URL of the review-list page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A list with one dict per ``div`` whose class attribute is exactly
        ``'main review-item'``. Each dict has the keys ``author``, ``date``,
        ``rating``, ``title``, ``content``, ``useful_count``,
        ``useless_count`` and ``comments_count``. Fields whose element is
        missing from the markup are returned as ``''`` (``'No rating'`` for
        the rating) instead of aborting the whole page. The list is empty
        when the page contains no review items.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # NOTE(review): the HTTP status is not checked; a response without any
    # review-item divs simply produces an empty list.
    soup = BeautifulSoup(response.text, 'html.parser')

    reviews = []
    for review in soup.find_all('div', class_='main review-item'):
        # Look the rating tag up once; not every review carries a rating.
        rating_tag = review.find('span', class_='main-title-rating')
        if rating_tag is not None:
            rating = rating_tag.get('title', 'No rating')
        else:
            rating = 'No rating'

        # The title text lives in the first link inside the <h2> heading.
        heading = review.find('h2')
        title_link = heading.a if heading is not None else None

        useful_tag = review.find(
            'span', id=lambda x: x and x.startswith('r-useful_count-'))
        useless_tag = review.find(
            'span', id=lambda x: x and x.startswith('r-useless_count-'))

        review_data = {
            'author': _tag_text(review.find('a', class_='name')),
            'date': _tag_text(review.find('span', class_='main-meta')),
            'rating': rating,
            'title': _tag_text(title_link),
            'content': _tag_text(review.find('div', class_='short-content')),
            'useful_count': _tag_text(useful_tag),
            'useless_count': _tag_text(useless_tag),
            # Strip the '回应' label from the reply link text.
            'comments_count': _tag_text(review.find('a', class_='reply')).replace('回应', '')
        }
        reviews.append(review_data)

    return reviews


def _tag_text(tag, default=''):
    """Return the stripped text of *tag*, or *default* when *tag* is None."""
    if tag is None:
        return default
    return tag.text.strip()

def get_all_reviews(base_url, page_size=20, delay=2, max_pages=None):
    """Collect reviews from consecutive pages until a page yields none.

    Pages are requested as ``{base_url}?start={offset}``, where the offset
    grows by ``page_size`` for each page.

    Args:
        base_url: URL of the review listing, without a query string.
        page_size: Offset step between consecutive pages.
        delay: Seconds to sleep after each successfully scraped page.
        max_pages: Optional upper bound on the number of pages to request;
            ``None`` means keep going until an empty page is returned.

    Returns:
        A list of the review dicts returned by ``get_reviews_from_page``
        for every page scraped. If a request fails part-way through, the
        reviews gathered so far are returned rather than discarded.
    """
    all_reviews = []
    page = 0
    while max_pages is None or page < max_pages:
        url = f"{base_url}?start={page * page_size}"
        print(f"Scraping page {page + 1}...")

        try:
            page_reviews = get_reviews_from_page(url)
        except requests.RequestException as exc:
            # Keep the pages already collected instead of losing them all
            # to a single failed request.
            print(f"Request failed on page {page + 1}: {exc}")
            break

        if not page_reviews:
            break

        all_reviews.extend(page_reviews)
        page += 1

        # Pause between requests to avoid putting load on the website.
        time.sleep(delay)

    return all_reviews

# Base URL of the review listing to scrape
base_url = 'https://movie.douban.com/subject/35196776/reviews'

# Collect every review
all_reviews = get_all_reviews(base_url)

# Build the DataFrame
df = pd.DataFrame(all_reviews)

# Inspect the DataFrame
print(df.head())
print(f"Total reviews collected: {len(df)}")

# Inspect the DataFrame structure. info() prints its own report and returns
# None, so wrapping it in print() would emit a stray "None" line.
df.info()

if df.empty:
    # With no rows the frame has no columns: describe() and df['rating']
    # would raise, and exporting would only produce an empty file.
    print("No reviews were collected; skipping statistics and CSV export.")
else:
    # Basic summary statistics
    print(df.describe())

    # Distribution of the values in the rating column
    print(df['rating'].value_counts())

    # Save to a CSV file
    df.to_csv('douban_all_reviews.csv', index=False)

In [None]:
# Bare expression: in a notebook cell this renders the DataFrame as the cell
# output (it has no visible effect when run as a plain script).
df