Google Colab에서 실행

In [None]:
import os
from selenium.webdriver import Firefox, FirefoxOptions, FirefoxProfile, ActionChains
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from fake_useragent import UserAgent
from time import sleep
import urllib.request
import json


# Load the IMDb movie ids to scrape: one id per line, blank lines skipped.
with open('/content/movie_list.txt', 'r') as id_file:
    stripped_lines = map(str.strip, id_file)
    ids = [movie_id for movie_id in stripped_lines if movie_id]

def get_movie_data(driver: Firefox, id):
    """Scrape one movie's details from IMDb and cache them as JSON.

    Loads the title page and the plot-summary page for ``id``, collects the
    fields listed under *Returns*, writes them to
    ``/content/movie_synopsis/{id}.json`` and then sleeps for three seconds
    before returning.

    NOTE(review): elements are located through generated-looking CSS class
    names (``fUCCIx``, ``joVhBE``, ...) and one absolute XPath, so the
    selectors presumably need updating whenever IMDb changes its markup.

    Args:
        driver (Firefox): selenium webdriver used to load the pages. Only
            ``get``, ``find_element`` and ``find_elements`` are called on it.
        id (str): IMDb movie id; it is inserted into the title URL and used
            as the output file name.

    Returns:
        dict: movie data with the keys ``genres`` (list of str), ``casts``
        (list of up to ten ``(actor, character)`` tuples), ``summaries``
        (list of str), ``id``, ``title``, ``year``, ``age``,
        ``running_time`` and ``synopsis`` (empty string when not found).

    Raises:
        NoSuchElementException: if the fallback title, the metadata list,
            the genre container or a summary's content block is missing.
    """
    # Imported locally so the block carries its own dependency.
    from selenium.common.exceptions import NoSuchElementException

    movie_dict = {
        'genres': [],
        'casts': [],
        'summaries': []
    }

    movie_dict['id'] = id

    url = f"https://www.imdb.com/title/{id}/"
    driver.get(url)

    # Title: prefer the "Original title: ..." line, otherwise fall back to
    # the primary hero title.
    try:
        original_title = driver.find_element(By.CLASS_NAME, 'fUCCIx').text
        movie_dict['title'] = original_title.replace("Original title: ", "", 1)
    except NoSuchElementException:
        movie_dict['title'] = driver.find_element(By.CLASS_NAME, 'hero__primary-text').text

    # Release year / age rating / running time.
    meta_items = driver.find_element(By.CLASS_NAME, 'joVhBE').find_elements(By.TAG_NAME, 'li')
    if len(meta_items) >= 3:
        year = meta_items[0].text
        age = meta_items[1].text
        running_time = meta_items[2].text
    else:
        # With fewer than three entries only the first one is trusted (as the
        # year); an empty list no longer raises IndexError.
        year = meta_items[0].text if meta_items else ""
        age = ""
        running_time = ""

    movie_dict['year'] = year
    movie_dict['age'] = age
    movie_dict['running_time'] = running_time

    # Genres
    genre_box = driver.find_element(By.CLASS_NAME, 'iPHzA-d')
    genre_chips = genre_box.find_elements(By.CLASS_NAME, 'ipc-chip__text')
    movie_dict['genres'].extend(chip.text for chip in genre_chips)

    # Casts: pair the first ten actor names with the first ten character names.
    casts = driver.find_elements(By.CLASS_NAME, 'kVdWAO')
    characters = driver.find_elements(By.CLASS_NAME, 'zVTic')

    for cast, character in zip(casts[:10], characters[:10]):
        movie_dict['casts'].append((cast.text, character.text))

    # Plot summaries live on a separate page.
    driver.get(url + 'plotsummary')

    summaries = driver.find_elements(By.CLASS_NAME, 'cczhld')

    for summary in summaries:
        block = summary.find_element(By.CLASS_NAME, 'ipc-html-content')
        block = block.find_element(By.CLASS_NAME, 'ipc-html-content-inner-div')
        movie_dict['summaries'].append(block.text)

    # Synopsis: stored as an empty string when the element is not found.
    try:
        synopsis = driver.find_element(By.XPATH,
                                        '/html/body/div[2]/main/div/section/div/section/div/div[1]/section[2]/div[2]/ul/li/div/div/div/div/div')
        movie_dict['synopsis'] = synopsis.text
    except NoSuchElementException:
        movie_dict['synopsis'] = ''

    # Create the output folder if needed (no error when it already exists).
    output_dir = "/content/movie_synopsis"
    os.makedirs(output_dir, exist_ok=True)

    # Save the data.
    with open(f"{output_dir}/{id}.json", "w", encoding="utf-8") as f:
        json.dump(movie_dict, f)

    # Pause between movies before the caller requests the next one.
    sleep(3)

    return movie_dict

In [None]:
import os
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

# Chrome options: headless, with the sandbox and /dev/shm usage disabled.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Pick a random user-agent string and pass it to Chrome.
ua = UserAgent(browsers='Firefox')
userAgent = ua.random
chrome_options.add_argument(f'user-agent={userAgent}')

# Start the driver.
driver = webdriver.Chrome(options=chrome_options)

try:
    for movie_id in ids:
        cache_path = f"/content/movie_synopsis/{movie_id}.json"
        if not os.path.exists(cache_path):
            # Not cached yet: scrape it (get_movie_data also writes the cache file).
            movie_data = get_movie_data(driver, movie_id)
        else:
            # Already scraped: reuse the cached JSON instead of hitting IMDb again.
            with open(cache_path, "r") as f:
                movie_data = json.load(f)
        title = movie_data['title']
finally:
    # Always shut the browser down, even if scraping a movie raised.
    driver.quit()