In [1]:
import time
import random
import requests
import json
import os
import pickle
from common import headers, get_new_proxy

In [2]:
# https://movie.douban.com/j/new_search_subjects?sort=R&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=0&genres=%E5%89%A7%E6%83%85&countries=%E4%B8%AD%E5%9B%BD%E5%A4%A7%E9%99%86&year_range=2022,2022

# URL that every search request in this notebook is sent to.
ENDPOINT = "https://movie.douban.com/j/new_search_subjects"

# Values used for the `genres` query parameter; the scrape iterates over
# each of them in turn.
GENRES = [
    "剧情",
    "喜剧",
    "动作",
    "爱情",
    "科幻",
    "动画",
    "悬疑",
    "惊悚",
    "恐怖",
    "犯罪",
    "同性",
    "音乐",
    "歌舞",
    "传记",
    "历史",
    "战争",
    "西部",
    "奇幻",
    "冒险",
    "灾难",
    "武侠",
    "情色",
]

# Base pause (seconds) between consecutive page requests, so we
# don't spam the server. A random jitter is added on top at call time.
DEBOUNCE = 0.5

In [3]:
def get_default_params(limit):
    """Build the query parameters shared by every search request.

    A new dict is created on each call, so callers may freely add or
    mutate keys (e.g. ``genres``, ``year_range``, ``start``).

    Args:
        limit: Value stored under the ``"limit"`` key (page size).

    Returns:
        dict with the keys ``sort``, ``range``, ``tags``, ``start``,
        ``limit`` and ``countries``, in that order.
    """
    base_params = dict(
        sort="R",
        range="0,10",  # rating range
        tags="电影",
        start=0,
        limit=limit,
        countries="中国大陆",
    )
    return base_params

def parse_movies(text):
    """Decode a JSON response body and return the value under ``"data"``.

    Args:
        text: JSON document as a string.

    Returns:
        Whatever the decoded object holds under the ``"data"`` key.

    Raises:
        json.JSONDecodeError: If ``text`` is not valid JSON.
        KeyError: If the decoded object has no ``"data"`` key.
    """
    payload = json.loads(text)
    return payload["data"]

In [4]:
def get_genre_year(genre, year, proxy=None, timeout=10.0) -> list:
    """Fetch every search result for one genre in one release year.

    Pages through ENDPOINT ``limit`` entries at a time until a page comes
    back with fewer than ``limit`` entries. Any failure while fetching or
    parsing a page (including the very first one) switches the session to
    a proxy from ``get_new_proxy()`` and retries that same page. Retries
    are unbounded, but Ctrl-C / kernel interrupt still stops the loop
    because only ``Exception`` subclasses are caught.

    Args:
        genre: Value for the ``genres`` query parameter.
        year: Year used for both ends of the ``year_range`` parameter.
        proxy: Optional proxies mapping assigned to the session; ``None``
            leaves the session's default proxy settings untouched.
        timeout: Seconds passed to ``requests`` as the per-request
            timeout, so a dead proxy fails fast instead of hanging.

    Returns:
        list of all entries returned by ``parse_movies`` across pages.
    """
    limit = 100
    movies = []

    # setup query params
    params = get_default_params(limit)
    params["genres"] = genre
    params["year_range"] = f"{year},{year}"

    # The context manager closes pooled connections when we are done.
    with requests.Session() as s:
        if proxy is not None:
            s.proxies = proxy
        s.headers.update(headers)

        start = 0
        is_first_request = True

        while True:
            if is_first_request:
                is_first_request = False
            else:
                # Throttle every request after the first (retries included).
                time.sleep(DEBOUNCE + random.random())
                print(f"Starting {start}")

            params["start"] = start

            try:
                r = s.get(ENDPOINT, params=params, timeout=timeout)
                page = parse_movies(r.text)
            except Exception as e:
                # something went wrong, probably our IP was flagged.
                # switch to a different proxy and retry the same page
                print(f"Request failed: {e!r}")
                new_proxy = get_new_proxy()
                s.proxies = new_proxy
                # NOTE(review): assumes get_new_proxy() returns a dict-like
                # proxies mapping (the original indexed it with ['http']).
                print(f"Something went wrong. Changing proxy to: {new_proxy.get('http')}")
                continue

            movies.extend(page)

            # A short page means there is nothing left to fetch.
            if len(page) < limit:
                break
            start += limit

    print(f"Finished, list size {len(movies)}")

    return movies

In [6]:
# Scrape every (genre, year) combination and cache each result as a pickle.
# Existing pickle files are treated as "already scraped" so the cell can be
# re-run to resume an interrupted scrape.
output_dir = "./data/list"
# Create the target directory up front so a long scrape cannot fail at
# the very end just because the folder is missing.
os.makedirs(output_dir, exist_ok=True)

start_time = time.time()

for g in GENRES:
    for y in range(2010, 2023):
        name = f"{output_dir}/{g}_{y}.pkl"
        # if already been scraped, skip
        if os.path.isfile(name):
            print(f"{g} {y} already scraped, skipping")
            continue
        print(f"Starting {g} {y}, time elapsed {time.time() - start_time}")
        movies = get_genre_year(g, y, get_new_proxy())
        # Write to a temporary file and rename it into place: an interrupted
        # dump must not leave a truncated .pkl that the isfile() check above
        # would later mistake for a finished scrape.
        tmp_name = f"{name}.tmp"
        with open(tmp_name, 'wb') as f:
            pickle.dump(movies, f)
        os.replace(tmp_name, name)

剧情 2010 already scraped, skipping
剧情 2011 already scraped, skipping
剧情 2012 already scraped, skipping
剧情 2013 already scraped, skipping
剧情 2014 already scraped, skipping
剧情 2015 already scraped, skipping
剧情 2016 already scraped, skipping
剧情 2017 already scraped, skipping
剧情 2018 already scraped, skipping
剧情 2019 already scraped, skipping
剧情 2020 already scraped, skipping
剧情 2021 already scraped, skipping
剧情 2022 already scraped, skipping
喜剧 2010 already scraped, skipping
喜剧 2011 already scraped, skipping
喜剧 2012 already scraped, skipping
喜剧 2013 already scraped, skipping
喜剧 2014 already scraped, skipping
喜剧 2015 already scraped, skipping
喜剧 2016 already scraped, skipping
喜剧 2017 already scraped, skipping
喜剧 2018 already scraped, skipping
喜剧 2019 already scraped, skipping
喜剧 2020 already scraped, skipping
喜剧 2021 already scraped, skipping
喜剧 2022 already scraped, skipping
动作 2010 already scraped, skipping
动作 2011 already scraped, skipping
动作 2012 already scraped, skipping
动作 2013 alread