In [None]:
import requests
import pandas as pd
from tqdm import tqdm
import time
import os

# TMDB API read token, taken from the TMDB_API_KEY environment variable so the
# credential never lives in the notebook or in version control.
# SECURITY: any token that was ever stored in this file must be treated as
# leaked: revoke it in the TMDB account settings and issue a new one.
API_KEY = os.environ.get("TMDB_API_KEY", "")
if not API_KEY:
    # Warn instead of raising so the definitions in this cell can still be loaded.
    print("⚠️ Переменная окружения TMDB_API_KEY не задана: запросы к TMDB будут отклонены (HTTP 401).")

# Authorization header sent with every TMDB request.
HEADERS = {"Authorization": f"Bearer {API_KEY}"}
BASE_URL = "https://api.themoviedb.org/3"
LANG = "en-US"
# Upper bound on the number of discover pages requested per year.
# NOTE(review): TMDB is understood to reject page numbers above 500, so this
# cap is probably never reached in practice; confirm against the API docs.
MAX_PAGES_PER_YEAR = 1000

# Per-media-type configuration: years to crawl plus checkpoint/output CSV paths.
# NOTE(review): the file names mention 1996 while the active range starts at
# 2009; confirm which one is intended.
CONFIG = {
    # "tv": {
    #     "years": range(2021, 2024),
    #     "checkpoint": "tmdb_checkpoint_tv_1996_2023.csv",
    #     "output": "tmdb_tv_1996_2023.csv"
    # },
    "movie": {
        "years": range(2009, 2015),
        "checkpoint": "tmdb_checkpoint_movie_1996_2014.csv",
        "output": "tmdb_movies_1996_2014.csv"
    }
}

def enrich_items_batch(batch, media_type, processed_ids, all_items, checkpoint_path,
                       timeout=30, max_retries=3):
    """Fetch TMDB details for each item of ``batch`` and record the enriched items.

    For every item the ``/{media_type}/{tmdb_id}`` endpoint is queried and the
    item dict is extended in place with ``genres``, ``production_companies``,
    ``network_id`` and ``network_name`` (the network fields are filled only
    for ``media_type == "tv"``, otherwise they are ``None``).

    Args:
        batch: List of dicts, each containing at least the key ``"tmdb_id"``.
        media_type: TMDB media type used in the URL path (``"movie"`` or ``"tv"``).
        processed_ids: Set of already handled IDs (as strings); updated in place.
        all_items: Running list of all enriched items; updated in place.
        checkpoint_path: CSV path to which ``all_items`` is periodically dumped.
        timeout: Seconds to wait for each HTTP request before giving up.
        max_retries: How many times a rate-limited (HTTP 429) request is retried.

    Returns:
        The list of items from this batch that were enriched successfully.
    """
    enriched = []
    unsaved = 0  # items appended to all_items since the last checkpoint write

    for item in tqdm(batch, desc=f"Обогащение {media_type}"):
        url = f"{BASE_URL}/{media_type}/{item['tmdb_id']}"
        params = {"language": LANG}

        try:
            response = _fetch_details(url, params, timeout, max_retries)
            if response.status_code != 200:
                # Leave a trace instead of dropping the item silently.
                print(f"Пропуск {item['tmdb_id']}: HTTP {response.status_code}")
                continue

            data = response.json()

            # "or []" also covers an explicit null in the payload.
            item['genres'] = ", ".join(g['name'] for g in (data.get("genres") or []))
            item['production_companies'] = ", ".join(
                c['name'] for c in (data.get("production_companies") or [])
            )

            # Only TV details are inspected for networks; the first one is kept.
            networks = (data.get("networks") or []) if media_type == "tv" else []
            first_network = networks[0] if networks else {}
            item['network_id'] = first_network.get("id")
            item['network_name'] = first_network.get("name")

            enriched.append(item)
            processed_ids.add(str(item['tmdb_id']))
            all_items.append(item)
            unsaved += 1

            if unsaved >= 20:
                pd.DataFrame(all_items).to_csv(checkpoint_path, index=False)
                unsaved = 0

        except Exception as e:
            # Deliberately best-effort: one bad item must not abort the crawl.
            print(f"Ошибка при обработке {item['tmdb_id']}: {e}")
        finally:
            # Throttle on every path, including failures.
            time.sleep(0.1)

    # Flush whatever the periodic save above has not written yet, so a short
    # batch is not lost if the process stops before the next batch.
    if unsaved:
        try:
            pd.DataFrame(all_items).to_csv(checkpoint_path, index=False)
        except Exception as e:
            print(f"Ошибка при сохранении чекпоинта {checkpoint_path}: {e}")

    return enriched


def _fetch_details(url, params, timeout, max_retries):
    """GET one TMDB details URL, waiting and retrying while the API answers 429."""
    retries_left = max(int(max_retries), 0)
    while True:
        response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
        if response.status_code != 429 or retries_left == 0:
            return response
        retries_left -= 1
        try:
            delay = float(response.headers.get("Retry-After", 1))
        except (TypeError, ValueError):
            # Header missing or not a plain number of seconds.
            delay = 1.0
        # Clamp so a bogus header can neither busy-loop nor stall for minutes.
        time.sleep(min(max(delay, 0.0), 60.0))

def discover_and_enrich(media_type, year, processed_ids, all_items, checkpoint_path,
                        timeout=30, max_retries=3):
    """Page through TMDB ``/discover`` results for one year and enrich new titles.

    Results are requested sorted by descending popularity and restricted to
    dates within ``year``. Items whose ID is already in ``processed_ids`` are
    skipped; the rest are passed to ``enrich_items_batch`` page by page.
    Paging stops on the first non-200 response, on an empty page, once the
    ``total_pages`` value reported by the API is reached, after
    ``MAX_PAGES_PER_YEAR`` pages, or on any exception.

    Args:
        media_type: ``"movie"`` selects the movie fields and release-date
            filter; any other value uses the TV fields (``name``,
            ``first_air_date``).
        year: Calendar year used for the date-range filter.
        processed_ids: Set of already handled IDs (as strings).
        all_items: Running list of enriched items, forwarded to the enricher.
        checkpoint_path: CSV checkpoint path, forwarded to the enricher.
        timeout: Seconds to wait for each discover request before giving up.
        max_retries: How many times a rate-limited (HTTP 429) request is retried.
    """
    url = f"{BASE_URL}/discover/{media_type}"

    # Field names differ between the movie and TV discover payloads.
    is_movie = media_type == "movie"
    date_filter = "primary_release_date" if is_movie else "first_air_date"
    title_key = "title" if is_movie else "name"
    date_key = "release_date" if is_movie else "first_air_date"

    for page in range(1, MAX_PAGES_PER_YEAR + 1):
        params = {
            "language": LANG,
            "sort_by": "popularity.desc",
            "page": page,
            f"{date_filter}.gte": f"{year}-01-01",
            f"{date_filter}.lte": f"{year}-12-31",
        }

        try:
            response = _fetch_discover_page(url, params, timeout, max_retries)
            if response.status_code != 200:
                # Report why the year was cut short instead of stopping silently.
                print(f"Остановка на странице {page} {media_type} {year}: HTTP {response.status_code}")
                break

            payload = response.json()
            results = payload.get("results") or []
            if not results:
                break

            batch = [
                {
                    "tmdb_id": item.get("id"),
                    "name": item.get(title_key),
                    "overview": item.get("overview"),
                    "release_date": item.get(date_key),
                    "vote_average": item.get("vote_average"),
                    "vote_count": item.get("vote_count"),
                    "popularity": item.get("popularity"),
                    "original_language": item.get("original_language"),
                    "type": media_type,
                }
                for item in results
                if str(item.get("id")) not in processed_ids
            ]

            if batch:
                enrich_items_batch(batch, media_type, processed_ids, all_items, checkpoint_path)

            # Stop at the last page the API reports rather than asking for one more.
            total_pages = payload.get("total_pages")
            if isinstance(total_pages, int) and page >= total_pages:
                break

            time.sleep(0.2)

        except Exception as e:
            print(f"Ошибка при загрузке страницы {page} {media_type} {year}: {e}")
            break


def _fetch_discover_page(url, params, timeout, max_retries):
    """GET one discover page, waiting and retrying while the API answers 429."""
    retries_left = max(int(max_retries), 0)
    while True:
        response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
        if response.status_code != 429 or retries_left == 0:
            return response
        retries_left -= 1
        try:
            delay = float(response.headers.get("Retry-After", 1))
        except (TypeError, ValueError):
            # Header missing or not a plain number of seconds.
            delay = 1.0
        # Clamp so a bogus header can neither busy-loop nor stall for minutes.
        time.sleep(min(max(delay, 0.0), 60.0))

def run_pipeline(media_type):
    """Run the discover-and-enrich crawl for one media type and save the result.

    Reads the year range and the checkpoint/output paths from
    ``CONFIG[media_type]``, resumes from the checkpoint CSV when it exists,
    processes every configured year and finally writes all collected items
    to the output CSV.

    Args:
        media_type: Key of ``CONFIG`` to run.

    Raises:
        KeyError: If ``media_type`` has no entry in ``CONFIG``.
    """
    print(f"\n🚀 Запуск сбора для: {media_type.upper()}")

    years = CONFIG[media_type]["years"]
    checkpoint_path = CONFIG[media_type]["checkpoint"]
    output_path = CONFIG[media_type]["output"]

    processed_ids = set()
    all_items = []

    # Resume from the checkpoint if one exists.
    if os.path.exists(checkpoint_path):
        try:
            checkpoint_df = pd.read_csv(checkpoint_path, low_memory=False)
        except pd.errors.EmptyDataError:
            # A zero-byte checkpoint carries no progress; start from scratch.
            checkpoint_df = None

        if checkpoint_df is not None:
            # A single blank ID makes pandas load the column as float, and
            # str(150540.0) == "150540.0" never matches the "150540" that the
            # crawler compares against. Normalise through int first.
            numeric_ids = pd.to_numeric(checkpoint_df["tmdb_id"], errors="coerce")
            processed_ids = set(numeric_ids.dropna().astype("int64").astype(str))
            all_items = checkpoint_df.to_dict(orient="records")
            print(f"🔁 Продолжаем с {len(all_items)} уже загруженных записей")

    try:
        for year in tqdm(years, desc=f"📅 {media_type.upper()} - {years.start}–{years.stop - 1}"):
            print(f"\n🔍 Обработка {media_type} за {year}")
            discover_and_enrich(media_type, year, processed_ids, all_items, checkpoint_path)
    finally:
        # Persist progress even when the crawl is interrupted; skip the write
        # when nothing was collected so no unreadable empty checkpoint is left.
        if all_items:
            pd.DataFrame(all_items).to_csv(checkpoint_path, index=False)

    # Final save.
    df = pd.DataFrame(all_items)
    df.to_csv(output_path, index=False)
    print(f"\n✅ Сохранено {len(df)} записей в {output_path}")

if __name__ == "__main__":
    # Series would be collected first, but that call is currently switched off:
    #     run_pipeline("tv")
    # Movies are collected afterwards.
    run_pipeline(media_type="movie")


🚀 Запуск сбора для: MOVIE
🔁 Продолжаем с 104972 уже загруженных записей


📅 MOVIE - 2001–2014:   0%|          | 0/14 [00:00<?, ?it/s]


🔍 Обработка movie за 2001


In [None]:
import pandas as pd

# CSV files inspected in this and the following cells.
CHECKPOINT_FILE, OUTPUT_FILE = "tmdb_checkpoint_2015.csv", "tmdb_full_dataset_2015.csv"

# Load the checkpoint file and show its (rows, columns).
df = pd.read_csv(filepath_or_buffer=CHECKPOINT_FILE, low_memory=False)
df.shape

(116793, 13)

In [None]:
# Load the output file and show its (rows, columns).
df = pd.read_csv(filepath_or_buffer=OUTPUT_FILE, low_memory=False)
df.shape

(116826, 13)

In [None]:
# Preview the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0,tmdb_id,name,overview,release_date,vote_average,vote_count,popularity,original_language,type,genres,production_companies,network_id,network_name
0,150540,Inside Out,"When 11-year-old Riley moves to a new city, he...",2015-06-17,7.91,22367,19.71104,en,movie,"Animation, Family, Adventure, Drama, Comedy","Pixar, Walt Disney Pictures",,
1,216015,Fifty Shades of Grey,When college senior Anastasia Steele steps in ...,2015-02-11,5.9,11915,13.313081,en,movie,"Drama, Romance, Thriller","Universal Pictures, Focus Features, Michael De...",,
2,99861,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,2015-04-22,7.3,23314,10.878068,en,movie,"Action, Adventure, Science Fiction",Marvel Studios,,
3,135397,Jurassic World,Twenty-two years after the events of Jurassic ...,2015-06-06,6.692,20595,10.749987,en,movie,"Action, Adventure, Science Fiction, Thriller","Amblin Entertainment, Universal Pictures",,
4,76341,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,2015-05-13,7.6,22949,9.838556,en,movie,"Action, Adventure, Science Fiction","Warner Bros. Pictures, Village Roadshow Pictur...",,
5,150689,Cinderella,"When her father unexpectedly passes away, youn...",2015-03-06,6.822,7130,9.590064,en,movie,"Romance, Fantasy, Family, Drama","Walt Disney Pictures, Genre Films, Beagle Pug ...",,
6,262500,Insurgent,Beatrice Prior must confront her inner demons ...,2015-03-18,6.364,10082,9.075124,en,movie,"Action, Science Fiction, Thriller","Summit Entertainment, Red Wagon Entertainment,...",,
7,198184,Chappie,Every child comes into the world full of promi...,2015-03-04,6.786,8014,8.029741,en,movie,"Crime, Action, Science Fiction","Columbia Pictures, MRC, LStar Capital, Genre F...",,
8,168259,Furious 7,Deckard Shaw seeks revenge against Dominic Tor...,2015-04-01,7.226,10749,7.781748,en,movie,"Action, Thriller, Crime","Original Film, One Race, Universal Pictures",,
9,281957,The Revenant,"In the 1820s, a frontiersman, Hugh Glass, sets...",2015-12-25,7.533,18336,7.49532,en,movie,"Western, Drama, Adventure","Monarchy Enterprises S.a.r.l., Regency Enterpr...",,
