In [None]:
import requests
import pandas as pd
from pathlib import Path
import os
from time import sleep
from datetime import datetime

# TMDB read-access token taken from the environment; None when the
# TMDB_API_READ variable is not set.
TMDB_API_READ = os.environ.get("TMDB_API_READ")

# Headers sent with every TMDB request: ask for JSON and pass the token as
# a bearer credential.
HEADERS = dict(
    accept="application/json",
    Authorization=f"Bearer {TMDB_API_READ}",
)

# NOTE(review): not referenced by any code visible in this cell — confirm
# whether it is still needed.
NUM_PAGES = 0


def get_movie_ids(pages=5, num_years=10):
    """Collect movie IDs from the TMDB discover endpoint, sorted by revenue.

    Requests pages 1..``pages`` of movies whose primary release date lies
    between 1 January of ``current_year - num_years + 1`` and 31 December of
    the current year. Stops early once the API reports that the last
    available page has been reached.

    Args:
        pages: Maximum number of result pages to request.
        num_years: Size of the release-date window in years, counted back
            from (and including) the current year.

    Returns:
        A list of movie IDs in the order they were returned by the API.
        Pages that fail (network error or non-200 status) are reported on
        stdout and skipped, so the list may be partial or empty.
    """
    current_year = datetime.now().year
    start_date = f"{current_year - num_years + 1}-01-01"
    end_date = f"{current_year}-12-31"
    movie_ids = []

    url = "https://api.themoviedb.org/3/discover/movie"

    for page in range(1, pages + 1):
        params = {
            "include_adult": "false",
            "include_video": "false",
            "language": "en-US",
            "page": page,
            "sort_by": "revenue.desc",
            "primary_release_date.gte": start_date,
            "primary_release_date.lte": end_date,
        }

        try:
            # A timeout keeps a stalled connection from hanging the run.
            response = requests.get(
                url, headers=HEADERS, params=params, timeout=10
            )
        except requests.RequestException as exc:
            # Best-effort, like the non-200 branch below: report and move on.
            print(f"Error on page {page}: {exc}")
            sleep(0.3)
            continue

        if response.status_code == 200:
            payload = response.json()
            movie_ids.extend(
                movie["id"] for movie in payload.get("results") or []
            )
            total_pages = payload.get("total_pages")
            # Only compare when the API actually reported a page count;
            # comparing an int with None would raise TypeError.
            if total_pages is not None and page >= total_pages:
                print("Final page reached.")
                # Return what was collected (a bare `return` would hand
                # None to the caller).
                return movie_ids
        else:
            print(f"Error on page {page}: {response.status_code}")

        sleep(0.3)  # To avoid overwhelming the server

    return movie_ids


def get_movie_details(movie_id):
    """Fetch selected details for a single movie from the TMDB movie endpoint.

    Args:
        movie_id: TMDB movie identifier, interpolated into the request URL.

    Returns:
        A dict with the keys ``id``, ``title``, ``release_date``, ``budget``,
        ``revenue``, ``genres`` (list of genre names), ``runtime``,
        ``vote_average`` and ``popularity``; values missing from the
        response are ``None``. Returns ``None`` when the request fails,
        the status is not 200, or the response cannot be processed.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"

    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"Error fetching details for ID {movie_id}")
            return None

        data = response.json()
        return {
            "id": data.get("id"),
            "title": data.get("title"),
            "release_date": data.get("release_date"),
            "budget": data.get("budget"),
            "revenue": data.get("revenue"),
            # Keep only the genre names; `or []` also covers an explicit
            # null value, which would otherwise discard the whole movie.
            "genres": [g["name"] for g in data.get("genres") or []],
            "runtime": data.get("runtime"),
            "vote_average": data.get("vote_average"),
            "popularity": data.get("popularity"),
        }
    except Exception as e:
        # Deliberately broad: one bad movie must not abort the batch that
        # the caller is iterating over.
        print(f"Exception for ID {movie_id}: {e}")
        return None


def get_movie_data():
    """Discover movies, fetch their details and persist them to Parquet.

    Requests two pages of movie IDs via ``get_movie_ids``, fetches each
    movie with ``get_movie_details`` (skipping those that return nothing),
    and adds the rows to ``movie_data.parquet`` in the current working
    directory, creating the file if it does not exist.

    Returns:
        A ``pandas.DataFrame`` holding only the rows fetched in this call
        (not the accumulated file contents).
    """
    print("Discovering movies...")
    # Guard against a None result so that len() and the loop stay valid.
    ids = get_movie_ids(pages=2) or []
    print(f"Found {len(ids)} movie IDs.")

    print("Fetching movie details ...")
    full_movie_data = []

    for idx, m_id in enumerate(ids):
        details = get_movie_details(m_id)
        if details:
            full_movie_data.append(details)

        if idx % 10 == 0:
            print(f"Processed {idx}/{len(ids)}")

        sleep(0.3)

    path = Path("movie_data.parquet")
    df = pd.DataFrame(full_movie_data)

    # `to_parquet(append=...)` is an engine-specific keyword and is not
    # accepted by every Parquet engine, so append by reading the existing
    # file, concatenating, and rewriting it.
    # NOTE(review): rows are appended as-is; re-running stores the same
    # movies again. Confirm whether de-duplication on "id" is wanted.
    if path.exists():
        if not df.empty:
            existing = pd.read_parquet(path)
            combined = pd.concat([existing, df], ignore_index=True)
            combined.to_parquet(path, index=False)
    else:
        df.to_parquet(path, index=False)
    return df


Discovering movies...
Found 40 movie IDs.
Fetching movie details ...
Processed 0/40
Processed 10/40
Processed 20/40
Processed 30/40


Discovering movies...


TypeError: list indices must be integers or slices, not str