# Getting the Data

In [2]:
!pip install tmdbsimple

Collecting tmdbsimple
  Downloading tmdbsimple-2.9.1-py3-none-any.whl.metadata (6.9 kB)
Downloading tmdbsimple-2.9.1-py3-none-any.whl (38 kB)
Installing collected packages: tmdbsimple
Successfully installed tmdbsimple-2.9.1


In [3]:
import os

import numpy as np
import pandas as pd
import requests
import tmdbsimple as tmdb

# The TMDB v3 API key is read from the environment so the secret is never
# stored in the notebook (export TMDB_API_KEY before starting the kernel).
# A missing variable raises KeyError here, before any request is attempted.
tmdb.API_KEY = os.environ["TMDB_API_KEY"]
tmdb.REQUESTS_TIMEOUT = 5  # seconds, for both connect and read
tmdb.REQUESTS_SESSION = requests.Session()

### Juste un exemple : chercher des données sur 3 films (sans réalisateur, sans acteurs)

In [None]:
# Fetch details and recommendations for a few movies and tabulate them.
# The TMDB read-access (Bearer) token is read from the environment so that it
# is never stored in the notebook (export TMDB_BEARER_TOKEN beforehand).
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {os.environ['TMDB_BEARER_TOKEN']}",
}

movie_ids = [550, 278, 238]  # Fight Club, The Shawshank Redemption, The Godfather

movie_data = []

# One session reuses the HTTP connection for every request made in this cell.
with requests.Session() as session:
    session.headers.update(headers)

    for movie_id in movie_ids:
        # 1. Get movie details
        details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US"
        # timeout: a stalled connection must not hang the cell forever
        details_http = session.get(details_url, timeout=5)
        # Stop on auth / not-found errors instead of tabulating an error payload.
        details_http.raise_for_status()
        details_res = details_http.json()

        title = details_res.get("title")
        release_date = details_res.get("release_date")
        # release_date is formatted "YYYY-MM-DD"; keep only the year part
        release_year = release_date.split("-")[0] if release_date else ""
        genres = ", ".join(g['name'] for g in details_res.get('genres', []))
        # Numeric fields are stored as strings, as in the original table layout.
        vote_avg = str(details_res.get("vote_average"))
        popularity = str(details_res.get("popularity"))

        # 2. Get recommendations (first page only)
        rec_url = f"https://api.themoviedb.org/3/movie/{movie_id}/recommendations?language=en-US&page=1"
        rec_http = session.get(rec_url, timeout=5)
        rec_http.raise_for_status()
        rec_res = rec_http.json()
        # .get: a payload without "results" yields an empty string, not a KeyError
        recommendations = ", ".join(rec["title"] for rec in rec_res.get("results", []))

        # 3. Append one record per movie
        movie_data.append({
            "movie_id": str(movie_id),
            "title": title,
            "genres": genres,
            "release_year": release_year,
            "vote_average": vote_avg,
            "popularity": popularity,
            "recommended_movies": recommendations
        })

# 4. Convert to DataFrame (built once from the list of records)
df = pd.DataFrame(movie_data)

df

Unnamed: 0,movie_id,title,genres,release_year,vote_average,popularity,recommended_movies
0,550,Fight Club,"Drama, Thriller",1999,8.438,21.6358,"Pulp Fiction, Se7en, Forrest Gump, Looper, The..."
1,278,The Shawshank Redemption,"Drama, Crime",1994,8.712,27.0587,"The Godfather, Schindler's List, The Godfather..."
2,238,The Godfather,"Drama, Crime",1972,8.686,31.5682,"The Godfather Part II, Schindler's List, The S..."


### Ensuite, le code pour tout récupérer (réalisateurs + acteurs) : téléchargement en masse des données

In [6]:
# TMDB v3 API key used by the raw `requests` calls in the following cells.
# Read from the environment so the secret is never stored in the notebook.
API_KEY = os.environ["TMDB_API_KEY"]

In [7]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# Fetch the genre ID → name mapping once, so the genre IDs attached to each
# movie can be translated locally without an extra request per movie.
_genre_http = requests.get(
    "https://api.themoviedb.org/3/genre/movie/list",
    params={"api_key": API_KEY, "language": "en-US"},
    timeout=5,  # seconds; a stalled connection must not hang the cell
)
if not _genre_http.ok:
    # Fail loudly: an error payload has no 'genres' key and would otherwise
    # produce an empty mapping, silently blanking every movie's genres.
    # The URL is left out of the message because its query string holds the API key.
    raise RuntimeError(f"TMDB genre list request failed with HTTP {_genre_http.status_code}")
genre_response = _genre_http.json()

genre_mapping = {g['id']: g['name'] for g in genre_response.get('genres', [])}

def get_movie_credits(movie_id):
    """Fetch the leading cast and the directors of a single movie.

    Parameters
    ----------
    movie_id : int
        TMDB movie identifier, interpolated into the credits URL.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(cast, directors)``: the names of the first three entries of the
        response's ``cast`` list, and the names of every ``crew`` entry whose
        ``job`` is ``'Director'``. A response lacking either list yields an
        empty list for it.

    Raises
    ------
    RuntimeError
        If the API answers with a non-success HTTP status.
    """
    credits_url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits"
    response = requests.get(
        credits_url,
        params={"api_key": API_KEY, "language": "en-US"},
        timeout=5,  # seconds; one stalled request must not block a worker forever
    )
    if not response.ok:
        # The URL is deliberately left out of the message: its query string
        # carries the API key and the message may end up in notebook output.
        raise RuntimeError(
            f"TMDB credits request for movie {movie_id} failed with HTTP {response.status_code}"
        )
    payload = response.json()

    cast = [c['name'] for c in payload.get('cast', [])[:3]]
    # c.get: a crew entry without a 'job' field is skipped instead of raising KeyError
    directors = [c['name'] for c in payload.get('crew', []) if c.get('job') == 'Director']

    return cast, directors

# Bulk crawl: for every release year, collect the most popular movies from the
# discover endpoint and enrich each one with its cast and directors.

# Crawl parameters, grouped so they can be reduced for a quick trial run.
FIRST_YEAR, LAST_YEAR = 1950, 2023   # inclusive range of release years
PAGES_PER_YEAR = 10                  # 10 pages → 200 movies
MOVIES_PER_YEAR = 200
CREDIT_WORKERS = 20                  # threads fetching credits in parallel
REQUEST_TIMEOUT = 5                  # seconds


def _discover_page(year, page):
    """Fetch one page of `year`'s movies sorted by descending popularity.

    Returns the list under the response's 'results' key (possibly empty), or
    None when the request failed, so the caller can tell "no more movies"
    apart from "this page could not be fetched".
    """
    try:
        response = requests.get(
            "https://api.themoviedb.org/3/discover/movie",
            params={
                "api_key": API_KEY,
                "language": "en-US",
                "sort_by": "popularity.desc",
                "primary_release_year": year,
                "page": page
            },
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        # Only the exception type is shown: the message embeds the request URL,
        # whose query string carries the API key.
        print(f"Year {year}, page {page}: request failed ({type(exc).__name__})")
        return None
    if not response.ok:
        print(f"Year {year}, page {page}: HTTP {response.status_code}")
        return None
    return response.json().get('results', [])


all_movies = []

# A single pool is reused for the whole crawl instead of being rebuilt per page.
with ThreadPoolExecutor(max_workers=CREDIT_WORKERS) as executor:
    for year in range(FIRST_YEAR, LAST_YEAR + 1):
        movies_this_year = []
        credit_failures = 0

        for page in range(1, PAGES_PER_YEAR + 1):
            movies = _discover_page(year, page)
            if movies is None:
                continue  # page could not be fetched; try the next one
            if not movies:
                break  # an empty page means this year has no further results

            # Fetch credits in parallel, then consume the futures in submission
            # order so rows keep the popularity ranking returned by the API
            # (as_completed would append them in nondeterministic order).
            futures = [executor.submit(get_movie_credits, m['id']) for m in movies]

            for movie, future in zip(movies, futures):
                try:
                    cast, directors = future.result()
                except Exception:
                    # Best effort: keep the movie, but count the failure so it
                    # is reported below instead of passing unnoticed.
                    credit_failures += 1
                    cast, directors = [], []

                # Map genre_ids to genre names, dropping IDs missing from the mapping
                genre_names = [
                    genre_mapping[gid]
                    for gid in movie.get('genre_ids', [])
                    if genre_mapping.get(gid)
                ]

                movies_this_year.append({
                    "movie_id": movie['id'],
                    "title": movie['title'],
                    "genres": ", ".join(genre_names),
                    "release_year": year,
                    "original_language": movie['original_language'],
                    "vote_average": movie['vote_average'],
                    "popularity": movie['popularity'],
                    "overview": movie.get('overview', ""),
                    "director": ", ".join(directors),
                    "cast": ", ".join(cast)
                })

            if len(movies_this_year) >= MOVIES_PER_YEAR:
                break  # stop after reaching 200 movies

        all_movies.extend(movies_this_year[:MOVIES_PER_YEAR])
        print(f"Year {year} done, total movies collected: {len(all_movies)}")
        if credit_failures:
            print(f"  warning: {credit_failures} credits lookups failed for {year}; "
                  "those rows have empty director/cast")

Year 1950 done, total movies collected: 200
Year 1951 done, total movies collected: 400
Year 1952 done, total movies collected: 600
Year 1953 done, total movies collected: 800
Year 1954 done, total movies collected: 1000
Year 1955 done, total movies collected: 1200
Year 1956 done, total movies collected: 1400
Year 1957 done, total movies collected: 1600
Year 1958 done, total movies collected: 1800
Year 1959 done, total movies collected: 2000
Year 1960 done, total movies collected: 2200
Year 1961 done, total movies collected: 2400


KeyboardInterrupt: 