In [None]:
import json
import os
from http import HTTPStatus

import pandas as pd
import requests

from database.getdata import query_data, get_connection, execute_query

### Récupération des données de l'API TMDB

In [None]:
def get_last_updated_movies_id(
    api_key: str,
    start_date: str = "2025-02-15",
    end_date: str = "2025-02-21",
    page: int = 1,
    max_attempts: int = 2,
    timeout: float = 10.0,
) -> dict:
    """
    Fetch one page of the TMDB movie change list for a date window.

    Despite its name (kept for backward compatibility), this function returns
    the whole decoded JSON payload of the ``/3/movie/changes`` endpoint, not a
    single ID.

    Args:
        api_key: TMDB token, sent as ``Authorization: Bearer <api_key>``.
        start_date: Lower bound of the window, sent as the ``start_date`` query parameter.
        end_date: Upper bound of the window, sent as the ``end_date`` query parameter.
        page: Page number, sent as the ``page`` query parameter.
        max_attempts: Number of times the request is tried when it fails at
            the transport level (values below 1 are treated as 1).
        timeout: Seconds to wait for the server before giving up on an attempt.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: If every attempt fails at the transport
            level, or if the server answers with an HTTP error status.
    """
    url = "https://api.themoviedb.org/3/movie/changes"
    params = {"end_date": end_date, "page": page, "start_date": start_date}
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(
                url, headers=headers, params=params, timeout=timeout
            )
            break
        except requests.RequestException:
            # Surface the real network error once the attempts are exhausted
            # instead of falling through with `response` undefined.
            if attempt == attempts:
                raise

    # An HTTP error (bad token, rate limit, ...) is not retried: fail loudly
    # rather than hand an error payload to the caller.
    response.raise_for_status()

    return response.json()

In [None]:
def get_movies(
    api_key: str, origin_country: str, page: int = 1, timeout: float = 10.0
) -> dict:
    """
    Fetch one page of TMDB "discover" results for movies from a country.

    The request excludes adult and video entries and asks for French
    (``fr-FR``) localisation.

    Args:
        api_key: TMDB token, sent as ``Authorization: Bearer <api_key>``.
        origin_country: Value of the ``with_origin_country`` filter (e.g. ``"FR"``).
        page: Page number, sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an HTTP error status.
    """
    url = "https://api.themoviedb.org/3/discover/movie"
    # Passing the query as `params` lets requests URL-encode the values.
    params = {
        "include_adult": "false",
        "include_video": "false",
        "language": "fr-FR",
        "page": page,
        "with_origin_country": origin_country,
    }
    headers = {"accept": "application/json", "Authorization": f"Bearer {api_key}"}

    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Fail here on an HTTP error instead of returning an error payload that
    # would only break later when the caller reads its content.
    response.raise_for_status()

    return response.json()


# get_movies requires the API token as its first argument. Read it from the
# environment so that no secret is stored in the notebook.
# NOTE(review): the variable name TMDB_API_KEY is an assumption — align it
# with the project's configuration.
tmdb_api_key = os.environ["TMDB_API_KEY"]
french_movies = get_movies(tmdb_api_key, "FR")
french_movies

### Traitement des données récupérées

In [None]:
# API field name -> column name used on the database side
columns_fr = {
    "title": "nom_originel",
    "original_language": "langue_principale",
    "release_date": "date_sortie_france",
    "vote_average": "tmbd_note_moyenne",
    "vote_count": "tmdb_total_votes",
    "popularity": "tmdb_score",
    "overview": "description",
    "id": "tmdb_id",
}

# Columns removed before the comparison with the database
columns_to_drop = [
    "adult",
    "backdrop_path",
    "poster_path",
    "video",
    "original_title",
    "genre_ids",
    # temporary drop until the database schema is correctly updated
    "tmbd_note_moyenne",
    "tmdb_total_votes",
    "tmdb_score",
    "description",
    "tmdb_id",
]

# Build the DataFrame from the "results" entry of the API response and
# report how many movies were received
df_movies = pd.DataFrame(french_movies["results"])
print(len(df_movies))

# Rename first, then drop: the drop list refers to the renamed columns
df_movies = df_movies.rename(columns=columns_fr).drop(columns=columns_to_drop)
df_movies.head()

### Comparaison avec les données déjà en base

In [None]:
# Load the id and title of every film already stored in the database
df_db_movies = query_data("SELECT id, nom_originel FROM inegalites_cinema.film")
print("Number of rows in table inegalites_cinema.film: " + str(df_db_movies.shape[0]))

# Left-join the TMDB movies with the database on the title: movies without a
# match keep a null "id"
df_merged = pd.merge(
    df_movies, df_db_movies, how="left", left_on="nom_originel", right_on="nom_originel"
)

# First free ID in the table = highest existing ID + 1. max() is NaN when the
# table is empty, so fall back to 1 in that case.
# NOTE(review): assumes IDs start at 1 — confirm against the schema.
max_db_id = df_db_movies.loc[:, "id"].max()
start_id = 1 if pd.isna(max_db_id) else int(max_db_id) + 1

# Keep the movies that did not match. The explicit copy makes the assignment
# below write to an independent frame rather than to a slice of df_merged
# (which triggers SettingWithCopyWarning).
df_new_rows = df_merged.loc[df_merged["id"].isna()].copy()

# Attribute consecutive IDs to the new rows
df_new_rows["id"] = range(start_id, start_id + len(df_new_rows))
df_new_rows.head()

### Insertion des données en base

In [None]:
# Append the unmatched movies to inegalites_cinema.film; the DataFrame index
# is not written as a column
df_new_rows.to_sql(
    name="film",
    con=get_connection(),
    index=False,
    if_exists="append",
    schema="inegalites_cinema",
)