In [71]:
import json
import os
from http import HTTPStatus

import pandas as pd
import requests

from database.getdata import query_data, get_connection, execute_query

### Récupération des données de l'API TMDB

In [None]:
def get_last_updated_movies_id(
    api_key: str,
    start_date: str = "2025-02-15",
    end_date: str = "2025-02-21",
    page: int = 1,
    max_attempts: int = 2,
    timeout: float = 10.0,
) -> dict:
    """
    Query the TMDB `movie/changes` endpoint and return the decoded JSON payload.

    Despite its name, this function returns the whole response body (a dict),
    not a single movie ID.

    Args:
        api_key: TMDB bearer token, sent in the `Authorization` header.
        start_date: Lower bound of the change window, formatted `YYYY-MM-DD`.
        end_date: Upper bound of the change window, formatted `YYYY-MM-DD`.
        page: Page of results to request.
        max_attempts: How many times the request is attempted before giving up.
        timeout: Seconds to wait for the server before the attempt fails.

    Returns:
        The JSON body of the response, decoded into Python objects.

    Raises:
        ValueError: If `max_attempts` is lower than 1.
        requests.RequestException: If every attempt fails at the network level,
            or if the server answers with an HTTP error status.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    url = "https://api.themoviedb.org/3/movie/changes"
    # Let requests build and escape the query string instead of formatting it by hand.
    params = {"end_date": end_date, "page": page, "start_date": start_date}
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    last_error = None
    for _ in range(max_attempts):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as error:
            # Only network-level failures are retried; keep the error so the
            # caller sees the real cause if every attempt fails.
            last_error = error
            continue

        # Surface 4xx/5xx answers here rather than returning an error payload
        # that would break the caller later on.
        response.raise_for_status()
        return response.json()

    raise last_error

In [None]:
def get_movies(api_key: str, origin_country: str, page: int = 1, timeout: float = 10.0) -> dict:
    """
    Query the TMDB `discover/movie` endpoint for movies from one origin country.

    The request asks for French (`fr-FR`) localisation and excludes adult and
    video entries. The whole decoded response body is returned, not just IDs.

    Args:
        api_key: TMDB bearer token, sent in the `Authorization` header.
        origin_country: Value passed as the `with_origin_country` filter (e.g. "FR").
        page: Page of results to request.
        timeout: Seconds to wait for the server before the request fails.

    Returns:
        The JSON body of the response, decoded into Python objects.

    Raises:
        requests.RequestException: If the request fails at the network level or
            the server answers with an HTTP error status.
    """
    url = "https://api.themoviedb.org/3/discover/movie"
    # Passing the filters as params lets requests escape the country value.
    params = {
        "include_adult": "false",
        "include_video": "false",
        "language": "fr-FR",
        "page": page,
        "with_origin_country": origin_country,
    }
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Fail here on 4xx/5xx instead of handing an error payload to the caller.
    response.raise_for_status()

    return response.json()

# get_movies needs the API key as its first argument; read it from the
# environment so the token is never stored in the notebook.
# NOTE(review): the variable name TMDB_API_KEY is an assumption — align it
# with the project's configuration.
french_movies = get_movies(os.environ["TMDB_API_KEY"], "FR")
french_movies

{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/pqulyfkug9A7TmmRn5zrbRA8TAY.jpg',
   'genre_ids': [28, 35],
   'id': 1255788,
   'original_language': 'fr',
   'original_title': 'Le Jardinier',
   'overview': 'Chaque année, le Premier ministre fait éliminer une liste de gêneurs au nom de la raison d’État. Bien malgré lui, le nom de Serge Shuster, conseiller spécial à la présidence, s’y retrouve. Lui et sa famille sont condamnés à une mort certaine. Mais ce que tout le monde ignore, c’est que les Shuster ont un nouveau jardinier, Léo, qui par le passé n’a pas cisaillé que des haies.',
   'popularity': 180.881,
   'poster_path': '/qxqwLXgd4vjouSsxKXbGhQyjwoa.jpg',
   'release_date': '2025-01-30',
   'title': 'Le Jardinier',
   'video': False,
   'vote_average': 6.3,
   'vote_count': 81},
  {'adult': False,
   'backdrop_path': '/zwSDvbnN51JqU1ULzPnEc22DkqV.jpg',
   'genre_ids': [35, 18, 10749],
   'id': 1272149,
   'original_language': 'en',
   'original_title': 'Bridget Jo

### Traitement des données récupérées

In [None]:
# Load the TMDB results into a DataFrame and report how many movies came back.
df_movies = pd.DataFrame(french_movies['results'])
print(len(df_movies))

# TMDB field name -> column name used by the database schema.
# NOTE(review): "title" is mapped to "nom_originel" while "original_title" is
# dropped below — confirm this is the intended source for that column.
# NOTE(review): "tmbd_note_moyenne" looks like a typo for "tmdb_..." — check it
# against the schema before this column stops being dropped.
columns_fr = dict(
    title="nom_originel",
    original_language="langue_principale",
    release_date="date_sortie_france",
    vote_average="tmbd_note_moyenne",
    vote_count="tmdb_total_votes",
    popularity="tmdb_score",
    overview="description",
    id="tmdb_id",
)

# Columns removed before insertion: first the TMDB fields the database has no
# use for, then the renamed columns that are dropped temporarily until the
# database schema is correctly updated.
columns_to_drop = [
    "adult", "backdrop_path", "poster_path", "video", "original_title", "genre_ids",
    "tmbd_note_moyenne", "tmdb_total_votes", "tmdb_score", "description", "tmdb_id",
]

# Rename first, then drop, so the drop list can use the database column names.
df_movies = df_movies.rename(columns=columns_fr).drop(columns=columns_to_drop)
df_movies.head()

20


Unnamed: 0,langue_principale,date_sortie_france,nom_originel
0,fr,2025-01-30,Le Jardinier
1,en,2025-02-12,Bridget Jones : Folle de lui
2,fr,2024-07-03,Elyas
3,fr,2024-11-14,"Miraculous World : Londres, la course contre l..."
4,fr,2024-06-19,Оцеляване


### Comparaison avec les données déjà en base

In [None]:
# Load the films already stored in the database.
df_db_movies = query_data("SELECT id, nom_originel FROM inegalites_cinema.film")
print("Number of rows in table inegalites_cinema.film: " + str(df_db_movies.shape[0]))

# Left-join on the title: TMDB movies with no match in the database keep a null `id`.
df_merged = pd.merge(df_movies, df_db_movies, how="left", on="nom_originel")

# Next free ID in the database table (not a TMDB ID). max() is NaN when the
# table is empty, which int() cannot convert.
# NOTE(review): starting at 1 for an empty table is an assumption — confirm it
# matches the table's ID convention.
max_db_id = df_db_movies["id"].max()
start_id = 1 if pd.isna(max_db_id) else int(max_db_id) + 1

# Keep only the movies that didn't match. The explicit copy makes the
# assignment below act on an independent frame instead of a slice of
# df_merged, which would trigger SettingWithCopyWarning.
df_new_rows = df_merged.loc[df_merged["id"].isna()].copy()
# Attribute consecutive IDs to the new rows.
df_new_rows["id"] = range(start_id, start_id + len(df_new_rows))
df_new_rows.head()

Number of rows in film table from database: 247
langue_principale     20
date_sortie_france    20
nom_originel          20
id                     0
dtype: int64


Unnamed: 0,langue_principale,date_sortie_france,nom_originel,id
0,fr,2025-01-30,Le Jardinier,248
1,en,2025-02-12,Bridget Jones : Folle de lui,249
2,fr,2024-07-03,Elyas,250
3,fr,2024-11-14,"Miraculous World : Londres, la course contre l...",251
4,fr,2024-06-19,Оцеляване,252
5,fr,2025-02-11,Lune de miel avec ma mère,253
6,fr,2024-06-28,Le Comte de Monte-Cristo,254
7,fr,2024-07-31,Largo Winch : Le Prix de l'argent,255
8,fr,2024-09-18,Les Graines du figuier sauvage,256
9,fr,2024-08-28,La Nuit se traîne,257


### Insertion des données en base

In [120]:
# Append the new movies to the inegalites_cinema.film table. The DataFrame
# index is not written; only the DataFrame columns are sent.
# NOTE(review): re-running this cell without re-running the comparison cell
# would append the same rows again.
df_new_rows.to_sql(
    name='film',
    con=get_connection(),
    schema='inegalites_cinema',
    index=False,
    if_exists='append',
)

20