# Visualisation des données

In [4]:
# Importation des bibliothèques nécessaires
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
from ibmoviesdk import MovieClient, MovieConfig

* 'orm_mode' has been renamed to 'from_attributes'


In [5]:
from pathlib import Path


# Directory (relative to the current working directory) where this notebook
# stores its cache files. exist_ok=True keeps the cell safe to re-run.
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

In [6]:
# Connect to the movie API through the ibmoviesdk client.
config = MovieConfig(movie_base_url="https://data-movie-app-back.onrender.com")
client = MovieClient(config=config)

# Ask the API for its health status before running the analyses below.
client.health_check()

MOVIE_API_BASE_URL in MovieConfig init: https://data-movie-app-back.onrender.com


{'message': 'API MovieLens Opérationnelle'}

In [7]:
# Fetch the API's summary counts; `analytics.movie_count` is read by the
# caching cells further down to decide whether cached results are still valid.
analytics = client.get_analytics()
analytics

AnalyticsResponse(movie_count=9742, rating_count=100836, tag_count=100836, link_count=9742)

## Top 10 des genres de films les plus populaires

In [8]:
# Tally how many movies carry each genre.
# (`time` is not needed here; the throttled fetch loops in later cells use it.)
from collections import Counter
import time


genre_couter = Counter()

# Paging parameters for the movie listing.
limit = 500
skip = 0

# Read the first page, then keep paging until the API returns an empty batch.
batch = client.list_movies(skip=skip, limit=limit, output_format="dict")
while batch:
    for movie in batch:
        # Genres arrive as a single "|"-separated string; skip empty values.
        raw_genres = movie.get("genres", [])
        if raw_genres:
            genre_couter.update(raw_genres.split("|"))

    skip += limit
    batch = client.list_movies(skip=skip, limit=limit, output_format="dict")

# Turn the tally into a DataFrame and keep the ten most frequent genres.
genre_df = (
    pd.DataFrame(genre_couter.items(), columns=["genre", "count"])
    .sort_values(by="count", ascending=False)
    .head(10)
)


In [9]:
# Horizontal bar chart of the ten most frequent genres, coloured by count.
fig = px.bar(
    genre_df,
    x="count",
    y="genre",
    orientation="h",
    title="Top 10 des genres de films les plus populaires",
    labels={"count": "Nombre de films", "genre": "Genre"},
    color="count",
    color_continuous_scale=px.colors.sequential.Plasma,
)
# Order the genre axis by bar total and fix the figure height.
fig.update_layout(yaxis_categoryorder="total ascending", height=500)

fig.show()

In [10]:
import json


# The movie count reported by the API acts as the cache key: the cached
# result is reused only while this number is unchanged.
api_movie_count = analytics.movie_count
print(f"Nombre total de films dans l'API : {api_movie_count}")

genre_data_file = output_dir / "genre_df.parquet"
meta_file = output_dir / "meta_data.json"

# Read the cache metadata, if present, to learn which movie count the
# cached genre table was built for.
if meta_file.exists():
    with open(meta_file, "r") as f:
        meta_data = json.load(f)
    cached_movie_count = meta_data.get("movie_count", 0)
else:
    meta_data = {}
    cached_movie_count = 0

# Use the cache only when the data file exists and was built for the same
# movie count; otherwise rebuild the table from the API.
if genre_data_file.exists() and cached_movie_count == api_movie_count:
    print("Utilisation des données mises en cache.")
    genre_df = pd.read_parquet(genre_data_file)
else:
    print("Récupération des données depuis l'API.")
    genre_couter = Counter()
    limit = 1000
    skip = 0

    # Page through the catalogue until the API returns an empty batch.
    while True:
        batch = client.list_movies(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break

        for movie in batch:
            # Genres arrive as a single "|"-separated string.
            genres = movie.get("genres", [])
            genre_list = genres.split("|") if genres else []
            genre_couter.update(genre_list)

        skip += limit
        time.sleep(0.5)  # throttle requests so the API is not overloaded

    # Build and persist the result only after every page has been counted.
    # Writing inside the loop would record the full movie count next to a
    # partial tally, so an interrupted run would leave a cache that later
    # runs accept as complete.
    genre_df = pd.DataFrame(genre_couter.items(), columns=["genre", "count"])
    genre_df = genre_df.sort_values(by="count", ascending=False).head(10)

    genre_df.to_parquet(genre_data_file, index=False)
    with open(meta_file, "w") as f:
        json.dump({"movie_count": api_movie_count}, f)



Nombre total de films dans l'API : 9742
Utilisation des données mises en cache.


In [11]:
# Horizontal bar chart of the ten most frequent genres, coloured by count.
fig = px.bar(
    genre_df,
    x="count",
    y="genre",
    orientation="h",
    title="Top 10 des genres de films les plus populaires",
    labels={"count": "Nombre de films", "genre": "Genre"},
    color="count",
    color_continuous_scale=px.colors.sequential.Plasma,
)
# Order the genre axis by bar total and fix the figure height.
fig.update_layout(yaxis_categoryorder="total ascending", height=500)

fig.show()

## Nombre total de films par année

In [15]:
import re

# Cache files for the per-year movie counts. Note that `meta_file` is
# re-bound here to a different file than the one used for the genre cache.
yearly_data_file = output_dir / "yearly_data.parquet"
meta_file = output_dir / "meta_movie_data.json"

# Re-read the API counts; the movie count acts as the cache key.
analytics = client.get_analytics()

api_movie_count = analytics.movie_count

# Read the cache metadata, if present, to learn which movie count the
# cached table was built for.
if meta_file.exists():
    with open(meta_file, "r") as f:
        meta_data = json.load(f)
    cached_movie_count = meta_data.get("movie_count", 0)
else:
    cached_movie_count = 0

if yearly_data_file.exists() and cached_movie_count == api_movie_count:
    print("Utilisation des données mises en cache.")
    # Bind the same name as the fetch branch below: the following cells read
    # `df_yearly`, so a different name here fails on a fresh kernel.
    df_yearly = pd.read_parquet(yearly_data_file)
else:
    print("Récupération des données depuis l'API.")
    limit = 500
    skip = 0
    yearly_couter = Counter()
    # Matches a four-digit year in parentheses at the very end of the title.
    # Titles that do not end exactly with "(YYYY)" are not counted.
    year_parttern = re.compile(r"\((\d{4})\)$")

    # Page through the catalogue until the API returns an empty batch.
    while True:
        batch = client.list_movies(skip=skip, limit=limit, output_format="dict")
        if not batch:
            break

        for movie in batch:
            title = movie.get("title", "")
            match = year_parttern.search(title)
            if match:
                year = int(match.group(1))
                yearly_couter[year] += 1

        skip += limit
        time.sleep(0.5)  # throttle requests between pages

    # Build and persist the table once every page has been counted.
    df_yearly = pd.DataFrame(yearly_couter.items(), columns=["year", "movie_count"])

    df_yearly.to_parquet(yearly_data_file, index=False)
    with open(meta_file, "w") as f:
        json.dump({"movie_count": api_movie_count}, f)




Utilisation des données mises en cache.


In [13]:
# Display the per-year table (columns: year, movie_count).
df_yearly

Unnamed: 0,year,movie_count
0,1995,258
1,1994,237
2,1996,276
3,1976,44
4,1992,167
...,...,...
101,2015,274
102,2016,218
103,2017,147
104,2018,41


In [14]:
# Bar chart of the number of movies per release year (year on the x axis).
fig = px.bar(
    df_yearly,
    x="year",
    y="movie_count",
    labels={"year": "Année", "movie_count": "Nombre de films"},
    title="Nombre de films par année",
)
# Set both axis titles explicitly and fix the figure height.
fig.update_layout(
    height=500,
    xaxis={"title": {"text": "Année"}},
    yaxis={"title": {"text": "Nombre de films"}},
)

fig.show()