In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
#from data.dataset_enhancer import get_movies

  from pandas.core.computation.check import NUMEXPR_INSTALLED


### 0. Data Enhancement

First we ran the notebook data_preprocessing.ipynb to generate the necessary files for this notebook. It loads the movie metadata and additional datasets, preprocesses the data, and saves the cleaned data to CSV files: the movie metadata for older and newer movies, and additional datasets for sequels, books, comics, remakes, and collections. It only needs to be run once, and there is no need to run it again because the generated files are already on GitHub. Running it also requires an API key, which we do not put on GitHub for security reasons.
The function stores the output in the data file.

In [2]:
"""keywords_name = ["sequels", "book", "comics", "remake"]
keywords_id = [9663, 818, 9717, 9714]
start_date = "1880-01-01"
end_date = "2010-01-01"

get_movies(keywords_name, keywords_id, start_date, end_date)

start_date = "2010-01-01"
end_date = "2024-01-01"
get_movies(keywords_name, keywords_id, start_date, end_date)"""

'keywords_name = ["sequels", "book", "comics", "remake"]\nkeywords_id = [9663, 818, 9717, 9714]\nstart_date = "1880-01-01"\nend_date = "2010-01-01"\n\nget_movies(keywords_name, keywords_id, start_date, end_date)\n\nstart_date = "2010-01-01"\nend_date = "2024-01-01"\nget_movies(keywords_name, keywords_id, start_date, end_date)'

## 1. Files loading and preprocessing

The following cell organizes and preprocesses movie datasets from different time periods (1880–2010 and 2010–2024) using the MovieFrames class. It first imports the class, then loads the movie metadata for older and newer movies. File paths for additional datasets (such as sequels, books, comics, remakes, and collections) are dynamically generated for both time periods. The MovieFrames objects (movie_frames_old and movie_frames_new) are then created to structure and preprocess the data. The movie_frames_old object specifically standardizes column names for the older dataset using the old=True flag. These objects help manage movie data by categories and prepare it for further analysis.

### 1.1 Data Collection

In [3]:
from src.models.movies_frame import MovieFrames

# Movie metadata tables for the two periods handled below (1880-2010 and 2010-2024).
movie_df = pd.read_csv('data/MovieSummaries_filtered/movie_df.csv')
new_movie_df = pd.read_csv('data/all_sample/all_sample_2010_2024_metadata.csv')

keywords = ["sequels", "book", "comics", "remake"]

# One CSV per keyword and per period; the collections file is always the last entry.
path_old = [f"data/{keyword}/{keyword}_1880_2010_with_wiki_id.csv" for keyword in keywords]
path_old += ["data/collections/sequels_and_original_1880_2010_with_wiki_id.csv"]

path_new = [f"data/{keyword}/{keyword}_2010_2024_metadata.csv" for keyword in keywords]
path_new += ["data/collections/sequels_and_original_2010_2024_metadata.csv"]

# Wrap each period (metadata + per-category files + year bounds) in a MovieFrames object.
movie_frames_old = MovieFrames(movie_df, path_old, 1880, 2010)
movie_frames_new = MovieFrames(new_movie_df, path_new, 2010, 2024)


### 1.2 Data Preprocessing

Then the following cell visualizes the size differences between datasets during the preprocessing steps using the display_data_cleaning_graph function. The function takes a MovieFrames object and calculates the number of movies at three stages:

- The original data loaded from TMDb.
- After matching the TMDb data with the Wikipedia data.
- After filtering out movies with mismatched release years.

These sizes are passed to the create_graph function, which generates a bar graph showing the changes in dataset sizes for five categories: sequel collections, sequels, books, comics, and remakes. The graph highlights how the preprocessing steps affect the number of movies in each category. 

In [4]:
# Dataset sizes at each preprocessing stage, for the 1880-2010 frames
from src.models.movie_data_cleaner import display_data_cleaning_graph
fig = display_data_cleaning_graph(movie_frames_old)

fig

### 2.1 How many sequels are there compared to movies 

Groups the movies by 5-year intervals, counts how many movies fall into each interval, and returns the string representation of the interval labels. The first cell shows the result for the data between 1880 and 2010; the second cell adds the 2010–2024 data and shows the combined result.

Plot of number of movies per 5 year (left figure) VS. Plot of number of movies with sequels per 5 year (right figure)

In [5]:
from src.models.movie_counter import get_movie_counter_figure

# Plot figure 1 (left): number of movies per 5 years, using the 1880-2010 frames only
fig = get_movie_counter_figure(movie_frames_old)
fig


In [6]:
# Clean the 2010-2024 frames in place before merging.
# NOTE(review): judging by the method names these drop rows with mismatched or
# implausible release years - confirm against the MovieFrames implementation.
movie_frames_new.drop_different_years()
movie_frames_new.drop_impossible_years()
# Combine both periods; movie_frames_concat is what the later cells plot.
movie_frames_concat = movie_frames_old.concat_movie_frame(movie_frames_new)
fig = get_movie_counter_figure(movie_frames_concat)
fig

### 2.2 Ratio of sequels to original movies



Calculation and plot of the ratio between movies with sequel and the number of movies, both per 5 years.


$$
\text{Ratio} = \frac{\text{nb of movie with sequel per 5 year}}{\text{nb of movie per 5 year}}
$$

In [7]:

from src.models.movie_counter import get_ratio_movie_figure

# Ratio figure for the 1880-2010 frames only
fig = get_ratio_movie_figure(movie_frames_old)
fig

In [8]:

from src.models.movie_counter import get_ratio_movie_figure

# Same ratio figure, this time on the combined 1880-2024 frames
fig = get_ratio_movie_figure(movie_frames_concat)
fig

## 3.1 Box office revenue


#### 3.1.1 Box office revenue for movies with sequels compared to all movies

In [10]:
from src.utils.evaluation_utils import inflate
import numpy as np
import swifter

def _add_inflation_adjusted_revenue(frame):
    """Add the "Movie box office revenue inflation adj" column to ``frame`` in place.

    For every row, ``inflate`` is called with the row's "Movie box office revenue"
    and "release year"; the returned value is stored in the new column.
    The apply goes through the ``.swifter`` accessor, so ``swifter`` must be imported.
    """
    frame["Movie box office revenue inflation adj"] = frame.swifter.apply(
        lambda row: inflate(row["Movie box office revenue"], row["release year"]),
        axis=1,
    )


for frame in movie_frames_old.get_all_df():
    _add_inflation_adjusted_revenue(frame)

for frame in movie_frames_new.get_all_df():
    _add_inflation_adjusted_revenue(frame)

for frame in movie_frames_concat.get_all_df():
    # Only the concatenated frames get a fresh 0..n-1 index before the apply.
    frame.reset_index(drop=True, inplace=True)
    _add_inflation_adjusted_revenue(frame)



In the future `np.long` will be defined as the corresponding NumPy scalar.



AttributeError: module 'numpy' has no attribute 'long'

In [11]:
from src.models.box_office_revenue import get_box_office_absolute

# Plot figure 4: box office revenue per year, on the combined 1880-2024 frames
fig = get_box_office_absolute(movie_frames_concat)
fig

ModuleNotFoundError: No module named 'utils'

Calculation and plot of the percentage of box office revenue each year contributed by movies with sequels, relative to the total box office revenue for all movies that year

$$
\text{Box Office \%} = \frac{\text{Box office of movies with sequel per year}}{\text{Box office for all movies per year}} * 100
$$

In [None]:
from src.models.box_office_revenue import get_box_office_ratio

# Yearly box office share of movies with sequels (formula in the markdown above)
fig = get_box_office_ratio(movie_frames_concat)
fig

Calculation and plot of the average inflation-adjusted box office revenue per year, both for all movies and for movies with sequels

In [None]:
from src.models.box_office_revenue import get_average_box_office_revenue

# Average box office revenue per year on the combined frames (see the markdown above)
fig = get_average_box_office_revenue(movie_frames_concat)
fig

#### 3.1.2 Box office revenue for movies with sequel compared to the first movie of the collection

This plot shows the box office revenue for the first movie in a collection compared to the sequels. The x-axis represents the collection, and the y-axis is the box office revenue. On the left side, the box office of the entire series following the first movie is shown, and on the right side, the average box office of the sequels is compared to the first movie. Improving sequels are linked in green, and decreasing sequels are linked in red.

compare_first_sequel splits the movies into first movies and sequels, calculates the box office revenue for each movie, and then plots the data.


In [None]:
from src.models.box_office_revenue import compare_first_sequel

# compare_first_sequel returns two figures; both are displayed here.
fig_plt, fig2 = compare_first_sequel(movie_frames_concat)
fig_plt.show()
fig2.show()


These plots highlight where the first movie outperforms the sequel (red lines) and vice versa (green lines). The second plot also includes a yellow horizontal line showing the average box office revenue of all movies in the dataset.

The log scale on the y-axis is used to better visualize large differences in revenue, especially when there are very high values.

## 4. Number of movies in a collection

A plot of the comparison between the budget and the box office revenue for collection. The x-axis is the budget and the y-axis is the revenue. The size of the circles is proportional to the number of movies in the collection. 
get_budget_vs_revenue first computes the box office revenue and budget for each movie in the collection, then calculates the total box office revenue and budget for the collection. The budget is our first use of the extended data which wasn't in the original database, but was given by the TMDB API. The function then plots the data.



In [None]:
from src.models.collection_analysis import get_budget_vs_revenue
import seaborn as sns

# Second argument: the two "extended" sequel CSVs (1880-2010 and 2010-2024).
# Per the markdown above, these are where the TMDB budget data comes from.
fig = get_budget_vs_revenue(movie_frames_concat, ["data/sequels/sequels_1880_2010_extended.csv", "data/sequels/sequels_2010_2024_extended.csv"])
fig.show()


This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.



Pandas Apply:   0%|          | 0/784 [00:00<?, ?it/s]

### 5. Time between sequels

This graph displays the time elapsed between sequels in a collection. The x-axis represents the number of years between sequels, and the y-axis is the collection. The size of the circles is proportional to the box office revenue of the movie. Consecutive films of a collection are linked by a line. The graph helps identify patterns in the time between sequels and the revenue generated by each sequel.

get_time_between_sequels first separates the movies between first movie and the following sequels, creates a dataframe with the movies in each collection, their release date and the box office revenue. It then calculates the time between sequels. Then it draws the graph.

In [None]:
# Spot check: rows of one collection in the 1880-2010 sequel/original frame
movie_frames_old.movie_df_sequel_original[movie_frames_old.movie_df_sequel_original["collection"] == "The Lord of the Rings Collection"]

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Unnamed: 0.2,id,release_date,original_title,title,collection,collection_id,release year,Movie box office revenue inflation adj
127,173944,/m/017gm7,The Lord of the Rings: The Two Towers,2002-12-05,926047100.0,179.0,"{""/m/05p2d"": ""Old English language"", ""/m/02h40...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0hj3n2k"": ""Fantasy Adventure"", ""/m/03k9fj...",58,121,2002-12-18,The Lord of the Rings: The Two Towers,The Lord of the Rings: The Two Towers,The Lord of the Rings Collection,119,2002.0,1568474000.0
1190,173941,/m/017gl1,The Lord of the Rings: The Fellowship of the Ring,2001-12-10,871530300.0,178.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0hj3n2k"": ""Fantasy Adventure"", ""/m/03k9fj...",57,120,2001-12-18,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings Collection,119,2001.0,1499475000.0
1209,174251,/m/017jd9,The Lord of the Rings: The Return of the King,2003-12-17,1119930000.0,250.0,"{""/m/05p2d"": ""Old English language"", ""/m/02h40...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0hj3n2k"": ""Fantasy Adventure"", ""/m/03k9fj...",59,122,2003-12-17,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,The Lord of the Rings Collection,119,2003.0,1854591000.0


In [None]:
from src.models.collection_analysis import get_time_between_sequels

# Time-between-sequels figure on the combined 1880-2024 frames
fig = get_time_between_sequels(movie_frames_concat)
fig.show()

## Other lines of enquiry:

- Highest grossing series
- By genre
- Find studios that do a lot of sequels
- Is there a correlation between the box office revenue of the first movie and the sequels
- Add more box office revenue data and get movie budget data
- ...

In [40]:
import plotly.express as px
from ipywidgets import interact, widgets

# Load the extended collection data (budget, revenue, votes) for both periods and stack them.
df1_part1 = pd.read_csv("data/collections/sequels_and_original_1880_2010_extended.csv")
df1_part2 = pd.read_csv("data/collections/sequels_and_original_2010_2024_extended.csv")
df = pd.concat([df1_part1, df1_part2], ignore_index=True)
df['release_date'] = pd.to_datetime(df['release_date'])

# Order every collection chronologically so films can be numbered by release order.
df = df.sort_values(by=['collection', 'release_date'])
# NOTE(review): duplicates are detected on the title alone, so two different films
# sharing a title (e.g. in different collections) collapse to one row - confirm intended.
df = df.drop_duplicates(subset=["title"], keep="first")
# Rank of each film inside its collection (1 = earliest release).
df['Numéro'] = df.groupby('collection').cumcount() + 1

# Sanity check on a well-known collection before any filtering.
print(df[df['collection'].str.contains("Harry Potter", na=False)])

# Keep only films with a usable (non-missing, non-zero) budget, revenue and rating,
# restricted to the first five films of each collection.
df = df[(df['budget'].notna()) & (df['budget'] != 0) &
        (df['revenue'].notna()) & (df['revenue'] != 0) &
        (df['vote_average'].notna()) & (df['vote_average'] != 0)]
df = df[df['Numéro'] <= 5]

# Keep the collections that still have at least 5 films after the filters above.
# The explicit copy makes the column assignments below act on an independent frame.
df = df.groupby('collection').filter(lambda group: len(group) >= 5).copy()

# Revenue and rating of the previous film of the same collection (NaN for the first film).
df["revenue_previous"] = df.groupby("collection")["revenue"].shift(1)
df["vote_previous"] = df.groupby("collection")["vote_average"].shift(1)

# Assign the colour category of each film according to the chosen criterion
def set_colors(comparison, data=None):
    """Write the "Couleur" column used to colour the scatter plot.

    Each row is compared with the previous film of its collection:
    "Bleu" when the current value is lower than the previous one, "Gris" when
    there is no previous value, "Rouge" otherwise.

    Parameters
    ----------
    comparison : str
        "revenus" compares "revenue" with "revenue_previous";
        "notes" compares "vote_average" with "vote_previous".
    data : pandas.DataFrame, optional
        Frame to update in place. Defaults to the notebook-level ``df``.

    Raises
    ------
    ValueError
        If ``comparison`` is not one of the two supported criteria.
    """
    columns_by_criterion = {
        "revenus": ("revenue", "revenue_previous"),
        "notes": ("vote_average", "vote_previous"),
    }
    if comparison not in columns_by_criterion:
        raise ValueError(
            f"Unknown comparison {comparison!r}; expected one of {sorted(columns_by_criterion)}"
        )

    frame = df if data is None else data
    current_col, previous_col = columns_by_criterion[comparison]
    current, previous = frame[current_col], frame[previous_col]

    # Comparisons with NaN are False, so "Gris" and "Bleu" can never overlap.
    colours = pd.Series("Rouge", index=frame.index, dtype=object)
    colours = colours.mask(previous.isna(), "Gris").mask(current < previous, "Bleu")
    frame["Couleur"] = colours

# Build the scatter figure for one film rank
def build_figure(num_film, comparison="revenus"):
    """Return a budget vs. rating scatter plot for the films with rank ``num_film``.

    Parameters
    ----------
    num_film : int
        Rank of the film inside its collection (value of the "Numéro" column).
    comparison : str
        Criterion forwarded to ``set_colors`` to colour the points.

    Returns
    -------
    plotly figure, or None when no film has the requested rank (a message is printed).
    """
    set_colors(comparison)  # refresh the "Couleur" column for the chosen criterion
    filtered_data = df[df["Numéro"] == num_film]

    if filtered_data.empty:
        print(f"Aucun film trouvé pour le numéro {num_film}")
        return

    fig = px.scatter(
        filtered_data,
        x="budget",
        y="vote_average",
        size="revenue",
        color="Couleur",
        hover_name="title",
        title=f"Analyse des films numéro {num_film} ({comparison})",
        # The budget column holds raw dollar amounts, so the label is in $, not M$.
        labels={"budget": "Budget ($)", "id": "Film ID"},
        # Same y range for every rank so the slider positions are comparable.
        range_y=[df['vote_average'].min(), df['vote_average'].max()],
        color_discrete_map={"Rouge": "red", "Bleu": "blue", "Gris": "grey"},
        log_x=True
    )
    return fig

# Entry point called by the widgets
def plot_interactive(num_film, comparison):
    """Build and display the figure for the current widget values.

    ``build_figure`` returns None when no film matches ``num_film``; in that
    case nothing is shown instead of failing on ``None.show()``.
    """
    fig = build_figure(num_film, comparison)
    if fig is None:
        return
    fig.show()

# Interactive controls: a slider for the film rank and a dropdown for the colour criterion.
# The slider's upper bound is the largest film rank present in df.
max_num_film = df["Numéro"].max()

slider_num_film = widgets.IntSlider(min=1, max=max_num_film, step=1, value=1, description="Numéro du film:")
comparison_dropdown = widgets.Dropdown(
    options=["revenus", "notes"],
    value="revenus",
    description="Comparer par:"
)

# Display the widgets; plot_interactive is re-run each time a control changes.
interact(plot_interactive, num_film=slider_num_film, comparison=comparison_dropdown)

         id release_date                                original_title  \
1130    671   2001-11-16      Harry Potter and the Philosopher's Stone   
1131    672   2002-11-13       Harry Potter and the Chamber of Secrets   
1134    673   2004-05-31      Harry Potter and the Prisoner of Azkaban   
1132    674   2005-11-16           Harry Potter and the Goblet of Fire   
1135    675   2007-07-08     Harry Potter and the Order of the Phoenix   
1133    767   2009-07-15        Harry Potter and the Half-Blood Prince   
1136  12444   2010-11-17  Harry Potter and the Deathly Hallows: Part 1   
1137  12445   2011-07-12  Harry Potter and the Deathly Hallows: Part 2   

                                             title               collection  \
1130      Harry Potter and the Philosopher's Stone  Harry Potter Collection   
1131       Harry Potter and the Chamber of Secrets  Harry Potter Collection   
1134      Harry Potter and the Prisoner of Azkaban  Harry Potter Collection   
1132           Ha

interactive(children=(IntSlider(value=1, description='Numéro du film:', max=5, min=1), Dropdown(description='C…

<function __main__.plot_interactive(num_film, comparison)>

In [42]:
import plotly.express as px
from ipywidgets import interact, widgets
import ast

# Revenue and rating of the previous film within each collection (shift(1) per group).
# NOTE(review): the same two columns are already computed at the end of the previous cell.
df["revenue_previous"] = df.groupby("collection")["revenue"].shift(1)
df["vote_previous"] = df.groupby("collection")["vote_average"].shift(1)

def extract_genres(genres, first_only=False):
    """Extract genre names from a list of ``{'id': ..., 'name': ...}`` dictionaries.

    Parameters
    ----------
    genres : str or list
        Either the string representation of the list (parsed with
        ``ast.literal_eval``) or the already-parsed list itself.
    first_only : bool
        When True, return only the first genre that has a name.

    Returns
    -------
    str or None
        The comma-separated genre names (or the first one when ``first_only``).
        None when the input is empty, cannot be parsed, or - with
        ``first_only`` - when no entry carries a name.
    """
    try:
        # Parse strings; pass already-parsed containers through unchanged.
        genres_list = ast.literal_eval(genres) if isinstance(genres, str) else genres
        if not genres_list:
            return None
        # Entries without a 'name' key are skipped in both modes.
        names = [genre['name'] for genre in genres_list
                 if isinstance(genre, dict) and 'name' in genre]
        if first_only:
            return names[0] if names else None
        return ", ".join(names)
    except (ValueError, SyntaxError, TypeError):
        return None


# The first film of a collection determines the genre of the whole collection,
# so that every collection has exactly one genre.
df['main_genre'] = df['genres'].apply(lambda x: extract_genres(x, first_only=True))

# Genre of film number 1 of each collection...
genre_dict = df[df['Numéro'] == 1].set_index('collection')['main_genre'].to_dict()
# ...applied to every film of that collection. Without this step each film keeps
# its own first genre, and the sort below could interleave the films of a collection.
df['main_genre'] = df['collection'].map(genre_dict)

# Return on budget, (revenue - budget) / budget, weighted by the average rating.
df["Ratio_Revenu_Note"] = ((df["revenue"]-df["budget"] )/ df["budget"])*(df["vote_average"])


# Sort by genre, then by film rank.
df = df.sort_values(by=["main_genre", "Numéro"])
# Mean of Ratio_Revenu_Note per film rank and genre.
df_grouped = df.groupby(["Numéro", "main_genre"], as_index=False).agg({
    "Ratio_Revenu_Note": "mean"
})

# One line per genre.
fig = px.line(
    df_grouped,
    x="Numéro",
    y="Ratio_Revenu_Note",
    color="main_genre",
    markers=True,
    title="Évolution du Ratio Revenu/Note Moyenne par Genre",
    labels={"Numéro": "Numéro du Film", "Ratio_Revenu_Note": "Ratio Revenu/Note"}
)

# Figure styling.
fig.update_traces(line=dict(width=2))
fig.update_layout(
    xaxis=dict(tickmode="linear", dtick=1),
    yaxis=dict(title="Ratio Revenu/Note Moyenne"),
    legend_title="main_genre",
    template="plotly_white"
)

fig.show()
# TODO: check the unique values of Numéro for the "Romance" genre


In [43]:
import plotly.express as px

# Step 1: percentage change of Ratio_Revenu_Note from one film to the next within each collection.
# NOTE(review): pct_change follows the current row order of df, so this assumes the
# rows of a collection are still ordered by film rank - confirm after the earlier sort.
df['ratio_growth'] = df.groupby("collection")["Ratio_Revenu_Note"].pct_change() * 100

# Step 2: keep only the rows where a change exists (the first row of each collection is NaN).
df_growth = df.dropna(subset=['ratio_growth'])

# Step 3: mean change per film rank and genre.
df_growth_grouped = df_growth.groupby(["Numéro", "main_genre"], as_index=False).agg({
    "ratio_growth": "mean"
})

# Step 4: line chart of the change, one line per genre.
# NOTE(review): the title mentions "Ratio Revenu/Budget" while the plotted quantity is
# derived from Ratio_Revenu_Note - confirm which wording is intended.
fig = px.line(
    df_growth_grouped,
    x="Numéro",
    y="ratio_growth",
    color="main_genre",
    markers=True,
    title="Variation Moyenne du Ratio Revenu/Budget par Numéro de Film et Genre",
    labels={"Numéro": "Numéro du Film", "ratio_growth": "Variation du Ratio (%)"}
)

# Figure styling.
fig.update_traces(line=dict(width=2))
fig.update_layout(
    xaxis=dict(tickmode="linear", dtick=1),
    yaxis=dict(title="Variation Moyenne (%)", tickformat=".2f"),
    template="plotly_white",
    legend_title="Genre"
)

# Display the figure.
fig.show()


In [44]:
import plotly.express as px

# Step 1: budget of the first film (Numéro == 1) of each collection.
first_movie_budget = df[df['Numéro'] == 1].set_index('collection')['budget'].to_dict()

# Step 2: total revenue per collection.
roi_by_collection = df.groupby("collection", as_index=False).agg(
    total_revenue=("revenue", "sum")
)

# Attach the first film's budget to each collection.
roi_by_collection['first_movie_budget'] = roi_by_collection['collection'].map(first_movie_budget)

# Step 3: ROI of the whole collection relative to the first film's budget only.
roi_by_collection['ROI'] = (roi_by_collection['total_revenue'] - roi_by_collection['first_movie_budget']) / roi_by_collection['first_movie_budget']

# Main genre of the collection, taken from its first film.
genre_dict = df[df['Numéro'] == 1].set_index('collection')['main_genre'].to_dict()
roi_by_collection['main_genre'] = roi_by_collection['collection'].map(genre_dict)

# Step 4: horizontal bar chart with Plotly.
fig = px.bar(
    roi_by_collection,
    x="ROI",
    y="collection",
    color="main_genre",
    orientation='h',
    title="Rentabilité des Collections par Genre (Basé sur le Premier Film)",
    labels={"ROI": "Retour sur Investissement (ROI)", "collection": "Collection", "main_genre": "Genre"},
    hover_data=["total_revenue", "first_movie_budget"]
)

# Vertical line at ROI = 0, the break-even point.
fig.add_vline(x=0, line_dash="dot", line_color="red", annotation_text="Seuil de Rentabilité")
# NOTE(review): the text options below only matter if the bars carry text; no text=
# argument is passed to px.bar above.
fig.update_traces(
    textposition='inside',  # place the values inside the bars
    textangle=0,  # keep the text horizontal (0 degree angle)
    insidetextanchor="middle",  # centre the text inside the bars
    marker=dict(line=dict(width=0.5)),  # thin bar outline
    selector=dict(type='bar')
)
# Layout.
fig.update_layout(
    height=20 * len(roi_by_collection),  # figure height grows with the number of collections
    xaxis=dict(title="Retour sur Investissement (ROI)", tickformat=".0%"),
    yaxis=dict(title="Collection"),
    template="plotly_white"
)

# Display the figure.
fig.show()

# Print the collections sorted by decreasing ROI.
print(roi_by_collection.sort_values(by='ROI', ascending=False))


                                  collection  total_revenue  \
15            Paranormal Activity Collection      811605295   
12                        Mad Max Collection      713465182   
25        Texas Chainsaw Massacre Collection      142186000   
20                            Saw Collection      669753364   
10                     James Bond Collection      515600000   
9                       Insidious Collection      744379972   
5                       Halloween Collection      139605426   
4                 Friday the 13th Collection      173079579   
23                      Star Wars Collection     3460213893   
31                 The Terminator Collection     1845327738   
30                      The Purge Collection      533895379   
8                   Indiana Jones Collection     2367696867   
0       A Nightmare on Elm Street Collection      203335206   
11                  Jurassic Park Collection     4889523548   
18                 Police Academy Collection      34070

In [45]:
import pandas as pd
import plotly.express as px

# Step 1: cumulative ROI of each collection at each "time" T (film rank).
# NOTE(review): transform('first') and cumsum() follow the current row order of df,
# so this assumes each collection's rows are ordered by film rank - confirm.
df['first_movie_budget'] = df.groupby('collection')['budget'].transform('first')  # budget of the first film
df['cumulative_revenue'] = df.groupby('collection')['revenue'].cumsum()          # cumulative revenue

# Progressive ROI, rounded to the nearest unit.
df['ROI_progressif'] = ((df['cumulative_revenue'] - df['first_movie_budget']) / df['first_movie_budget']).round(0)

# Step 2: one row per (film rank, collection, genre) for the animation.
df_race = df.groupby(['Numéro', 'collection', 'main_genre'], as_index=False).agg({
    'ROI_progressif': 'last'  # last cumulative ROI value for each "time"
})
# Strip a trailing "Collection" from the names.
df_race['collection_clean'] = df_race['collection'].str.replace(r'\s*Collection$', '', regex=True)

# Step 3: animated bar chart ("race chart") with Plotly.
fig = px.bar(
    df_race,
    x="ROI_progressif",
    y="collection_clean",
    color="main_genre",
    animation_frame="Numéro",  # one animation frame per film rank
    orientation='h',
    title="Évolution du ROI Progressif par Collection et Genre",
    labels={"ROI_progressif": "Retour sur Investissement (ROI)", "collection_clean": "Collection", "main_genre": "Genre"},
    text="ROI_progressif"  # show the rounded value on each bar
)

# Layout tuned for a smoother animation.
# NOTE(review): the x axis is logarithmic while ROI_progressif can be 0 or negative
# after rounding; confirm how such bars should be displayed.
fig.update_layout(
    xaxis=dict(title="Retour sur Investissement (ROI)", type="log"),  # logarithmic scale
    yaxis=dict(title="Collection", categoryorder='total ascending'),
    transition={"duration": 1200, "easing": "cubic-in-out"},  # slower, smoother animation
    margin=dict(l=250, r=50, t=50, b=50),  # fixed margins so the plot area does not move
    height=800,
    updatemenus=[dict(type="buttons", showactive=False,
                      buttons=[dict(label="▶ Play",
                                    method="animate",
                                    args=[None, {"frame": {"duration": 1500, "redraw": True},
                                                 "fromcurrent": True,
                                                 "mode": "immediate"}]),
                               dict(label="⏸ Pause",
                                    method="animate",
                                    args=[[None], {"frame": {"duration": 0, "redraw": False},
                                                   "mode": "immediate"}])])]
)

# Bar text and outline styling.
fig.update_traces(
    textposition='inside',  # place the values inside the bars
    textangle=0,  # keep the text horizontal (0 degree angle)
    insidetextanchor="middle",  # centre the text inside the bars
    marker=dict(line=dict(width=0.5)),  # thin bar outline
    selector=dict(type='bar')
)

# Display the figure.
fig.show()


In [46]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import plotly.io as pio

pio.templates.default = "simple_white"

# Step 1: budget of the first film of each collection.
df['first_movie_budget'] = df.groupby('collection')['budget'].transform('first')

# Step 2: cumulative revenue per collection.
df['cumulative_revenue'] = df.groupby('collection')['revenue'].cumsum()

# Step 3: progressive ROI, rounded to one decimal.
df['ROI_progressif'] = ((df['cumulative_revenue'] - df['first_movie_budget']) / df['first_movie_budget']).round(1)

# Step 4: one row per (film rank, collection, genre) for the race chart.
df_race = df.groupby(['Numéro', 'collection', 'main_genre'], as_index=False).agg({
    'ROI_progressif': 'last'
})
df_race['collection_clean'] = df_race['collection'].str.replace(r'\s*Collection$', '', regex=True)

# Order by film rank, then by decreasing ROI.
df_race = df_race.sort_values(['Numéro', 'ROI_progressif'], ascending=[True, False])

# Step 5: one random colour per collection. A seeded generator keeps the colours
# identical from one run of the notebook to the next.
unique_collections = df_race['collection_clean'].unique()
color_rng = np.random.default_rng(42)
colors = {}
for name in unique_collections:
    red, green, blue = color_rng.integers(1, 255, size=3)
    colors[name] = f"rgb({red},{green},{blue})"

# Step 6: helper returning the ten best rows for one film rank
def get_top_10(data, num):
    """Return the (at most) 10 rows of ``data`` with the largest "ROI_progressif" among those whose "Numéro" equals ``num``."""
    rank_mask = data['Numéro'].eq(num)
    return data.loc[rank_mask].nlargest(n=10, columns='ROI_progressif')

def _roi_bar(top10):
    """Build the bar trace shown for one animation step from a top-10 slice of ``df_race``."""
    return go.Bar(
        x=top10['collection_clean'],
        y=top10['ROI_progressif'],
        marker_color=[colors[name] for name in top10['collection_clean']],
        # NOTE(review): ROI_progressif is the ratio (cumulative revenue - budget) / budget,
        # not multiplied by 100, yet a "%" suffix is appended - confirm the intended unit.
        text=top10['ROI_progressif'].astype(str) + "%",
        textposition='outside',
        cliponaxis=False,
    )


# Animation frames: one per film rank, each showing that rank's top 10.
frames = [
    go.Frame(
        data=[_roi_bar(get_top_10(df_race, num))],
        layout=go.Layout(title_text=f"Évolution du ROI Progressif - Film {num}")
    )
    for num in sorted(df_race['Numéro'].unique())
]

# Initial state: top 10 for the smallest film rank.
initial_data = get_top_10(df_race, df_race['Numéro'].min())

# Figure with Play / Pause buttons driving the frames above.
fig = go.Figure(
    data=[_roi_bar(initial_data)],
    layout=go.Layout(
        title="Évolution du ROI Progressif par Collection et Genre",
        font=dict(size=20),
        height=700,
        xaxis=dict(title="Collection", showline=False, tickangle=-90),
        yaxis=dict(title="Retour sur Investissement (ROI)",type="log", showline=False),
        updatemenus=[dict(
            type="buttons",
            showactive=False,
            buttons=[
                dict(label="Play",
                     method="animate",
                     args=[None, {"frame": {"duration": 1000, "redraw": True},
                                  "fromcurrent": True}]),
                dict(label="Pause",
                     method="animate",
                     args=[[None], {"frame": {"duration": 0, "redraw": False},
                                    "mode": "immediate"}])
            ]
        )]
    ),
    frames=frames
)

# Display the figure.
fig.show()


In [86]:

# Check the Underworld collection for duplicated films
df_underworld = df[df['collection'] == 'Underworld Collection']
df_underworld.head(5)



Unnamed: 0,id,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,collection_id,collection_x,...,vote_average,revenue_previous,vote_previous,Couleur,main_genre,Ratio_Revenu_Note,ratio_growth,first_movie_budget,cumulative_revenue,ROI_progressif
8549,277,Underworld,2003-09-19,95708457,122,"[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",2326,Underworld Collection,...,6.8,,,Gris,Fantasy,22.782614,,22000000,95708457,3.4
8550,834,Underworld: Evolution,2006-01-12,111476513,106,"[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",2326,Underworld Collection,...,6.6,95708457.0,6.8,Rouge,Fantasy,8.1149,-64.381174,22000000,207184970,8.4
8551,12437,Underworld: Rise of the Lycans,2009-01-22,92158961,92,"[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",2326,Underworld Collection,...,6.5,111476513.0,6.6,Bleu,Fantasy,10.615236,30.811667,22000000,299343931,12.6
8552,52520,Underworld: Awakening,2012-01-19,160112671,88,"[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",2326,Underworld Collection,...,6.3,92158961.0,6.5,Rouge,Fantasy,8.11014,-23.599054,22000000,459456602,19.9
8553,346672,Underworld: Blood Wars,2016-11-24,81093313,91,"[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",2326,Underworld Collection,...,5.9,160112671.0,6.3,Bleu,Fantasy,7.770016,-4.193821,22000000,540549915,23.6
