# Load

In [1]:
import pandas as pd
import numpy as np
import re
import json
import os

from src.data_completion import *
from src.data_preprocessing import *
from src.data_loading import *
from src.data_fetching import *
from src.data_visualization import *

import statsmodels.formula.api as smf

from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go


%load_ext autoreload
%autoreload 2

In [3]:

# # Example: Create a simple scatter plot
# df = px.data.iris()  # Sample dataset
# fig = px.scatter(df, x="sepal_width", y="sepal_length", color="species", title="Sepal Dimensions")
# fig.show()
# # Save the plot as an HTML file
# fig.write_html("plot.html")

In [2]:
# Load the preprocessed datasets
full_movie_data_preprocessed = pd.read_csv('data/preprocessed/full_movie_data_preprocessed.csv')
full_characters_data_preprocessed = pd.read_csv('data/preprocessed/full_characters_data_preprocessed.csv')
subset_movie_with_full_data_on_lead_actors = pd.read_csv('data/preprocessed/subset_movie_with_full_data_on_lead_actors.csv')
lead_actors_data_on_subset_movie = pd.read_csv('data/preprocessed/lead_actors_data_on_subset_movie.csv')
characters_data_on_subset_movie = pd.read_csv('data/preprocessed/characters_data_on_subset_movie.csv')

# Date columns are converted to datetime after loading, one (frame, column) pair at a time
for frame, date_column in (
    (lead_actors_data_on_subset_movie, 'actor_dob'),
    (characters_data_on_subset_movie, 'actor_dob'),
    (full_movie_data_preprocessed, 'movie_release_date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])


In [3]:
# Work on a copy so the loaded frame is left untouched by the cells below
movie_data_completed = full_movie_data_preprocessed.copy()
# Summary statistics of the numeric and datetime columns
movie_data_completed.describe()

Unnamed: 0,wikipedia_movie_id,movie_release_date,box_office_revenue,runtime,averageRating,numVotes,release_year,adjusted_box_office
count,3493.0,3493,3493.0,3493.0,3493.0,3493.0,3493.0,3493.0
mean,7225853.0,1995-05-02 01:52:57.440595456,41679770.0,107.578214,6.268394,60651.38,1994.808188,90749580.0
min,3746.0,1958-01-29 00:00:00,1.0,61.0,1.7,506.0,1958.0,1.913719
25%,1076567.0,1987-10-11 00:00:00,4106588.0,95.0,5.7,5905.0,1987.0,9125168.0
50%,3058252.0,1996-10-11 00:00:00,14715070.0,104.0,6.4,18507.0,1996.0,32386250.0
75%,9932614.0,2005-06-10 00:00:00,41000000.0,116.0,6.9,59226.0,2005.0,94593130.0
max,36814250.0,2012-11-02 00:00:00,963420400.0,198.0,8.8,2318314.0,2012.0,4022758000.0
std,8926649.0,,79774460.0,18.176518,0.972713,130017.2,11.856645,180633200.0


In [20]:
# Mean number of votes per movie (matches the `mean` row of describe() for numVotes)
movie_data_completed["numVotes"].mean()

60651.37560835958

In [7]:
# List the columns available in the movie table
movie_data_completed.columns

Index(['wikipedia_movie_id', 'freebase_movie_id', 'movie_name',
       'movie_release_date', 'box_office_revenue', 'runtime', 'languages',
       'countries', 'genres', 'imdb_id', 'averageRating', 'numVotes',
       'lead_actor_1', 'lead_actor_2', 'release_year', 'adjusted_box_office'],
      dtype='object')

# Plotly histograms

In [43]:

def plotly_histogram(
    data: pd.DataFrame,
    columns: list,
    titles: list,
    labels: list,
    bins=50,
    log_scale=False,
    kdes=True,
    hue=None,
):
    """Build one Plotly histogram figure per requested column.

    Args:
        data (pd.DataFrame): The dataset to plot.
        columns (list): The columns from the dataset to plot.
        titles (list): The title of each plot, in the same order as `columns`.
        labels (list): The x-axis label of each plot, in the same order as `columns`.
        bins (int or list, optional): Number of bins, shared or per column. Defaults to 50.
        log_scale (bool or list, optional): If true, the x axis is logarithmic and rows whose
            value is not strictly positive are left out of that plot. Defaults to False.
        kdes (bool or list, optional): If true, a marginal box plot is drawn above the
            histogram (the name is historical; no KDE curve is drawn). Defaults to True.
        hue (str or None, optional): Name of the column used for color grouping. Defaults to None.

    Returns:
        list: One figure per entry of `columns`, in the same order.
    """
    n_plots = len(columns)

    # Scalars are broadcast so every per-plot option can be indexed the same way.
    if not isinstance(bins, list):
        bins = [bins] * n_plots

    if not isinstance(log_scale, list):
        log_scale = [log_scale] * n_plots

    if not isinstance(kdes, list):
        kdes = [kdes] * n_plots

    figs = []
    for i, col in enumerate(columns):
        use_log = bool(log_scale[i])

        # A logarithmic axis cannot represent zero or negative values, so those rows are
        # removed before plotting (previously the filter was computed but never applied).
        plot_data = data[data[col] > 0] if use_log else data

        hist_fig = px.histogram(
            plot_data,
            x=col,
            log_x=use_log,
            color=hue,
            nbins=bins[i],
            title=titles[i],
            histnorm="density",
            labels={col: labels[i], "count": "Count"},
            marginal="box" if kdes[i] else None,
        )

        hist_fig.update_layout(
            width=400,
            height=400,
            title_font_size=14,
            title_x=0.5,  # Centers the title horizontally
            title_y=0.8,
            xaxis=dict(title_font_size=12),  # x-axis label font size
            yaxis=dict(title_font_size=12),  # y-axis label font size
        )

        figs.append(hist_fig)

    return figs

def histo_acots_plotly(actors: pd.DataFrame, hue=None):
    """Build the Plotly histograms describing the actors.

    One figure is produced for each of: height, age at movie release and date of birth.

    Args:
        actors (pd.DataFrame): The actors dataset.
        hue (str or None): Name of the column in the dataset for color grouping. Defaults to None.

    Returns:
        list: The figures returned by `plotly_histogram`, in the order listed above.
    """
    # (column, plot title, x-axis label)
    plot_specs = [
        ("actor_height", "Height of the actor", "Height (m)"),
        ("actor_age_at_release", "Age of actor at movie release", "Age (years)"),
        ("actor_dob", "Date of birth of the actor", "Date of birth"),
    ]
    columns = [spec[0] for spec in plot_specs]
    titles = [spec[1] for spec in plot_specs]
    labels = [spec[2] for spec in plot_specs]

    return plotly_histogram(actors, columns, titles, labels, bins=25, hue=hue)


def histo_movies_plotly(
    movies: pd.DataFrame,
    hue: str | None = None,
    axes: list[plt.Axes] = None,
):
    """Build the Plotly histograms describing the movies.

    One figure is produced for each of: runtime, box office revenue (log-scaled
    x axis) and release date.

    Args:
        movies (pd.DataFrame): The movie dataset.
        hue (str | None, optional): Name of the column used for color grouping. Defaults to None.
        axes (list[plt.Axes], optional): Accepted but not used by this function. Defaults to None.

    Returns:
        list: The figures returned by `plotly_histogram`, in the order listed above.
    """
    # (column, plot title, x-axis label, log-scaled x axis?)
    plot_specs = [
        ("runtime", "Runtime of the movie", "Runtime (min)", False),
        ("box_office_revenue", "Box office revenue of the movie", "Box office revenue (dollars, log scale)", True),
        ("movie_release_date", "Release date of the movie", "Release date", False),
    ]

    return plotly_histogram(
        movies,
        [spec[0] for spec in plot_specs],
        [spec[1] for spec in plot_specs],
        [spec[2] for spec in plot_specs],
        bins=50,
        log_scale=[spec[3] for spec in plot_specs],
        hue=hue,
    )


In [7]:
# Show only the first figure (runtime) built by the earlier per-column helper
histo_movies_plotly(movie_data_completed)[0].show() # old plot

## New version for the movie numerical data

In [8]:
# Updated version of the movie histograms: a single 2x2 subplot figure, with the
# box office revenue binned in log10 space.
movie_data_completed["movie_release_year"] = pd.to_datetime(movie_data_completed["movie_release_date"]).dt.year
columns = ["runtime", "box_office_revenue", "movie_release_year"]
titles = [
        "Runtime of the movie",
        "Box office revenue of the movie",
        "Release date of the movie",
]
labels = ["Runtime (min)", "Box office revenue (dollars, log scale)", "Release date"]

# log10 is only defined for strictly positive values, so anything else is left out of that panel
box_office = movie_data_completed[columns[1]]
log_box_office = np.log10(box_office[box_office > 0])

# Three panels are filled; the bottom-right cell of the grid stays empty
fig = make_subplots(rows=2, cols=2, subplot_titles=titles)

fig.add_trace(
    go.Histogram(x=movie_data_completed[columns[0]], nbinsx=50, histnorm='probability'),
    row=1, col=1
)
fig.add_trace(
    go.Histogram(x=log_box_office, nbinsx=50, histnorm='probability'),
    row=1, col=2
)
fig.add_trace(
    go.Histogram(x=movie_data_completed[columns[2]], nbinsx=50, histnorm='probability'),
    row=2, col=1
)

# Runtime panel: a tick every 20 minutes, x axis limited to [60, 200]
fig.update_xaxes(
    tickmode='linear',
    tick0=0,            # Tick positions are multiples of dtick counted from 0
    dtick=20,
    range=[60, 200],
    title_text=labels[0],
    row=1, col=1
)
# histnorm='probability' makes each bar the fraction of movies falling in the bin,
# so the y axes are labelled accordingly.
fig.update_yaxes(title_text='Probability', row=1, col=1, range=[0, 0.2])

# Box office panel: the data are log10 values, the tick labels show the original scale
fig.update_xaxes(
    tickvals=np.log10([10, 1e3, 1e5, 1e7, 1e9]),
    ticktext=["10", "1K", "100K", "10M", "1B"],
    title_text=labels[1],
    row=1, col=2
)
fig.update_yaxes(title_text='Probability', row=1, col=2)

# Release year panel: a tick every 10 years over a fixed range of plain years
fig.update_xaxes(
    tickmode='linear',
    dtick=10,
    range=[1950, 2020],
    title_text=labels[2],
    row=2, col=1
)
fig.update_yaxes(title_text='Probability', row=2, col=1)

fig.update_layout(height=700, width=800, showlegend=False, title_text="Numerical features of the movies")

# Show the figure and export it as a standalone HTML page
fig.show()
fig.write_html("movies_numeric_features.html")

# Actors

In [9]:
# Demographic characteristics of lead actors
figs_actor = histo_acots_plotly(lead_actors_data_on_subset_movie)
for actor_fig in figs_actor:
    actor_fig.show()

# Only the first two figures (height, age) are exported here
for actor_fig, html_name in zip(figs_actor, ("height_actor.html", "age_actor.html")):
    actor_fig.write_html(html_name)


In [10]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

def count_plots(
    data,
    columns,
    titles,
    labels,
    cutoffs=None,
    horizontal=True,
    transforms=None,
    hue=None,
    axes=None,
    layouts=None,
):
    """Build one Plotly bar chart of value counts per requested column.

    Args:
        data (pd.DataFrame): The dataset to plot.
        columns (list): The columns from the dataset to count.
        titles (list): The title of each plot, in the same order as `columns`.
        labels (list): The category-axis label of each plot, in the same order as `columns`.
        cutoffs (int, list or None, optional): Keep only the N most frequent values,
            shared or per column. A falsy value keeps everything. Defaults to None.
        horizontal (bool or list, optional): Draw horizontal bars, shared or per column.
            Defaults to True.
        transforms (callable, list or None, optional): Function applied to the column
            before counting (for example to explode list-valued cells), shared or per
            column. Defaults to None.
        hue (str or None, optional): Accepted but not used by this function. Defaults to None.
        axes (list or None, optional): Existing list the figures are appended to; a new
            list is created when None. Defaults to None.
        layouts (dict, list or None, optional): Extra `update_layout` keyword arguments,
            shared or per column, applied on top of the compact default layout. This
            replaces editing the function body to switch between layouts, e.g.
            `layouts=[None, dict(width=None, height=300, margin=dict(l=120))]`.
            Defaults to None.

    Returns:
        list: `axes` with one figure appended per entry of `columns`.
    """
    if axes is None:
        axes = []

    n_plots = len(columns)

    # Scalars are broadcast so every per-plot option can be indexed the same way.
    if not isinstance(cutoffs, list):
        cutoffs = [cutoffs] * n_plots

    if not isinstance(transforms, list):
        transforms = [transforms] * n_plots

    if not isinstance(horizontal, list):
        horizontal = [horizontal] * n_plots

    if not isinstance(layouts, list):
        layouts = [layouts] * n_plots

    for i, col in enumerate(columns):
        transform = transforms[i]
        # Counting the transformed Series directly gives the same counts as merging it
        # back onto `data` first, and does not depend on the name the transform returns.
        values = transform(data[col]) if transform else data[col]

        col_counts = values.value_counts().reset_index()
        col_counts.columns = [col, "count"]

        cutoff = cutoffs[i]
        if cutoff:
            # value_counts() is sorted by decreasing frequency, so this keeps the top N
            col_counts = col_counts[:cutoff]

        bar_labels = {col: labels[i], "count": "Count"}
        if horizontal[i]:
            fig = px.bar(
                col_counts,
                y=col,
                x="count",
                orientation="h",
                title=titles[i],
                labels=bar_labels,
            )
        else:
            fig = px.bar(
                col_counts,
                x=col,
                y="count",
                title=titles[i],
                labels=bar_labels,
            )

        # Compact default layout, overridden key by key by the caller's `layouts` entry
        layout = dict(title_font_size=16, title_x=0.5, width=250, height=350)
        layout.update(layouts[i] or {})
        fig.update_layout(**layout)
        axes.append(fig)

    return axes

def count_actors(
    actors,
    hue=None,
    axes=None,
):
    """Build the count plots describing the actors.

    Produces a vertical bar chart of the gender counts and a horizontal bar chart of
    the 20 most frequent ethnicity labels.

    Args:
        actors (pd.DataFrame): The actors dataset.
        hue (str or None, optional): Forwarded to `count_plots`. Defaults to None.
        axes (list or None, optional): Forwarded to `count_plots`. Defaults to None.

    Returns:
        list: The figures returned by `count_plots`.
    """
    # (column, plot title, axis label, cutoff, horizontal bars?)
    plot_specs = [
        ("actor_gender", "Actor gender distribution", "Gender", None, False),
        ("actor_ethnicity_label", "Most common ethnicities", "Ethnicity", 20, True),
    ]
    columns, titles, labels, cutoffs, horizontal = (list(field) for field in zip(*plot_specs))

    return count_plots(
        actors,
        columns,
        titles,
        labels,
        cutoffs,
        horizontal=horizontal,
        hue=hue,
        axes=axes,
    )


figures = count_actors(lead_actors_data_on_subset_movie)
for fig in figures:
    fig.show()

# Only the gender figure is exported; the ethnicity export is kept disabled
#figures[1].write_html("actor_ethnicity.html")
figures[0].write_html("actor_gender.html")

# It is expected that the first plot looks broken when the second plot is drawn with all the ethnicity names.

# Pie charts

In [11]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def count_pie_plots_with_subtitles(
    data1,
    data2,
    column,
    title,
    label,
    cutoff=None,
):
    """Draw two side-by-side pie charts of the value counts of one column.

    The left pie is titled "All Actors" and built from `data1`; the right pie is
    titled "Lead Actors" and built from `data2`.

    Args:
        data1 (pd.DataFrame): Dataset for the left pie.
        data2 (pd.DataFrame): Dataset for the right pie.
        column (str): Column whose values are counted.
        title (str): Title of the whole figure.
        label: Accepted but not used by this function.
        cutoff (int or None, optional): Keep only the N most frequent values of each
            dataset; a falsy value keeps everything. Defaults to None.

    Returns:
        The figure holding both pies.
    """
    panels = (("All Actors", data1), ("Lead Actors", data2))

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{'type': 'domain'}, {'type': 'domain'}]],
        subplot_titles=[panel_name for panel_name, _ in panels]
    )

    for position, (panel_name, frame) in enumerate(panels, start=1):
        counts = frame[column].value_counts().reset_index()
        counts.columns = [column, 'count']
        if cutoff:
            # value_counts() is sorted by decreasing frequency, so this keeps the top N
            counts = counts[:cutoff]

        pie = go.Pie(labels=counts[column], values=counts['count'], name=panel_name)
        fig.add_trace(pie, row=1, col=position)

    fig.update_layout(
        title_text=title,
        title_x=0.5,
        template="plotly_white"
    )
    return fig

# Ten most frequent ethnicity labels: all characters' actors (left) vs. lead actors (right)
fig_ethnicity = count_pie_plots_with_subtitles(
    data1=characters_data_on_subset_movie,
    data2=lead_actors_data_on_subset_movie,
    column="actor_ethnicity_label",
    title="Most Common Ethnicities",
    label="Ethnicity",
    cutoff=10,
)
fig_ethnicity.show()
fig_ethnicity.write_html("pie_actor_ethnicity.html")


In [12]:
# Overlay the "all actors" histograms on top of the lead-actor ones, figure by figure
figs_actor = histo_acots_plotly(lead_actors_data_on_subset_movie)
figs_all_actor = histo_acots_plotly(characters_data_on_subset_movie)

colors_actor = "blue"
colors_all_actor = "rgba(255, 165, 0, 0.3)"

overlayed_figs = []

for i, (fig, all_actor_fig) in enumerate(zip(figs_actor, figs_all_actor)):
    # Only the first figure (height) carries the legend
    with_legend = i == 0

    for trace in fig.data:
        trace.marker.color = colors_actor
        trace.name = "Lead Actors"
        trace.showlegend = with_legend

    # Restyle the "all actors" traces and move them onto the lead-actor figure
    for trace in all_actor_fig.data:
        trace.marker.color = colors_all_actor
        trace.name = "All Actors"
        trace.showlegend = with_legend
        fig.add_trace(trace)

    # Reserve roughly 8 pixels per character of the longest legend entry, then split
    # what is left of 750 pixels into two equal plot widths
    label_width = 8 * max(len(trace.name) for trace in fig.data)
    plot_width = (750 - label_width) // 2

    fig.update_layout(
        barmode="overlay",
        legend_title="Dataset" if with_legend else None,
        showlegend=with_legend,
        # The figure with the legend gets the extra room the legend needs
        width=plot_width + label_width if with_legend else plot_width,
        legend=dict(
            x=1.05,  # Slightly outside the plot area
            y=0.5,   # Center vertically
            xanchor="left",
            yanchor="middle",
        ) if with_legend else None,
    )
    overlayed_figs.append(fig)

# Display the updated figures
for fig in overlayed_figs:
    fig.show()

# Save figures as HTML (the overlays were added to the figs_actor figures in place)
for fig, html_name in zip(
    figs_actor,
    ("height_actor_lead.html", "age_actor_lead.html", "actor_dob.html"),
):
    fig.write_html(html_name)


In [13]:
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def compare_gender_distribution_pie_subplot(characters_data, lead_actors_data):
    """Show and export side-by-side pie charts of the gender shares.

    The left pie ("All Actors") uses `characters_data`, the right pie ("Lead Actors")
    uses `lead_actors_data`. Shares are percentages of the non-missing
    `actor_gender` values of each dataset. The figure is displayed and written to
    "pie_actor_gender.html"; nothing is returned.

    Args:
        characters_data (pd.DataFrame): Dataset with an `actor_gender` column.
        lead_actors_data (pd.DataFrame): Dataset with an `actor_gender` column.
    """
    gender_dist_all = characters_data['actor_gender'].value_counts(normalize=True) * 100
    gender_dist_lead = lead_actors_data['actor_gender'].value_counts(normalize=True) * 100

    # Sort the union of labels: iterating a bare set of strings can give a different
    # order on every interpreter run, which would shuffle slice order and colours
    # between executions of the notebook.
    labels = sorted(set(gender_dist_all.index) | set(gender_dist_lead.index), key=str)

    fig_gender = make_subplots(
        rows=1, cols=2,
        specs=[[{'type': 'domain'}, {'type': 'domain'}]],
        subplot_titles=["All Actors", "Lead Actors"]
    )

    panels = (("All Actors", gender_dist_all), ("Lead Actors", gender_dist_lead))
    for position, (panel_name, distribution) in enumerate(panels, start=1):
        fig_gender.add_trace(
            go.Pie(
                labels=labels,
                # A label absent from one dataset gets a 0% slice in that pie
                values=distribution.reindex(labels, fill_value=0),
                name=panel_name
            ),
            row=1, col=position
        )

    fig_gender.update_layout(
        title_text="Comparison of Gender Distribution",
        title_x=0.5,
        template="plotly_white"
    )
    fig_gender.show()
    fig_gender.write_html("pie_actor_gender.html")



# Gender shares: actors of all characters vs. lead actors only (shows and exports the figure)
compare_gender_distribution_pie_subplot(characters_data_on_subset_movie, lead_actors_data_on_subset_movie)



# Ratings

In [32]:
def histogram_movie_ratings_plotly(
    movies: pd.DataFrame,
    hue: str | None = None,
):
    """Build the Plotly histograms of the movie ratings.

    One figure is produced for the average rating (12 bins) and one for the number
    of votes (50 bins, log-scaled x axis).

    Args:
        movies (pd.DataFrame): The movie dataset.
        hue (str | None, optional): Name of the column in the dataset for color grouping. Defaults to None.

    Returns:
        list: The figures returned by `plotly_histogram`, in the order listed above.
    """
    # Per-plot settings, keyed by the column they apply to
    rating_plot = dict(title="Average rating of the movie", label="Average rating", log=False, bins=12, kde=True)
    votes_plot = dict(title="Number of votes for the movie", label="Number of votes (log scale)", log=True, bins=50, kde=True)
    plots = {"averageRating": rating_plot, "numVotes": votes_plot}

    return plotly_histogram(
        movies,
        list(plots),
        [spec["title"] for spec in plots.values()],
        [spec["label"] for spec in plots.values()],
        bins=[spec["bins"] for spec in plots.values()],
        log_scale=[spec["log"] for spec in plots.values()],
        kdes=[spec["kde"] for spec in plots.values()],
        hue=hue,
    )


In [44]:
figs = histogram_movie_ratings_plotly(movie_data_completed)
# Save the plot as an HTML file
# figs.write_html("hist_movies.html")
for ratings_fig in figs:
    ratings_fig.show()

In [None]:
# Updated version for ratings movies with subplot + logscale
columns = ["averageRating", "numVotes"]
titles = ["Average rating of the movie", "Number of votes for the movie"]
labels = ["Average rating", "Number of votes (log scale)"]
log_scale = [False, True]
bins = [12, 50]
kdes = [False, True]

fig = make_subplots(rows=1, cols=2, subplot_titles=titles)

# Left panel: raw ratings. Right panel: vote counts binned in log10 space.
rating_values = movie_data_completed[columns[0]]
log_vote_counts = np.log10(movie_data_completed[columns[1]])
fig.add_trace(go.Histogram(x=rating_values, nbinsx=12), row=1, col=1)
fig.add_trace(go.Histogram(x=log_vote_counts, nbinsx=50), row=1, col=2)

# Rating axis: fixed [0, 10] range with a tick every 2 points
fig.update_xaxes(title_text=labels[0], row=1, col=1, tick0=0, dtick=2, range=[0, 10])
fig.update_yaxes(title_text='Count', row=1, col=1)

# Votes axis: ticks sit at log10 positions but are labelled on the original scale
vote_tick_values = [1, 10, 100, 1e3, 1e4, 1e5, 1e6]
vote_tick_labels = ["1", "10", "100", "1K", "10K", "100K", "1M"]
fig.update_xaxes(
    tickvals=np.log10(vote_tick_values),
    ticktext=vote_tick_labels,
    title_text=labels[1],
    row=1, col=2
)
fig.update_yaxes(title_text='Count', row=1, col=2)

fig.update_layout(height=400, width=750, showlegend=False, title_text="Ratings of the movies")

# Show the figure and export it
fig.show()
fig.write_html("ratings.html")

# Count movies

In [16]:
def count_movies_plotly(
    movies: pd.DataFrame,
    hue: str | None = None,
):
    """Build the Plotly count plots for the movie genres, languages and countries.

    Each of the three columns is evaluated and exploded before counting, and only
    the most frequent values are kept (20 genres, 10 languages, 10 countries).

    Args:
        movies (pd.DataFrame): The movie dataset.
        hue (str | None, optional): Name of the column in the dataset. Defaults to None.

    Returns:
        list[plotly.graph_objects.Figure]: List of plotly figures.
    """

    def explode_evaluated(col):
        # NOTE(review): eval() executes whatever expression each cell contains; this is
        # only acceptable while the CSV is produced by this project. Presumably the
        # cells are stringified Python lists — confirm before switching to a safer parser.
        return col.apply(eval).explode()

    return count_plots_plotly(
        movies,
        ["genres", "languages", "countries"],
        ["Most common genres", "Most common languages", "Most common countries"],
        ["Genres", "Languages", "Countries"],
        [20, 10, 10],
        transforms=explode_evaluated,
        hue=hue,
    )

def count_plots_plotly(
    data: pd.DataFrame,
    columns: list[str],
    titles: list[str],
    labels: list[str],
    cutoffs: list[int] | int | None = None,
    transforms=None,
    hue: str | None = None,
):
    """Plot count plots for the specified columns using Plotly.

    Args:
        data (pd.DataFrame): The dataset to plot.
        columns (list[str]): The columns from the dataset to plot.
        titles (list[str]): The titles for each plot.
        labels (list[str]): The x labels for each plot.
        cutoffs (list[int] | int | None, optional): The cutoffs. Defaults to None.
        transforms ([type], optional): Transformation function for columns. Defaults to None.
        hue (str | None, optional): Name of the column in the dataset. Defaults to None.

    Returns:
        list[plotly.graph_objects.Figure]: List of plotly figures.
    """
    figures = []

    if not isinstance(cutoffs, list):
        cutoffs = [cutoffs] * len(columns)

    if not isinstance(transforms, list):
        transforms = [transforms] * len(columns)

    for i, col in enumerate(columns):
        transform = transforms[i]
        if transform:
            col_data = transform(data[col])
            col_data = col_data.value_counts().reset_index(name="count")
            col_data = col_data.rename(columns={"index": col})
        else:
            col_data = data[col].value_counts().reset_index(name="count")
            col_data = col_data.rename(columns={"index": col})

        cutoff = cutoffs[i]
        if cutoff:
            col_data = col_data.head(cutoff)

        col_data = col_data.sort_values(by="count", ascending=True)

        fig = px.bar(
            col_data,
            x="count",
            y=col,
            orientation="h",
            title=titles[i],
            labels={"count": "Count", col: labels[i]},
        )

        fig.update_layout(
            xaxis_title="Count",
            yaxis_title=labels[i],
            showlegend=False, 
            title_x=0.5, title_y=0.85, #change title location
            width = 700, height = 450, #width = 450, height = 350, #smaller -> for language & countries
            title_font_size=14, xaxis=dict(
            title_font_size=12,  # Change x-axis label font size
            ), yaxis=dict( tickfont=dict(size=10),
            title_font_size=12,  # Change x-axis label font size
            ),
        )

        figures.append(fig)

    return figures


In [17]:
figs = count_movies_plotly(movie_data_completed)
for movie_count_fig in figs:
    movie_count_fig.show()

# Only the genre figure is exported; the other two exports are kept disabled
figs[0].write_html("genres_count.html")
# figs[1].write_html("language_count.html")
# figs[2].write_html("countries_count.html")

In [6]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

def _average_box_office_by_year(data, top_n=None):
    """Average adjusted and unadjusted box office per release year, in millions.

    When `top_n` is given, only the `top_n` movies with the highest unadjusted
    `box_office_revenue` of each year enter the average.
    """
    revenue_columns = ['adjusted_box_office', 'box_office_revenue']

    if top_n is not None:
        # Sort once, then keep the first rows of every year. This avoids
        # groupby().apply(), whose passing of the grouping column to the applied
        # function is deprecated in recent pandas. The stable sort keeps tied movies
        # in their original row order.
        data = (
            data.sort_values('box_office_revenue', ascending=False, kind='stable')
            .groupby('release_year')
            .head(top_n)
        )

    averages = data.groupby('release_year')[revenue_columns].mean() / 1e6
    return averages.reset_index()


def inflation_plots(movie_inflation_data):
    """Show and export the yearly average box office, adjusted vs. unadjusted.

    The left panel averages the 10 highest-grossing movies of each year (ranked by
    unadjusted revenue); the right panel averages all movies. Values are in millions.
    The figure is displayed and written to "inflation.html"; nothing is returned.

    Args:
        movie_inflation_data (pd.DataFrame): Movie dataset with `release_year`,
            `box_office_revenue` and `adjusted_box_office` columns.
    """
    # (panel title, per-year averages), in left-to-right order
    panels = [
        ("Top 10 Movies of Each Year", _average_box_office_by_year(movie_inflation_data, top_n=10)),
        ("All Movies", _average_box_office_by_year(movie_inflation_data)),
    ]
    # (column, trace name, colour), shared by both panels
    series_styles = [
        ('adjusted_box_office', 'Adjusted', '#6699CC'),
        ('box_office_revenue', 'Unadjusted', 'orange'),
    ]

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=tuple(panel_title for panel_title, _ in panels),
        horizontal_spacing=0.15
    )

    for panel_col, (_, averages) in enumerate(panels, start=1):
        for column, trace_name, color in series_styles:
            fig.add_trace(go.Scatter(
                x=averages['release_year'],
                y=averages[column],
                mode='lines+markers',
                name=trace_name,
                marker=dict(color=color),
                # Colours mean the same in both panels, so one pair of legend entries is enough
                showlegend=(panel_col == 1)
            ), row=1, col=panel_col)

        fig.update_xaxes(title_text="Release Year", row=1, col=panel_col)
        # Both panels use the same fixed y range
        fig.update_yaxes(title_text="Revenue (Millions)", row=1, col=panel_col, range=[-5, 660])

    fig.update_layout(
        title="Average Box Office of Movies Over Time",
        template="plotly_white",
    )

    fig.show()
    fig.write_html("inflation.html")

# Show and export the adjusted vs. unadjusted box office comparison for the movie table
inflation_plots(movie_data_completed)







In [18]:
import plotly.express as px
import pandas as pd

def plot_box_office_by_genre(movies: pd.DataFrame, top_n: int = 10):
    """Plot the total adjusted box office revenue of the highest-grossing genres.

    A movie with several genres contributes its full revenue to each of them, so the
    totals of different genres overlap.

    Args:
        movies (pd.DataFrame): The movie dataset with `genres` and
            `adjusted_box_office` columns.
        top_n (int, optional): Number of genres to show. Defaults to 10.

    Returns:
        plotly.graph_objects.Figure: The plotly bar figure showing total revenue per genre.
    """
    # One row per (movie, genre) pair, carrying the movie's revenue.
    # NOTE(review): eval() executes whatever expression each cell contains; this is
    # only acceptable while the CSV is produced by this project. Presumably the cells
    # are stringified Python lists — confirm before switching to a safer parser.
    genre_revenue = (
        movies[["adjusted_box_office"]]
        .assign(genre=movies["genres"].apply(eval))
        .explode("genre")
    )

    # Aggregating box office revenue by genre
    totals = genre_revenue.groupby("genre")["adjusted_box_office"].sum().reset_index()

    # Keep the top genres, then reverse them: horizontal bars are drawn bottom-up in
    # row order, so ascending totals put the highest-grossing genre at the top, like
    # the count plots built by count_plots_plotly.
    top_genres = (
        totals.sort_values(by="adjusted_box_office", ascending=False)
        .head(top_n)
        .iloc[::-1]
    )

    fig = px.bar(
        top_genres,
        x="adjusted_box_office",
        y="genre",
        orientation="h",
        title=f"Top {top_n} Genres by (Adjusted) Box Office Revenue",
        labels={"adjusted_box_office": "Total Adjusted Box Office Revenue", "genre": "Genre"}
    )

    fig.update_layout(
        xaxis_title="Total Adjusted Box Office Revenue",
        yaxis_title="Genre",
        showlegend=False,
        title_x=0.5, title_y=0.9,  # Adjust title location
        width=700, height=400,  # Adjust size of the plot
        title_font_size=16, xaxis=dict(title_font_size=12),
        yaxis=dict(tickfont=dict(size=10), title_font_size=12),
    )

    return fig


In [19]:
# Show the top-genre revenue chart and export it as a standalone HTML page
fig = plot_box_office_by_genre(movie_data_completed)
fig.show()
fig.write_html("adjBO_vs_genres.html")
# Disabled exports; `figs` is the list of count figures built in an earlier cell
# figs[1].write_html("language_count.html")
# figs[2].write_html("countries_count.html")