In [13]:
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick # For percentage formatting
import pandas as pd
import seaborn as sns # For despine
from matplotlib.gridspec import GridSpec

In [14]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

In [15]:
# Project plotting theme; importlib is only needed for the reload below.
import importlib
import plot_style

# Re-execute the module so edits made to plot_style.py since the kernel
# started are picked up, then switch the custom style on.
importlib.reload(plot_style)
plot_style.activate_plot_style()


Custom plot style 'publication_quality_plots' applied.


In [16]:
# Genres the analysis concentrates on; the list is owned by the styling module.
FOCUS_GENRES = plot_style.MOVIE_GENRES

# Raw movie table, read from the parquet file next to the notebook folder.
df = pd.read_parquet(path="../data/movies.parquet")

In [17]:
# Sanity check: (rows, columns) of the freshly loaded table.
df.shape

(714015, 9)

In [18]:
# Show the available columns, then the genre list used throughout the analysis.
print(df.columns, FOCUS_GENRES, sep="\n")

Index(['title_id', 'title', 'original_title', 'release_year',
       'runtime_minutes', 'genre', 'imdb_rating', 'vote_count',
       'production_country'],
      dtype='object')
['Drama', 'Comedy', 'Documentary', 'Romance', 'Action', 'Crime', 'Thriller', 'Horror', 'Adventure', 'Mystery']


In [19]:
# Clean and explode genres.
# `title_id` is carried along because titles are not unique: two different movies
# released in the same year under the same title must still count as two movies.
df_clean = (
    df[['title_id', 'title', 'release_year', 'genre']]
    .dropna()
    # .assign builds a new frame instead of writing into a slice of `df`
    .assign(genre=lambda frame: frame['genre'].astype(str).str.split(','))  # Ensure it's string before split
)
df_exploded = df_clean.explode('genre')  # one row per (movie, genre) pair
df_exploded = df_exploded.assign(genre=df_exploded['genre'].str.strip())  # Clean whitespace

# Filter by FOCUS_GENRES *after* exploding, so a movie is kept if any one of its genres matches
df_exploded = df_exploded[df_exploded['genre'].isin(FOCUS_GENRES)]

# Filter years (both bounds inclusive)
df_exploded = df_exploded[df_exploded['release_year'].between(1930, 2023)]

# Total movies per year (for the bar chart): movies with at least one FOCUS_GENRE.
# A movie with several focus genres appears on several exploded rows, so count
# distinct movies rather than rows - and identify a movie by `title_id`, not by
# its (possibly shared) title.
total_movies_per_year = df_exploded.groupby('release_year')['title_id'].nunique()

# Genre distribution for FOCUS_GENRES: rows = years, columns = genres, values = (movie, genre) pairs
genre_counts = df_exploded.groupby(['release_year', 'genre']).size().unstack(fill_value=0)

# Ensure all FOCUS_GENRES are present as columns, even if a genre never occurs,
# and keep the FOCUS_GENRES order for consistent color mapping
genre_counts = genre_counts.reindex(columns=FOCUS_GENRES, fill_value=0)

# Percentage distribution *among the FOCUS_GENRES*: each row sums to 1
# (a year with a zero total would give NaN, which is mapped to 0)
genre_sum_per_year = genre_counts.sum(axis=1)
genre_percent = genre_counts.div(genre_sum_per_year, axis=0).fillna(0)


In [20]:
# Make "plotly_white" the default template for Plotly figures created from here on.
pio.templates.default = "plotly_white"

In [21]:
# ---- Interactive Plotting with Plotly ----
# Two stacked panels sharing one x-axis:
#   top    - number of movies per year (bars)
#   bottom - share of each focus genre per year (100% stacked area)


def _pad_year_edges(frame, first_year, last_year, pad=0.5):
    """Return a copy of `frame` with its edge rows repeated `pad` years beyond each end.

    The bars are one year wide and centred on the year, so they span
    [year - 0.5, year + 0.5]. Repeating the first/last row at those outer edges
    lets the area chart fill exactly the same horizontal extent as the bars.

    Parameters:
        frame: year-indexed DataFrame containing `first_year` and `last_year`.
        first_year, last_year: index labels of the rows to repeat.
        pad: distance (in years) of the added rows from the edge years.

    Returns:
        A new DataFrame, sorted by index, with two extra rows.
    """
    padded = frame.copy()
    padded.loc[first_year - pad] = frame.loc[first_year]
    padded.loc[last_year + pad] = frame.loc[last_year]
    return padded.sort_index()


# Fail with a clear message instead of an obscure KeyError on NaN further down
if total_movies_per_year.empty:
    raise ValueError("No movies left after filtering - nothing to plot.")

# Create subplots: 2 rows, 1 column, shared x-axis
fig_interactive = make_subplots(
    rows=2, cols=1,
    shared_xaxes=True,
    vertical_spacing=0.03, # Keep it tight as x-axis labels are only on bottom
    row_heights=[0.25, 0.75]
)

# --- Shared x-axis range: half a year beyond the first/last year with data ---
min_year_data = total_movies_per_year.index.min()
max_year_data = total_movies_per_year.index.max()
x_axis_range_start = min_year_data - 0.5
x_axis_range_end = max_year_data + 0.5
x_axis_range = [x_axis_range_start, x_axis_range_end]

# --- Data for the area chart, extended to the edges of the x-axis range ---
genre_percent_plot = _pad_year_edges(genre_percent, min_year_data, max_year_data)
# Absolute counts for the hover text, padded the same way and aligned row by row
# with genre_percent_plot; any year without counts is shown as 0.
genre_counts_plot = (
    _pad_year_edges(genre_counts, min_year_data, max_year_data)
    .reindex(genre_percent_plot.index)
    .fillna(0)
    .astype(float)
)

# Font family taken from the matplotlib style so both figure types match
sans_serif_fonts = plot_style.plt.rcParams['font.sans-serif']
font_family = sans_serif_fonts[0] if sans_serif_fonts else "Arial"

# Ticks only on years that exist in the data: every 5th one for long ranges, else every one
tick_step = 5 if (max_year_data - min_year_data) > 20 else 1
tick_years = total_movies_per_year.index[::tick_step]

# --- Top subplot: Total movie counts (bar chart) ---
fig_interactive.add_trace(
    go.Bar(
        x=total_movies_per_year.index,
        y=total_movies_per_year.values,
        name="Total Movies",
        marker_color='darkgray',
        opacity=0.8,
        width=1.0, # Bar width to 1.0 for tight edge alignment with the explicit range
        hovertemplate="Year: %{x}<br>Total Movies: %{y}<extra></extra>"
    ),
    row=1, col=1
)

# --- Bottom subplot: Stacked percent area chart (using genre_percent_plot) ---
for genre_name in FOCUS_GENRES:
    # Skip a genre that is missing from either table rather than raising a KeyError
    if genre_name not in genre_percent_plot.columns or genre_name not in genre_counts_plot.columns:
        continue

    genre_color = plot_style.GENRE_COLOR_MAP.get(genre_name, '#CCCCCC')  # grey fallback
    fig_interactive.add_trace(
        go.Scatter(
            x=genre_percent_plot.index, # Use the extended x-values
            y=genre_percent_plot[genre_name],
            name=genre_name,
            mode='lines',
            line=dict(width=0.5, color=genre_color),
            stackgroup='one',
            fillcolor=genre_color,
            customdata=genre_counts_plot[genre_name].to_numpy(), # Counts aligned with x
            hovertemplate=f"<b>{genre_name}</b><br>" +
                          "Movies: %{customdata:.0f}<br>" +
                          "Share: %{y:.2%}<extra></extra>"
        ),
        row=2, col=1
    )

# --- Update layout for the entire figure ---
fig_interactive.update_layout(
    # Year span comes from the data so the title cannot drift from what is plotted
    title_text=f"Annual Movie Production and Genre Distribution ({int(min_year_data)}-{int(max_year_data)})",
    title_x=0.5,
    font_family=font_family,
    font_size=plot_style.plt.rcParams['font.size'] + 1,
    hovermode='x unified', # One hover box listing every trace at the hovered x
    plot_bgcolor='white',
    paper_bgcolor='white',
    width=1200,
    height=675,
    hoverlabel=dict( # Customized hover box style
        bgcolor="rgba(255, 255, 255, 0.9)",
        bordercolor="rgba(128, 128, 128, 0.5)",
        font_size=11,
        font_family=font_family
    ),
    bargap=0, # Ensure bars touch if width is 1.0
    legend_title_text='Movie Genres',
    legend_traceorder="normal",
    legend=dict(
        x=1.02,
        y=0.5,
        xanchor='left',
        yanchor='middle',
        bgcolor='rgba(0,0,0,0)',  # Transparent legend background
        bordercolor='rgba(0,0,0,0)', # No legend border
        borderwidth=0
    )
)

# --- Layout for top subplot (Bar Chart) ---
fig_interactive.update_yaxes(
    title_text="Total Movies Produced",
    row=1, col=1,
    gridcolor='rgba(204,204,204,0.5)',
    zerolinecolor='rgba(204,204,204,0.5)'
)
fig_interactive.update_xaxes(
    showticklabels=False,
    row=1, col=1,
    showgrid=False,
    range=x_axis_range,
    autorange=False
)

# --- Layout for bottom subplot (Area Chart) ---
fig_interactive.update_yaxes(
    title_text="Genre Share (%)",
    tickformat=".0%",
    range=[0, 1],
    row=2, col=1,
    gridcolor='rgba(204,204,204,0.5)',
    zerolinecolor='rgba(204,204,204,0.5)'
)

fig_interactive.update_xaxes(
    title_text="Year",
    row=2, col=1,
    range=x_axis_range, # Use the same range
    autorange=False,
    # Ticks sit on whole years from the data, never on the +/-0.5 range edges
    tickvals=tick_years,
    ticktext=[str(int(year)) for year in tick_years],
    tickangle=45,
    showgrid=True,
    gridcolor='rgba(220,220,220,0.5)',
    griddash='dot',
    # Spike line keeps its default colour/dash; only its extent and snapping are set
    spikemode='across',
    spikesnap='cursor'
)

# Adjust margins for legend
fig_interactive.update_layout(margin=dict(l=80, r=150, t=80, b=80))

fig_interactive.show()

In [22]:
# Save the figure as a full HTML page; plotly.js is referenced from the CDN
# rather than embedded in the file.
output_html = Path("../figures/interactive_movie_genre_distribution.html")
# Create the output folder up front so the export also works on a fresh checkout
output_html.parent.mkdir(parents=True, exist_ok=True)
pio.write_html(fig_interactive, str(output_html), include_plotlyjs='cdn', full_html=True)