In [1]:
import pandas as pd
import numpy as np # For sample data generation
import plotly.express as px
import plotly.graph_objects as go # For more detailed layout control
import random
import plot_style

In [2]:
# Load the movies dataset from the project's data folder. The path is relative,
# so it is resolved against the directory the notebook is run from.

df = pd.read_parquet("../data/movies_with_release_dates_status_filtered.parquet") # Adjust path if needed

In [3]:
# Display the column index of the loaded frame to check the available fields.
df.columns

Index(['title_id', 'title', 'original_title', 'release_year',
       'runtime_minutes', 'genre', 'imdb_rating', 'vote_count',
       'production_country', 'release_date_full', 'fetch_status'],
      dtype='object')

In [4]:
# Display (row count, column count) of the loaded frame.
df.shape

(714015, 11)

In [5]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# --- 1. Import Style ---
# Take the genre list and palette from plot_style when it can be imported;
# otherwise fall back to the built-in ten-genre palette defined here.
try:
    import plot_style
    MOVIE_GENRES = plot_style.MOVIE_GENRES
    GENRE_COLOR_MAP = plot_style.GENRE_COLOR_MAP
except ImportError:
    # Fallback palette: one hex colour per genre, in display order.
    GENRE_COLOR_MAP = {
        'Drama': '#003f5c',
        'Comedy': '#f9a602',
        'Documentary': '#7a7a7a',
        'Romance': '#ff7b9c',
        'Action': '#ef562f',
        'Crime': '#2f4f4f',
        'Thriller': '#008080',
        'Horror': '#8b0000',
        'Adventure': '#556b2f',
        'Mystery': '#6a0dad',
    }
    # Dicts keep insertion order, so both lists follow the order written above.
    MOVIE_GENRES = list(GENRE_COLOR_MAP.keys())
    GENRE_COLORS_HEX = list(GENRE_COLOR_MAP.values())

# --- 2. Load Data ---
# Read the movie dataset; the path is relative to the working directory.
try:
    df = pd.read_parquet("../data/movies_with_release_dates_status_filtered.parquet")
except Exception as e:
    # Print a short message, then re-raise so the cell fails with a full
    # traceback. Calling exit() from a notebook cell asks the kernel to shut
    # down, which discards all state instead of just stopping this cell.
    print(f"Failed to load DataFrame: {e}")
    raise

# --- 3. Data Processing ---
def extract_primary_genre(genres_data, valid_genres):
    """Return the first genre in ``genres_data`` that is listed in ``valid_genres``.

    Args:
        genres_data: Either a comma-separated string of genre names, an
            iterable of genre names (list, tuple, array, ...), or a missing
            value (None / NaN / pd.NA).
        valid_genres: Container of accepted genre names; membership is tested
            with ``in`` after stripping surrounding whitespace from each name.

    Returns:
        The first matching genre name with whitespace stripped, or None when
        the input is missing or contains no accepted genre.
    """
    if isinstance(genres_data, str):
        candidates = genres_data.split(',')
    elif pd.api.types.is_scalar(genres_data) and pd.isna(genres_data):
        # Only scalars are tested for missingness: pd.isna() on a list or
        # array returns an array, whose truth value is ambiguous.
        return None
    else:
        candidates = genres_data

    for candidate in candidates:
        # Skip non-string entries (e.g. NaN inside a list) instead of failing
        # on .strip().
        if not isinstance(candidate, str):
            continue
        name = candidate.strip()
        if name in valid_genres:
            return name
    return None

# Tag every movie with the first of its genres that appears in MOVIE_GENRES
# (None when none of its genres is listed).
df['primary_genre'] = df['genre'].apply(lambda x: extract_primary_genre(x, MOVIE_GENRES))

# Build the plotting frame as a new object. Assigning columns onto a filtered
# slice of df is what triggers pandas' SettingWithCopyWarning; .assign()
# returns a fresh frame instead.
df_plot = (
    df.dropna(subset=['primary_genre'])
    .loc[lambda frame: frame['primary_genre'].isin(MOVIE_GENRES)]
    .assign(
        # Values that cannot be parsed as numbers become NaN.
        imdb_rating=lambda frame: pd.to_numeric(frame['imdb_rating'], errors='coerce'),
        runtime_minutes=lambda frame: pd.to_numeric(frame['runtime_minutes'], errors='coerce'),
    )
)

# One row per genre: mean rating, mean runtime and number of movies. Genres
# whose mean rating or mean runtime is NaN are dropped by the final dropna().
genre_summary = (
    df_plot.groupby('primary_genre')
    .agg(
        avg_rating=('imdb_rating', 'mean'),
        avg_runtime=('runtime_minutes', 'mean'),
        movie_count=('primary_genre', 'count'),
    )
    .reset_index()
    .dropna()
)

# --- 4. Sorting + Range Limits ---
# One descending ordering of the genre summary per metric shown in the chart.
genre_summary_sorted_rating, genre_summary_sorted_runtime = (
    genre_summary.sort_values(metric, ascending=False)
    for metric in ('avg_rating', 'avg_runtime')
)

# Largest bar height per metric; used later to size the y-axis range.
# Sorting does not change the maximum, so it is read from the summary itself.
y_max_rating = genre_summary['avg_rating'].max()
y_max_runtime = genre_summary['avg_runtime'].max()

# --- 5. Initial Figure ---
# The chart opens on the rating view: one bar per genre, ordered by average
# rating, coloured from GENRE_COLOR_MAP (grey when a genre has no colour).
fig = go.Figure(
    data=[
        go.Bar(
            x=genre_summary_sorted_rating['primary_genre'],
            y=genre_summary_sorted_rating['avg_rating'],
            text=genre_summary_sorted_rating['avg_rating'].round(1),
            textposition='outside',
            marker=dict(
                color=genre_summary_sorted_rating['primary_genre']
                .map(lambda genre: GENRE_COLOR_MAP.get(genre, '#cccccc'))
                .tolist()
            ),
            # Per-bar columns [rating, runtime, count], read by the hover
            # template below as customdata[0..2].
            customdata=np.stack(
                [
                    genre_summary_sorted_rating['avg_rating'],
                    genre_summary_sorted_rating['avg_runtime'],
                    genre_summary_sorted_rating['movie_count'],
                ],
                axis=1,
            ),
            hovertemplate=(
                "<b>%{x}</b><br><br>"
                "Average IMDb Rating: %{customdata[0]:.1f}<br>"
                "Average Runtime: %{customdata[1]:.0f} min<br>"
                "Number of Movies: %{customdata[2]:,}<extra></extra>"
            ),
        )
    ]
)

# --- 6. Buttons ---
def _trace_update(summary, value_col, bar_labels):
    """Build the trace half of an ``update`` button payload for one ordering.

    Args:
        summary: Per-genre summary frame, already sorted in display order,
            with 'primary_genre', 'avg_rating', 'avg_runtime' and
            'movie_count' columns.
        value_col: Column of ``summary`` plotted on the y axis.
        bar_labels: Text shown on each bar, in the same order as ``summary``.

    Returns:
        dict of trace attributes. Every value is wrapped in a one-element
        list, one entry for the figure's single bar trace.
    """
    genres = summary['primary_genre'].tolist()
    return {
        "x": [genres],
        "y": [summary[value_col].tolist()],
        "text": [bar_labels],
        # Per-bar columns [rating, runtime, count] consumed by the hover template.
        "customdata": [np.stack((
            summary['avg_rating'],
            summary['avg_runtime'],
            summary['movie_count']
        ), axis=-1)],
        "marker.color": [[GENRE_COLOR_MAP.get(g, '#cccccc') for g in genres]]
    }


# Each button swaps the bar data (first args entry) and the matching axis
# title, range, figure title and category order (second args entry).
update_buttons = [
    dict(
        label="Show Average IMDb Rating",
        method="update",
        args=[
            _trace_update(
                genre_summary_sorted_rating,
                'avg_rating',
                genre_summary_sorted_rating['avg_rating'].round(1).tolist()
            ),
            {
                "yaxis.title.text": "Average IMDb Rating",
                # Same 15% headroom as the initial layout, so returning to
                # this view restores exactly the range the figure opened with.
                "yaxis.range": [0, y_max_rating * 1.15],
                "title.text": "Average IMDb Rating by Movie Genre",
                "xaxis.categoryarray": genre_summary_sorted_rating['primary_genre'].tolist()
            }
        ]
    ),
    dict(
        label="Show Average Runtime",
        method="update",
        args=[
            _trace_update(
                genre_summary_sorted_runtime,
                'avg_runtime',
                (genre_summary_sorted_runtime['avg_runtime'].round(0).astype(int).astype(str) + " min").tolist()
            ),
            {
                "yaxis.title.text": "Average Runtime (Minutes)",
                "yaxis.range": [0, y_max_runtime * 1.15],
                "title.text": "Average Runtime by Movie Genre",
                "xaxis.categoryarray": genre_summary_sorted_runtime['primary_genre'].tolist()
            }
        ]
    )
]

# --- 7. Layout ---
# All layout settings, including the final canvas size, are applied in a
# single call so that no value is set and then silently overridden later.
fig.update_layout(
    # Horizontal button row centred above the plot; the first button
    # (rating view) is marked active to match the initial trace.
    updatemenus=[
        dict(
            type="buttons",
            direction="right",
            active=0,
            buttons=update_buttons,
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.5, xanchor="center",
            y=1.22, yanchor="top"
        )
    ],
    title=dict(
        text="Average IMDb Rating by Movie Genre",
        font_size=20,
        x=0.5, xanchor='center',
        font_color='black'
    ),
    font=dict(family='Arial, Helvetica, sans-serif', size=12, color='black'),
    xaxis=dict(
        title_text="Movie Genre",
        title_font_size=15,
        tickfont_size=11,
        showgrid=False,
        # Pin the category order to the rating ordering; the buttons replace
        # categoryarray when the view changes.
        categoryarray=genre_summary_sorted_rating['primary_genre'].tolist(),
        categoryorder='array'
    ),
    yaxis=dict(
        title_text="Average IMDb Rating",
        title_font_size=15,
        tickfont_size=11,
        showgrid=True,
        gridcolor='#cccccc',
        gridwidth=0.5,
        griddash='dash',
        zeroline=False,
        # 15% headroom above the tallest bar, leaving room for the labels
        # drawn outside the bars.
        range=[0, y_max_rating * 1.15]
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    # Large top margin leaves room for the title and the button row.
    margin=dict(l=80, r=40, t=170, b=40),
    # Fixed canvas size. 1200 x 605 px is roughly 2:1; a true 16:9 canvas at
    # this width would be 675 px tall.
    autosize=False,
    width=1200,
    height=605,
    bargap=0.2,
    hoverlabel=dict(
        bgcolor="rgba(255,255,255,0.9)",
        font_size=13,
        font_family='Arial, Helvetica, sans-serif',
        bordercolor="rgba(128,128,128,0.5)"
    ),
    showlegend=False,
    transition_duration=700,
    transition_easing='cubic-in-out'
)

# Style of the value labels drawn on the bars.
fig.update_traces(
    textfont_size=10,
    textfont_color='black',
)

# --- 8. Show ---
fig.show()


In [7]:
# Export the interactive figure to ../figures/ as a complete HTML page
# (full_html=True), using the 'cdn' option for plotly.js.
fig.write_html("../figures/average_imdb_rating_by_genre.html", include_plotlyjs='cdn', full_html=True)