In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler

# Read the actor-level table that every later step is derived from.
actors_df = pd.read_csv("../../data/movie_actors.csv")

# Report how many rows are exact copies of an earlier row, then discard them.
num_duplicates = actors_df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")
actors_df = actors_df.drop_duplicates()

# Build a lookup from `freebase_id` to `itemLabel` for ethnicities.
ethnicities_df = pd.read_csv("../../data/wikidata_ethnicities.csv")
ethnicity_map = dict(
    zip(ethnicities_df["freebase_id"], ethnicities_df["itemLabel"])
)

# Attach a readable ethnicity label; anything the lookup leaves empty is
# labelled "Unknown".
ethnicity_labels = actors_df["actor_ethnicity_freebase_id"].map(ethnicity_map)
actors_df["ethnicity_label"] = ethnicity_labels.fillna("Unknown")

# Restrict the data to rows whose gender is exactly 'M' or 'F'. This is a
# filter, not a conversion: rows with any other gender value are dropped.
is_m_or_f = actors_df["actor_gender"].isin(["M", "F"])
actors_df = actors_df[is_m_or_f]


# Define helper functions for computing diversity
def shannon_entropy(counts):
    """Return the Shannon entropy, in bits, of a collection of category counts.

    Each count is converted to its share of the overall total and
    ``-p * log2(p)`` is accumulated over the strictly positive shares.

    Args:
        counts: A re-iterable collection of numeric counts (it is traversed
            twice: once for the total, once for the shares).

    Returns:
        The entropy in bits, or ``0`` when the counts sum to zero (which
        includes an empty collection).
    """
    total = sum(counts)
    if total == 0:
        return 0

    bits = 0
    for share in (count / total for count in counts):
        if not share > 0:
            # Only strictly positive shares contribute to the entropy.
            continue
        bits -= share * math.log2(share)
    return bits


# Group the actor rows by movie. `wikipedia_movie_id` is used as the unique
# movie key.
grouped = actors_df.groupby("wikipedia_movie_id")

# Collect one record per movie, then build the DataFrame in a single step.
movies_data = []
for movie_id, group in grouped:
    # Age / height diversity: sample standard deviation (ddof=1) over the
    # movie's rows. A single-row group has no spread, so it is scored 0.0.
    age_std = group["actor_age_at_movie_release"].std(ddof=1) if len(group) > 1 else 0.0
    height_std = group["actor_height_in_meters"].std(ddof=1) if len(group) > 1 else 0.0

    # Gender / ethnicity diversity: Shannon entropy of the category counts.
    gender_counts = group["actor_gender"].value_counts()
    gender_entropy = shannon_entropy(gender_counts.values)
    ethnicity_counts = group["ethnicity_label"].value_counts()
    ethnicity_entropy = shannon_entropy(ethnicity_counts.values)

    # Movie-level fields: average_rating and revenue are expected to be the
    # same on every row of a movie, so the first row's value is used.
    avg_rating = group["average_rating"].iloc[0]
    revenue = group["revenue"].iloc[0]

    movies_data.append(
        {
            "wikipedia_movie_id": movie_id,
            # std() can come back NaN (e.g. too few non-missing values);
            # such movies are scored as having zero diversity.
            "age_diversity": 0.0 if np.isnan(age_std) else age_std,
            "gender_diversity": gender_entropy,
            "height_diversity": 0.0 if np.isnan(height_std) else height_std,
            "ethnicity_diversity": ethnicity_entropy,
            "average_rating": avg_rating,
            "revenue": revenue,
        }
    )

movies_df = pd.DataFrame(movies_data)

# Count the rows with missing values BEFORE dropping them. Counting after
# dropna() would always report 0 and hide how many movies are discarded.
row_with_empty = movies_df.isnull().any(axis=1).sum()
print(f"Number of rows with missing values: {row_with_empty}")

# Remove rows with any missing values
movies_df = movies_df.dropna()


# Define a function to remove outliers
def remove_outliers(df, columns, lower_percentile=1, upper_percentile=99):
    """Trim rows whose values fall outside a percentile band, column by column.

    The columns are processed in order. For each one the lower and upper
    percentile limits are computed from the rows that survived the previous
    columns (ignoring missing values), and only rows within those limits
    (inclusive) are kept. The input frame itself is not modified.

    Args:
        df: DataFrame to filter.
        columns: Names of the columns to trim on.
        lower_percentile: Lower percentile bound, 0-100.
        upper_percentile: Upper percentile bound, 0-100.

    Returns:
        The filtered DataFrame.
    """
    trimmed = df
    for name in columns:
        observed = trimmed[name].dropna()
        low = np.percentile(observed, lower_percentile)
        high = np.percentile(observed, upper_percentile)
        within_band = trimmed[name].between(low, high)
        trimmed = trimmed[within_band]
    return trimmed


# Columns to check for outliers
columns_to_check = [
    "age_diversity",
    "gender_diversity",
    "height_diversity",
    "ethnicity_diversity",
    "average_rating",
    "revenue",
]

# Remove outliers (default band: 1st to 99th percentile, applied per column)
movies_df = remove_outliers(movies_df, columns_to_check)

# Keep only movies whose revenue is below the cap.
# NOTE(review): this step was previously described as "under 100 million",
# but the threshold applied in code is 2,000,000 - confirm the intended value.
REVENUE_CAP = 2_000_000
# .copy() gives an independent frame so that the column assignment further
# down does not write into a slice of the previous DataFrame.
movies_df = movies_df[movies_df["revenue"] < REVENUE_CAP].copy()

# Sanity check: report any repeated `wikipedia_movie_id` values.
duplicate_movies = movies_df[
    movies_df.duplicated(subset=["wikipedia_movie_id"], keep=False)
]
print(f"Found {len(duplicate_movies)} duplicate movie entries")

# Select columns to normalize
columns_to_normalize = [
    "age_diversity",
    "gender_diversity",
    "height_diversity",
    "ethnicity_diversity",
]

# Apply Min-Max scaling to the diversity columns
scaler = MinMaxScaler()
movies_df[columns_to_normalize] = scaler.fit_transform(movies_df[columns_to_normalize])

# Keep only rows where every scaled diversity value is strictly above the
# floor. This is stricter than "non-zero": values in (0, 0.1] are removed too.
MIN_SCALED_DIVERSITY = 0.1
movies_df = movies_df[(movies_df[columns_to_normalize] > MIN_SCALED_DIVERSITY).all(axis=1)]
print(f"Number of rows after filtering: {len(movies_df)}")

# Persist the per-movie table as JSON records for the plotting cells.
movies_df.to_json("../../docs/assets/data/actor_diversity.json", orient="records")

In [1]:
from plot_settings import COMMON_LAYOUT

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Directory that receives the rendered Plotly HTML snippets.
OUTPUT_PATH = "../../docs/_includes/plotly/"

# Read the per-movie diversity table.
file_path = "../../docs/assets/data/actor_diversity.json"
movies_df = pd.read_json(file_path)

# 1. Correlation Heatmap
# Pairwise correlations between every column except the movie identifier.
metric_columns = movies_df.drop(columns=["wikipedia_movie_id"])
correlation_matrix = metric_columns.corr()

heatmap_fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    title="Correlation Between Diversity Metrics and Revenue/Average Rating",
    color_continuous_scale="Viridis",
)
heatmap_fig.update_layout(
    **COMMON_LAYOUT,
    height=600,
    width=800,
    template="plotly_white",
    title_x=0.5,
)

# Display inline, then export the same figure for the website.
heatmap_fig.show()
heatmap_html = f"{OUTPUT_PATH}rq2_correlation_heatmap.html"
heatmap_fig.write_html(heatmap_html, include_plotlyjs="cdn")
print("Saved Correlation Heatmap as rq2_correlation_heatmap.html")

Saved Correlation Heatmap as rq2_correlation_heatmap.html


In [3]:
import pandas as pd
import plotly.express as px

# Load dataset
movies_df = pd.read_json("../../docs/assets/data/actor_diversity.json")

# Diversity metrics to bucket, and the quartile labels in display order.
DIVERSITY_FACTORS = ["age_diversity", "gender_diversity", "height_diversity", "ethnicity_diversity"]
QUARTILE_LABELS = ["Low", "Medium", "High", "Very High"]

# Define diversity groups for each diversity factor (quartiles of its values)
for factor in DIVERSITY_FACTORS:
    movies_df[f"{factor}_group"] = pd.qcut(
        movies_df[factor], q=4, labels=QUARTILE_LABELS
    )


def _diversity_box_figure(df, outcome, title, outcome_label):
    """Build a box plot of `outcome` split by diversity quartile and factor.

    Args:
        df: Frame holding `outcome` and one `<factor>_group` column per
            entry in DIVERSITY_FACTORS.
        outcome: Name of the column plotted on the y axis.
        title: Figure title.
        outcome_label: Human-readable axis label for `outcome`.

    Returns:
        The configured Plotly figure.
    """
    # Long format: one row per (movie, diversity factor) with its quartile.
    long_df = df.melt(
        id_vars=[outcome],
        value_vars=[f"{factor}_group" for factor in DIVERSITY_FACTORS],
        var_name="Diversity Factor",
        value_name="Group",
    )
    fig = px.box(
        long_df,
        x="Group",
        y=outcome,
        color="Diversity Factor",
        title=title,
        labels={"Group": "Diversity Group", outcome: outcome_label},
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    # Both box plots share the same layout, including COMMON_LAYOUT, so the
    # revenue and rating figures are styled consistently.
    fig.update_layout(
        **COMMON_LAYOUT, height=600, width=900, template="plotly_white", title_x=0.5,
        xaxis=dict(categoryorder="array", categoryarray=QUARTILE_LABELS),
    )
    return fig


# Plot 1: Revenue grouped by Diversity Groups
revenue_box_fig = _diversity_box_figure(
    movies_df, "revenue", "Revenue by Diversity Groups", "Revenue"
)
revenue_box_fig.write_html("../../docs/_includes/plotly/rq2_revenue_boxplot.html", include_plotlyjs="cdn")
revenue_box_fig.show()

# Plot 2: Average Rating grouped by Diversity Groups
rating_box_fig = _diversity_box_figure(
    movies_df, "average_rating", "Average Rating by Diversity Groups", "Average Rating"
)
rating_box_fig.write_html("../../docs/_includes/plotly/rq2_rating_boxplot.html", include_plotlyjs="cdn")
rating_box_fig.show()

In [4]:
import pandas as pd
import plotly.graph_objects as go

# Load the dataset
movies_df = pd.read_json("../../docs/assets/data/actor_diversity.json")

# Split movies at the median into two revenue groups for the radar chart
movies_df["revenue_group"] = pd.qcut(
    movies_df["revenue"], q=2, labels=["Low Revenue", "High Revenue"]
)

# Split movies at the median into two average-rating groups for the radar chart
movies_df["average_rating_group"] = pd.qcut(
    movies_df["average_rating"], q=2, labels=["Low Rating", "High Rating"]
)

# Diversity metrics shown as the radar axes
numeric_columns = ["age_diversity", "gender_diversity", "height_diversity", "ethnicity_diversity"]


def _add_radar_trace(fig, row, group_column, **trace_kwargs):
    """Add one filled polar trace for a group's mean diversity values.

    Args:
        fig: Figure the trace is added to.
        row: One row of a grouped-means frame; must hold `numeric_columns`
            and `group_column`.
        group_column: Column whose value names the trace in the legend.
        **trace_kwargs: Extra keyword arguments passed to `go.Scatterpolar`
            (for example `line=dict(color=...)`).
    """
    fig.add_trace(
        go.Scatterpolar(
            r=row[numeric_columns].values,
            theta=numeric_columns,
            fill="toself",
            name=row[group_column],
            **trace_kwargs,
        )
    )


# Data for revenue-based radar chart. `observed=False` spells out the
# grouping behaviour for categorical keys instead of relying on the
# version-dependent default.
radar_data = (
    movies_df.groupby("revenue_group", observed=False)[numeric_columns].mean().reset_index()
)

# Generate the radar chart for revenue
radar_fig = go.Figure()
for _, row in radar_data.iterrows():
    _add_radar_trace(radar_fig, row, "revenue_group")

# Update layout for revenue radar chart
radar_fig.update_layout(
    **COMMON_LAYOUT,
    polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
    title="Diversity Radar Chart: High vs Low Revenue",
    height=600,
    width=800,
    template="plotly_white",
    title_x=0.5,
)

# Save the radar chart for revenue as an HTML file
radar_fig.write_html("../../docs/_includes/plotly/rq2_revenue_radar_chart.html", include_plotlyjs="cdn")
print("Saved Radar Chart as rq2_revenue_radar_chart.html")
radar_fig.show()

# Data for average rating-based radar chart
rating_data = (
    movies_df.groupby("average_rating_group", observed=False)[numeric_columns].mean().reset_index()
)

# Generate the radar chart for average rating. Traces are added in a fixed
# order (High first, then Low) with fixed colours so the legend and colours
# do not depend on the row order of `rating_data`.
rating_radar_fig = go.Figure()
for group_name, colour in (("High Rating", "red"), ("Low Rating", "blue")):
    matching_rows = rating_data[rating_data["average_rating_group"] == group_name]
    for _, row in matching_rows.iterrows():
        _add_radar_trace(
            rating_radar_fig, row, "average_rating_group", line=dict(color=colour)
        )

# Update layout for the rating radar chart; COMMON_LAYOUT is applied here as
# well so both radar charts share the same base styling.
rating_radar_fig.update_layout(
    **COMMON_LAYOUT,
    polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
    title="Diversity Radar Chart: High vs Low Average Rating",
    height=600,
    width=800,
    template="plotly_white",
    title_x=0.5,
)

# Show and save the radar chart. The path is spelled out, like the revenue
# chart above, so this cell does not depend on a constant from another cell.
rating_radar_fig.show()
rating_radar_fig.write_html("../../docs/_includes/plotly/rq2_rating_radar_chart.html", include_plotlyjs="cdn")

Saved Radar Chart as rq2_revenue_radar_chart.html










In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import statsmodels.api as sm

# Predictors (one simple regression is fitted per factor) ...
diversity_factors = [
    "age_diversity",
    "gender_diversity",
    "height_diversity",
    "ethnicity_diversity",
]
# ... and the outcomes each of them is regressed against.
dependent_vars = ["average_rating", "revenue"]

# Read the per-movie diversity table.
movies_df = pd.read_json("../../docs/assets/data/actor_diversity.json")

# ------------------------ #
# Helper Function for Linear Regression
# ------------------------ #
def perform_regression(x, y):
    """Fit an ordinary-least-squares regression of ``y`` on ``x``.

    A constant column is added to ``x`` via ``sm.add_constant`` so that the
    model includes an intercept term.

    Args:
        x: Predictor values.
        y: Response values.

    Returns:
        The fitted statsmodels results object.
    """
    design_matrix = sm.add_constant(x)
    return sm.OLS(y, design_matrix).fit()

# ------------------------ #
# Scatter Plot with Multiple Regression Lines
# ------------------------ #
# One figure per dependent variable; each figure overlays, for every
# diversity factor, the raw scatter and its fitted regression line.
for dep_var in dependent_vars:
    combined_fig = go.Figure()

    for factor in diversity_factors:
        x = movies_df[factor]
        y = movies_df[dep_var]

        # Perform regression
        model = perform_regression(x, y)

        # The in-sample predictions are already stored on the fitted model,
        # so the design matrix does not need to be rebuilt to obtain them.
        # Plotly joins line points in the order they are given; ordering by
        # x makes the line run once from left to right instead of
        # criss-crossing over itself in row order.
        order = np.argsort(x.to_numpy())
        x_line = x.to_numpy()[order]
        y_pred = np.asarray(model.fittedvalues)[order]

        # Add scatter points
        combined_fig.add_trace(go.Scatter(
            x=x, y=y, mode='markers',
            name=f"{factor}",
            marker=dict(size=5, opacity=0.7),
        ))

        # Add regression line
        combined_fig.add_trace(go.Scatter(
            x=x_line, y=y_pred, mode='lines',
            name=f"{factor} Regression",
            line=dict(width=2),
        ))

    combined_fig.update_layout(
        title=f"Scatter Plot with Regression Lines: Diversity Factors vs {dep_var}",
        xaxis_title="Diversity Factors",
        yaxis_title=dep_var,
        template="plotly_white",
        legend=dict(title="Factors")
    )

    # Show the figure, then save it as an embeddable HTML fragment
    # (full_html=False) for the website.
    combined_fig.show()
    combined_fig.write_html(
        f"../../docs/_includes/plotly/rq2_combined_{dep_var}.html",
        full_html=False,
        include_plotlyjs="cdn",
    )
    print(f"Saved plot for {dep_var} as ../../docs/_includes/plotly/rq2_combined_{dep_var}.html")
