In [None]:
import warnings

import pandas as pd

warnings.filterwarnings('ignore')

# Load the dataset (requires pandas to be available as `pd` when this cell runs)
data_path = "../../data/movie_directors_actors.csv"
df = pd.read_csv(data_path)

# Get the total number of rows
total_rows = len(df)
print(f"Total number of rows in the dataset: {total_rows}")

# Count rows whose revenue is missing (NaN after CSV parsing)
empty_revenue_count = df["revenue"].isna().sum()
print(f"Number of rows with empty revenue: {empty_revenue_count}")

# Count rows whose average_rating is missing (NaN after CSV parsing)
empty_average_rating_count = df["average_rating"].isna().sum()
print(f"Number of rows with empty average_rating: {empty_average_rating_count}")

# Remove rows where either 'revenue' or 'average_rating' is NaN
df_cleaned = df.dropna(subset=["revenue", "average_rating"])

# Number of rows after cleaning
rows_after_cleaning = len(df_cleaned)
print(f"Total number of rows after cleaning: {rows_after_cleaning}")

# Number of rows removed; a row missing both columns is counted once here,
# so this can be smaller than the sum of the two per-column counts above.
rows_removed = total_rows - rows_after_cleaning
print(f"Number of rows removed: {rows_removed}")

In [None]:
import pandas as pd
import json

# Step 1: Load the original dataset
data_path = "../../data/movie_directors_actors.csv"
df = pd.read_csv(data_path)

# Step 2: Clean the data by removing rows with empty or NaN values in 'revenue' and 'average_rating'.
# The explicit .copy() makes df_cleaned an independent frame, so adding columns to it is
# unambiguous and does not rely on SettingWithCopyWarning being silenced.
df_cleaned = df.dropna(subset=["revenue", "average_rating"]).copy()

# Step 3: Parse 'genres_x' into lists.
# "\N" is treated as "no genres"; a missing value becomes "" and therefore [""] here.
df_cleaned["genres_list"] = (
    df_cleaned["genres_x"].fillna("").apply(lambda x: x.split(",") if x != "\\N" else [])
)

# Step 4: Function to parse 'genres_y' strings
def parse_genres_y(s):
    """Parse a serialized 'genres_y' cell into a list of genre names.

    The cell is expected to hold a JSON object whose values are the genre
    names. Doubled quotes ('""') are collapsed and backslashes are removed
    before decoding.

    Args:
        s: Raw cell value. Anything that is not a string (None, NaN, numbers)
            and the "\\N" placeholder are treated as "no genres".

    Returns:
        list: The values of the decoded JSON object, in document order, or an
        empty list when the cell is missing, is not valid JSON, or decodes to
        something other than an object.
    """
    # Missing values arrive as None/NaN (non-strings); "\N" is the explicit null marker.
    if not isinstance(s, str) or s == "\\N":
        return []

    cleaned = s.replace('""', '"').replace("\\", "")
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        return []

    # Valid JSON that is not an object (e.g. "42" or '["Drama"]') has no .values();
    # treat it like any other unparseable cell instead of raising AttributeError.
    if not isinstance(parsed, dict):
        return []
    return list(parsed.values())

# Apply the function to 'genres_y'
df_cleaned["genres_y_list"] = df_cleaned["genres_y"].apply(parse_genres_y)

# Step 5: Combine the two genre lists and remove duplicates.
# dict.fromkeys de-duplicates while keeping first-seen order, so the result does not depend
# on set iteration order (which changes between interpreter sessions for strings).
df_cleaned["all_genres"] = df_cleaned.apply(
    lambda row: list(dict.fromkeys(row["genres_list"] + row["genres_y_list"])), axis=1
)

# Step 6: Explode the genres (one row per original row and genre)
df_exploded = df_cleaned.explode("all_genres")

# Step 6.5: Remove duplicate rows for the same director, movie and genre.
# The genre has to be part of the key: de-duplicating on (director, movie) alone after the
# explode would keep a single, arbitrary genre per movie and silently drop the others.
df_exploded = df_exploded.drop_duplicates(
    subset=["director_name", "movie_id", "all_genres"]
)

# Step 7: Remove rows with empty genres
df_exploded = df_exploded[
    df_exploded["all_genres"].notna() & (df_exploded["all_genres"] != "")
]

# Step 8: Group by director and genre
grouped = df_exploded.groupby(["director_name", "all_genres"])

# Compute the metrics
result = grouped.agg(
    num_movies=("movie_id", "nunique"),  # Total number of unique movies
    avg_revenue=("revenue", "mean"),    # Mean revenue
    avg_rating=("average_rating", "mean"),  # Mean rating
).reset_index()

# Step 9: Calculate the number of genres per director.
# A movie with several genres appears in several (director, genre) groups, so the number of
# movies per director is counted on distinct movie ids rather than by summing num_movies.
movies_per_director = df_exploded.groupby("director_name")["movie_id"].nunique()

director_genre_counts = (
    result.groupby("director_name")
    .agg(
        num_genres=("all_genres", "nunique"),        # Total number of unique genres
        overall_avg_revenue=("avg_revenue", "mean"),  # Mean of the per-genre mean revenues
        overall_avg_rating=("avg_rating", "mean"),   # Mean of the per-genre mean ratings
    )
    .reset_index()
)
# Insert at position 2 so the column order stays
# director_name, num_genres, total_movies, overall_avg_revenue, overall_avg_rating.
director_genre_counts.insert(
    2,
    "total_movies",  # Total number of distinct movies
    director_genre_counts["director_name"].map(movies_per_director),
)

# Step 10: Stratified sampling by number of genres.
# Roughly 200 directors overall, split evenly across the distinct num_genres values;
# a stratum with fewer directors than its quota is taken whole.
num_strata = len(director_genre_counts["num_genres"].unique())
per_stratum_quota = 200 // num_strata if num_strata else 0

# Sample each stratum explicitly instead of going through groupby.apply: apply() operating on
# the grouping column is deprecated in recent pandas, and once that column is excluded
# 'num_genres' would be missing from the saved dataset.
stratum_samples = [
    stratum.sample(n=min(len(stratum), per_stratum_quota), random_state=42)
    for _, stratum in director_genre_counts.groupby("num_genres")
]

if stratum_samples:
    sampled_directors = pd.concat(stratum_samples, ignore_index=True)
else:
    # No directors at all: keep an empty frame with the expected columns.
    sampled_directors = director_genre_counts.iloc[0:0].copy()

# Step 11: Save the new dataset to a CSV file
output_path = "../../data/new_director_dataset.csv"
sampled_directors.to_csv(output_path, index=False)
print(f"New dataset saved to {output_path}")

In [None]:
# Step 1: Load the sampled per-director dataset (not the original movie-level data)
data_path = "../../data/new_director_dataset.csv"
df = pd.read_csv(data_path)

# Prepare data for Sankey plot: one row per num_genres value
genre_groups = (
    df.groupby("num_genres")
    .agg(
        avg_rating=("overall_avg_rating", "mean"),
        avg_revenue=("overall_avg_revenue", "mean"),
        # Same statistic as avg_revenue, kept as its own column to drive link thickness
        line_thickness=("overall_avg_revenue", "mean"),
    )
    .reset_index()
)

# Save the dataset
parallel_plot_data_path = "../../data/director_sankey_data.csv"
genre_groups.to_csv(parallel_plot_data_path, index=False)
# The file feeds the Sankey diagram, so the log message names it accordingly.
print(f"Saved Sankey dataset to {parallel_plot_data_path}")

In [102]:
from plot_settings import COMMON_LAYOUT

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Data
data_path = "../../data/director_sankey_data.csv"
data = pd.read_csv(data_path)

# Define rating groups: rating_groups[i] labels the bin between rating_bins[i] and rating_bins[i + 1]
rating_groups = ["5.5-6", "6-6.5", "6.5-7", "7-7.5"]
rating_bins = [5.5, 6.0, 6.5, 7.0, 7.5]

# Map avg_rating to bins (bins are right-inclusive; include_lowest also admits exactly 5.5)
data["rating_group"] = pd.cut(data["avg_rating"], bins=rating_bins, labels=rating_groups, include_lowest=True)

# Handle NaN in rating_group: ratings outside 5.5-7.5 fall into no bin and are dropped.
# .copy() makes the filtered frame independent, so the column added below is an
# unambiguous assignment rather than a write to a possible slice of the loaded frame.
data = data.dropna(subset=["rating_group"]).copy()

# Normalize line thickness for visualization (largest value maps to 10)
data["line_thickness_norm"] = data["line_thickness"] / data["line_thickness"].max() * 10

# Sankey nodes: distinct genre counts on the left, rating groups on the right
left_nodes = sorted(data["num_genres"].unique())
right_nodes = rating_groups

n_left = len(left_nodes)
n_right = len(right_nodes)

# Per-node attributes; left-hand nodes always come first, then the right-hand ones
node_labels = [str(num) for num in left_nodes]
node_labels.extend(right_nodes)

node_hover = [f"Number of Genres: {num}" for num in left_nodes]
node_hover.extend(f"Rating Group: {group}" for group in right_nodes)

nodes = {
    "labels": node_labels,
    "x": [0.1] * n_left + [0.9] * n_right,
    # Each side is spread over (0, 1], stepping down from 1 in equal increments
    "y": [1 - (i / n_left) for i in range(n_left)]
    + [1 - (i / n_right) for i in range(n_right)],
    "color": ["blue"] * n_left + ["green"] * n_right,
    "customdata": node_hover,
}

# Sankey links: source is the position of the genre count among the left nodes,
# target is the rating group's position offset past all left nodes
sources = list(map(left_nodes.index, data["num_genres"]))
targets = []
for group in data["rating_group"]:
    targets.append(n_left + rating_groups.index(group))
values = data["line_thickness_norm"]

# Hover text for each link, one entry per data row
link_customdata = []
for _, row in data.iterrows():
    link_customdata.append(
        f"Num Genres: {row['num_genres']}<br>Avg Rating: {row['avg_rating']:.1f}<br>Avg Revenue: ${row['avg_revenue']:,}"
    )

# Node and link specifications for the Sankey trace
sankey_node = {
    "label": nodes["labels"],
    "x": nodes["x"],
    "y": nodes["y"],
    "pad": 10,
    "align": "center",
    "color": nodes["color"],
    "customdata": nodes["customdata"],
    "hovertemplate": "Node: %{customdata}<extra></extra>",
}
sankey_link = {
    "source": sources,
    "target": targets,
    "value": values,
    "customdata": link_customdata,
    "hovertemplate": "Link:<br>%{customdata}<extra></extra>",
}

# Create Sankey figure
fig = go.Figure(go.Sankey(arrangement="snap", node=sankey_node, link=sankey_link))

# Paper-anchored annotations as (x, y, text, font size, font color):
# a footnote below the plot and one heading above each column of nodes
annotation_specs = [
    (0.5, -0.14, "Note: The thicker the link, the higher the average revenue.", 12, "black"),
    (0.05, 1.05, "Number of Directors' Genres", 14, "blue"),
    (0.95, 1.05, "Rating Groups", 14, "green"),
]
layout_annotations = [
    {
        "x": x_pos,
        "y": y_pos,
        "xref": "paper",
        "yref": "paper",
        "text": text,
        "showarrow": False,
        "font": {"size": size, "color": color},
    }
    for x_pos, y_pos, text, size, color in annotation_specs
]

# Update layout
fig.update_layout(
    title_text="Sankey Diagram: Number of Directors' Genres to Ratings Groups with Revenue Thickness",
    font_size=12,
    title_x=0.5,
    annotations=layout_annotations,
)

# Show plot
fig.show()

# Save plot as a standalone HTML file at the specified location
output_path = "../../docs/_includes/plotly/rq3_genres_to_ratings.html"
fig.write_html(output_path)
print(f"Plot saved to {output_path}")
