In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
import scipy.stats as stats

In [2]:
final_df = pd.read_csv('../data/final_v2.csv')
final_df

Unnamed: 0,Name,Countries,Genres,Budget(USD)_Inflated,Domestic(USD)_Inflated,Domestic_Percentage,Foreign(USD)_Inflated,Foreign_Percentage,Worldwide(USD)_Inflated,Runtime(mins),...,animation,teen,film adaptation,musical,history,coming of age,sports,black comedy,war film,Worldwide_profit
0,10 cloverfield lane,united states of america,"drama, horror, sci-fi, thriller",6.076746e+06,8.760602e+07,0.654010,4.634613e+07,0.345990,1.339521e+08,103,...,0,0,0,0,0,0,0,0,0,22.043400
1,"10,000 bc",united states of america,"thriller, adventure, costume drama, action, ad...",1.441563e+08,1.301309e+08,0.351333,2.402606e+08,0.648667,3.703914e+08,109,...,0,0,0,0,0,0,0,0,0,2.569373
2,12 rounds,united states of america,"action thrillers, thriller, action, crime fiction",2.726275e+07,1.667757e+07,0.708013,6.877890e+06,0.291987,2.355546e+07,108,...,0,0,0,0,0,0,0,0,0,0.864016
3,12 strong,united states of america,"action, drama, history, war",3.979050e+07,5.209113e+07,0.644274,2.876133e+07,0.355726,8.085246e+07,130,...,0,0,0,0,1,0,0,0,0,2.031954
4,12 years a slave,united states of america,"biography, drama, history",2.539260e+07,7.195247e+07,0.301875,1.663993e+08,0.698125,2.383517e+08,134,...,0,0,0,0,1,0,0,0,0,9.386660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2684,zoolander,united states of america,"parody, comedy of errors, comedy",4.851315e+07,7.826600e+07,0.743197,2.704388e+07,0.256803,1.053099e+08,90,...,0,0,0,0,0,0,0,0,0,2.170749
2685,zoolander 2,united states of america,"action, adventure, comedy, mystery, romance",6.076746e+07,3.506124e+07,0.508592,3.387664e+07,0.491408,6.893788e+07,101,...,0,0,0,0,0,0,0,0,0,1.134454
2686,zoom,united states of america,"science fiction, action, fantasy, adventure",5.014200e+07,1.717625e+07,0.958658,7.407176e+05,0.041342,1.791697e+07,93,...,0,0,0,0,0,0,0,0,0,0.357325
2687,zootopia,united states of america,"action, adventure, animation, comedy, crime, f...",1.823024e+08,4.147601e+08,0.333340,8.294952e+08,0.666660,1.244255e+09,108,...,1,0,0,0,0,0,0,0,0,6.825228


In [3]:
ratings = final_df["Rating"]
rating_types = np.unique(ratings)
rating_counts = final_df.groupby(["Rating"]).size()
print( rating_counts)

print(rating_types)

Rating
g          29
nc-17       2
pg        375
pg-13    1115
r        1168
dtype: int64
['g' 'nc-17' 'pg' 'pg-13' 'r']


In [4]:
# droping ng-17 since there's only 5
final_df = final_df[final_df["Rating"] != "nc-17"]

### Evolution of ratings throughout the years

In [5]:
# Group by Year and Rating and count the number of movies
rating_counts_by_year = final_df.groupby(['Year', 'Rating']).size().reset_index(name='Count')

# Create a line plot
fig = px.line(
    rating_counts_by_year, 
    x='Year', 
    y='Count', 
    color='Rating', 
    markers=True,
    title="Count of Movies per Content rating by Year",
    labels={'Count': 'Number of Movies', 'Year': 'Year'}
)

# Show the plot
fig.show()

In [6]:
fig = px.box(
    final_df, 
    x="Rating", 
    y="Domestic_Percentage", 
    color="Rating",
    title="Distribution of Domestic Percentage by Rating"
)

fig.show()

fig = px.box(
    final_df, 
    x="Rating", 
    y="Foreign_Percentage", 
    color="Rating",
    title="Distribution of Foreign Percentage by Rating"
)
fig.show()


In [7]:
import plotly.express as px

# Calculate the average domestic and foreign percentage per rating
avg_perc_by_rating = final_df.groupby("Rating").agg(
    avg_domestic=("Domestic_Percentage", "mean"),
    avg_foreign=("Foreign_Percentage", "mean")
).reset_index()

# Create a grouped bar chart
fig = px.bar(
    avg_perc_by_rating,
    x="Rating",
    y=["avg_domestic", "avg_foreign"],
    barmode="group",
    title="Average Domestic vs Foreign Percentage by Rating",
    labels={"value": "Percentage", "Rating": "Rating", "variable": "Type"}
)

fig.show()


In [25]:
import plotly.express as px

# Box plot grouped by Rating and Foreign_higher
fig = px.box(
    final_df,
    x="Rating",  # Group by Rating on the x-axis
    y="Worldwide(USD)_Inflated",  # Box plot for revenue
    color="Foreign_higher",  # Differentiate by Foreign_higher
    title="Distribution of Worldwide Revenue by Rating",
    labels={"Foreign_higher": "Revenue Type", "Worldwide(USD)_Inflated": "Total Revenue (USD)"},
    color_discrete_map={0: "blue", 1: "red"},  # Customize colors
    category_orders={"Foreign_higher": [0, 1]},  # Ensure 0 comes before 1
)

# Update the legend labels dynamically
fig.for_each_trace(
    lambda t: t.update(name="Domestic" if t.name == "0" else "Foreign")
)

fig.update_layout(
    xaxis_title="Rating",
    yaxis_title="Total Revenue (USD)",
    legend_title="Revenue Type",
    boxmode="group",  # Ensures box plots are grouped side-by-side
)

fig.show()


In [67]:
filtered_df = final_df[final_df["Worldwide_profit"] <= 10]

# Box plot grouped by Rating and Foreign_higher
fig = px.box(
    filtered_df,
    x="Rating",  # Group by Rating on the x-axis
    y="Worldwide_profit",  # Box plot for revenue
    color="Foreign_higher",  # Differentiate by Foreign_higher
    title="Distribution of Profitability by Rating",
    labels={"Foreign_higher": "Revenue Type", "Worldwide_profit": "Profitability"},
    color_discrete_map={0: "blue", 1: "red"},  # Customize colors
    category_orders={"Foreign_higher": [0, 1]},  # Ensure 0 comes before 1
)

# Update the legend labels dynamically
fig.for_each_trace(
    lambda t: t.update(name="Domestic" if t.name == "0" else "Foreign")
)

fig.update_layout(
    title={
        "text": "Distribution of Profitability by Rating",
        "x": 0.5,  # Center the title
        "xanchor": "center",  # Anchor the title to the center
    },
    xaxis_title="Rating",
    yaxis_title="Profitability",
    legend_title="Revenue Type",
    boxmode="group",  # Ensures box plots are grouped side-by-side
)

# fig.write_html("../_includes/profit_rating.html")
fig.show()


# Analysis

In [56]:
import plotly.graph_objects as go

# Exclude the "g" rating from the analysis
filtered_df = final_df[final_df["Rating"] != "g"]

# Separate Domestic and Foreign movies
domestic_movies = filtered_df[filtered_df["Foreign_higher"] == 0]
international_movies = filtered_df[filtered_df["Foreign_higher"] == 1]

# Group by Rating and calculate the median Worldwide_profit for both groups
domestic_median_profitability = domestic_movies.groupby("Rating")["Worldwide_profit"].median().reset_index()
international_median_profitability = international_movies.groupby("Rating")["Worldwide_profit"].median().reset_index()

# Add Revenue_Type column
domestic_median_profitability["Revenue_Type"] = "Domestic"
international_median_profitability["Revenue_Type"] = "Foreign"

# Merge the two dataframes
combined_profitability = pd.concat([domestic_median_profitability, international_median_profitability])

# Perform Kruskal-Wallis test for Domestic movies and Foreign movies
kruskal_results = {}

# For Domestic movies
domestic_ratings = domestic_movies["Rating"].unique()
domestic_p_values = {}
for rating in domestic_ratings:
    if rating != "g":  # Exclude "g" rating
        # Perform Kruskal-Wallis test for Domestic movies
        rating_data = [domestic_movies[domestic_movies["Rating"] == r]["Worldwide_profit"] for r in domestic_ratings if r != "g"]
        stat, p_value = stats.kruskal(*rating_data)
        domestic_p_values[rating] = p_value

# For Foreign movies
foreign_ratings = international_movies["Rating"].unique()
foreign_p_values = {}
for rating in foreign_ratings:
    if rating != "g":  # Exclude "g" rating
        # Perform Kruskal-Wallis test for Foreign movies
        rating_data = [international_movies[international_movies["Rating"] == r]["Worldwide_profit"] for r in foreign_ratings if r != "g"]
        stat, p_value = stats.kruskal(*rating_data)
        foreign_p_values[rating] = p_value

# Combine the two dataframes for plotting
combined_profitability = pd.concat([domestic_median_profitability, international_median_profitability])

# Create the grouped bar chart
fig_combined = px.bar(
    combined_profitability,
    x="Rating", 
    y="Worldwide_profit", 
    color="Revenue_Type",  # Differentiate by Revenue_Type
    title="Median Profitability by Rating (Domestic vs. Foreign Movies)",
    labels={"Worldwide_profit": "Median Profitability", "Rating": "Rating"},
    color_discrete_map={"Domestic": "blue", "Foreign": "red"},  # Blue for Domestic, Red for Foreign
    barmode="group",  # Group the bars
)

# Create the p-value table
table_data = []
for rating in combined_profitability["Rating"].unique():
    if rating != "g":
        p_value_domestic = domestic_p_values.get(rating, "N/A")
        p_value_foreign = foreign_p_values.get(rating, "N/A")
        table_data.append([rating, p_value_domestic, p_value_foreign])

# Create the table figure
table_fig = go.Figure(data=[go.Table(
    header=dict(values=["Rating", "Domestic p-value", "Foreign p-value"]),
    cells=dict(values=[list(zip(*table_data))[0], list(zip(*table_data))[1], list(zip(*table_data))[2]])
)])

# Update layout for the table figure
table_fig.update_layout(
    title="Kruskal-Wallis p-values by Rating (Domestic vs. Foreign)",
    title_x=0.5,  # Center the title of the table
)

# Update the bar chart layout
fig_combined.update_layout(
    title={
        "text": "Median Profitability by Rating (Domestic vs. Foreign Movies)",
        "x": 0.5,  # Center the title
        "xanchor": "center",  # Anchor the title to the center
    },
    xaxis_title="Rating",
    yaxis_title="Median Profitability",
)

# Show the plot
fig_combined.show()

# Show the p-value table
table_fig.show()


In [24]:
# Grouped bar chart
fig = px.histogram(
    final_df,
    x="Rating",
    color="Foreign_higher",  # Separate by Foreign_higher
    barmode="group",  # Group the bars for comparison
    title="Count of Movies by Rating (Separated by Foreign Percentage)",
    labels={"Foreign_higher": "Revenue Type"},  # Label for the legend
    color_discrete_map={0: "blue", 1: "red"},  # Optional: Customize colors
    category_orders={"Foreign_higher": [0, 1]},  # Ensure 0 comes before 1
)

# Update the legend labels dynamically
fig.for_each_trace(
    lambda t: t.update(name="Domestic" if t.name == "0" else "Foreign")
)

fig.update_layout(
    xaxis_title="Rating",
    yaxis_title="Count of Movies",
    legend_title="Foreign Percentage Higher",
    bargap=0.2,  # Optional: Adjust spacing between bars
)

fig.show()


In [92]:
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd

final_df["log_profit"] = np.log(final_df["Worldwide_profit"])

# Box plot grouped by Rating and Foreign_higher
fig_box = px.box(
    final_df,
    x="Rating",  # Group by Rating on the x-axis
    y="log_profit",  # Box plot for revenue
    color="Foreign_higher",  # Differentiate by Foreign_higher
    title="Distribution of Log Profitability by Rating",
    labels={"Foreign_higher": "Revenue Type", "log_profit": "Log of Profitability"},
    color_discrete_map={0: "blue", 1: "red"},  # Customize colors
    category_orders={"Foreign_higher": [0, 1]},  # Ensure 0 comes before 1
)

# Update the legend labels dynamically
fig_box.for_each_trace(
    lambda t: t.update(name="")
)

fig_box.update_layout(
    title={
        "text": "Distribution of Log Profitability by Rating",
        "x": 0.5,  # Center the title
        "xanchor": "center",  # Anchor the title to the center
    },
    xaxis_title="Rating",
    yaxis_title="Log of Profitability",
    legend_title="Revenue Type",
    boxmode="group",  # Ensures box plots are grouped side-by-side
)

# Grouped bar chart for counts of movies by Rating and Foreign_higher
fig_bar = px.histogram(
    final_df,
    x="Rating",
    color="Foreign_higher",  # Separate by Foreign_higher
    barmode="group",  # Group the bars for comparison
    title="Count of Movies by Rating (Separated by Foreign Percentage)",
    color_discrete_map={0: "blue", 1: "red"},  # Optional: Customize colors
)

# Update the legend labels dynamically
fig_bar.for_each_trace(
    lambda t: t.update(name="Movies with domestic % > 50%" if t.name == "0" else "Movies with foreign % > 50%")
)

# Update the layout of the bar chart
fig_bar.update_layout(
    xaxis_title="Rating",
    yaxis_title="Count of Movies",
    showlegend=False,  # Hide legend for the bar chart
    yaxis=dict(
        autorange="reversed",  # Flip the y-axis to make the bars go downward
    ),
)

# Create subplots: 2 rows, 1 column (vertical layout)
fig_combined = make_subplots(
    rows=2, cols=1,
    row_heights=[0.8, 0.2],  # More space for the box plot
    shared_xaxes=True,
    vertical_spacing=0.05,
    subplot_titles=["Ratings", ""],  # Empty title for second subplot
)

# Add all traces from the box plot to the first row
for trace in fig_box['data']:
    fig_combined.add_trace(trace, row=1, col=1)

# Add bar chart to the second row
for trace in fig_bar['data']:
    fig_combined.add_trace(trace, row=2, col=1)

# Update layout to center the title and maintain consistency in axis titles
fig_combined.update_layout(
    title={
        "text": "Distribution of Log Profitability and Movie Counts by Rating",
        "x": 0.5,
        "xanchor": "center"
    },
    boxmode="group",  # Group the box plots side-by-side
    yaxis_title="Log Profitability",
    showlegend=True,
    yaxis2=dict(
        title="Count",
        autorange="reversed",  # Ensure count axis is reversed for downward bars
    ),
    margin=dict(t=50, b=80),  # Adjust top and bottom margins for proper spacing
    annotations=[
        dict(
            x=0.5,  # Position at the center
            y=-0.15,  # Position below the x-axis (outside the plot area)
            xref="paper",  # Reference paper (not data)
            yref="paper",
            showarrow=False,
            font=dict(size=14),
            align="center"
        )
    ]
)

# Show the combined figure
fig_combined.write_html("../_includes/profit_rating_count.html")
fig_combined.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [94]:
# Assuming final_df is your dataframe and 'log_profit' column is already created

# 1. Shapiro-Wilk Test for Normality on log-transformed profitability
shapiro_results = stats.shapiro(final_df["log_profit"])
print(f"Shapiro-Wilk Test p-value: {shapiro_results.pvalue}")

# If the p-value > 0.05, the data is likely normal, and we can proceed with ANOVA.

# Filter data by Foreign_higher (0 and 1)
df_domestic = final_df[final_df['Foreign_higher'] == 0]
df_foreign = final_df[final_df['Foreign_higher'] == 1]

# Perform the Kruskal-Wallis H test for each group if the data is not normal
if shapiro_results.pvalue <= 0.05:
    print("Data is not normal, using Kruskal-Wallis H test.")

    # 2. Kruskal-Wallis H Test for movies with Foreign_higher == 0 (Domestic movies)
    ratings_groups_domestic = [df_domestic[df_domestic['Rating'] == rating]["log_profit"] for rating in df_domestic['Rating'].unique() if rating != "G"]
    h_stat_domestic, p_value_kw_domestic = stats.kruskal(*ratings_groups_domestic)
    print(f"Kruskal-Wallis H Test (Domestic Movies) p-value: {p_value_kw_domestic}")

    # 3. Kruskal-Wallis H Test for movies with Foreign_higher == 1 (Foreign movies)
    ratings_groups_foreign = [df_foreign[df_foreign['Rating'] == rating]["log_profit"] for rating in df_foreign['Rating'].unique() if rating != "G"]
    h_stat_foreign, p_value_kw_foreign = stats.kruskal(*ratings_groups_foreign)
    print(f"Kruskal-Wallis H Test (Foreign Movies) p-value: {p_value_kw_foreign}")

else:
    print("Data is normal, you can use ANOVA.")

    # ANOVA test for comparing means if data is normal
    # Split the data based on ratings (exclude 'G' as per your earlier explanation)
    ratings_groups_domestic = [df_domestic[df_domestic['Rating'] == rating]["log_profit"] for rating in df_domestic['Rating'].unique() if rating != "G"]
    ratings_groups_foreign = [df_foreign[df_foreign['Rating'] == rating]["log_profit"] for rating in df_foreign['Rating'].unique() if rating != "G"]
    
    # Perform ANOVA for domestic movies
    f_stat_domestic, p_value_anova_domestic = stats.f_oneway(*ratings_groups_domestic)
    print(f"ANOVA p-value (Domestic Movies): {p_value_anova_domestic}")
    
    # Perform ANOVA for foreign movies
    f_stat_foreign, p_value_anova_foreign = stats.f_oneway(*ratings_groups_foreign)
    print(f"ANOVA p-value (Foreign Movies): {p_value_anova_foreign}")


Shapiro-Wilk Test p-value: 8.143180420964858e-26
Data is not normal, using Kruskal-Wallis H test.
Kruskal-Wallis H Test (Domestic Movies) p-value: 0.6825967230186682
Kruskal-Wallis H Test (Foreign Movies) p-value: 9.624868476479149e-09
