In [2]:
import pandas as pd
from pyparsing.diagram import template
from wasabi import color

import src.utils.pipelines as pip
import plotly.express as px
import numpy as np


In [3]:
# tous les prénoms influencés
#prophet = pd.read_csv("data/clean/influenced_names_prophet.csv")
prophet = pd.read_csv("data/clean/influenced_prophet_with_genres.csv")
influenced_prophet = prophet[prophet["Influenced"] > 0]
# removing common identification mistakes such as "the", "a" or "Mr"
influenced_prophet = influenced_prophet[~influenced_prophet["Character Name"].isin(["the", "a", "Mr"])]
print("Number of influenced names with prophet: ", len(influenced_prophet))
print(influenced_prophet.shape)

influenced_prophet.drop(columns=["Count",'Mean Difference'], inplace=True)

Number of influenced names with prophet:  432
(432, 11)


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Full name,Normalized_name,Influenced,Wikipedia_movie_ID,Genres
0,451866,mission: impossible ii,2000,Ethan,Ethan Hunt,ETHAN,1,451866.0,"Thriller, Action Thrillers, Action/Adventure, ..."
1,633411,the avengers,1998,Emma,Emma Peel,EMMA,1,633411.0,"Thriller, Adventure, Action/Adventure, Comedy,..."
2,3727473,man on fire,1987,Samantha,"Samantha ""Sam"" Balletto",SAMANTHA,1,3727473.0,"Thriller, Action, Drama, Indie"
3,347000,suspiria,1977,Sarah,Sarah,SARAH,1,347000.0,"Horror, World cinema, Supernatural, Gothic Fil..."
4,320401,barton fink,1991,Taylor,Audrey Taylor,TAYLOR,1,320401.0,"Thriller, Indie, Comedy-drama, Mystery, Period..."


In [4]:
global_names, _, _, _, _ = pip.read_all_names_data()

UK : Loading clean data from data/clean/names/ukbabynames.csv
France : Loading clean data from data/clean/names/france.csv
US : Loading clean data from data/clean/names/babyNamesUSYOB-full.csv
Norway : Loading clean data from data/clean/names/norway_merged.csv
UK & France & US & Norway : Loading clean data from data/clean/names/UK_France_US_Norway.csv


In [5]:

df_character = global_names()
#Keep only Name and Sex columns of df_character

df_character.rename(columns={'Sex': 'Gender'}, inplace=True)

Unnamed: 0,Year,Name,Gender,Count
0,1960,AAGE,M,23
1,1960,AARON,M,1774
2,1960,AASE,F,41
3,1960,AASHILD,F,5
4,1960,AASMUND,M,8
...,...,...,...,...
1517204,2023,ZURI,F,3
1517205,2023,ZUZANNA,F,11
1517206,2023,ZYAD,M,23
1517207,2023,ZYAN,M,8


In [6]:
# Perform a merge based on 'Normalized_name' == 'Name' and 'Year'
merged_df = influenced_prophet.merge(
    df_character[['Name', 'Gender', 'Year', 'Count']],  # Select only the relevant columns
    how='left',  # Use a left join to keep all rows in influenced_prophet
    left_on=['Normalized_name', 'Year'],  # Match these columns from influenced_prophet
    right_on=['Name', 'Year']  # With these columns from df_character
)

# Drop the extra 'Name' column if needed (optional)
merged_df.drop(columns='Name', inplace=True)

merged_df

Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Full name,Normalized_name,Influenced,Wikipedia_movie_ID,Genres,Gender,Count
0,451866,mission: impossible ii,2000,Ethan,Ethan Hunt,ETHAN,1,451866.0,"Thriller, Action Thrillers, Action/Adventure, ...",M,18059.0
1,633411,the avengers,1998,Emma,Emma Peel,EMMA,1,633411.0,"Thriller, Adventure, Action/Adventure, Comedy,...",F,19870.0
2,3727473,man on fire,1987,Samantha,"Samantha ""Sam"" Balletto",SAMANTHA,1,3727473.0,"Thriller, Action, Drama, Indie",F,18864.0
3,347000,suspiria,1977,Sarah,Sarah,SARAH,1,347000.0,"Horror, World cinema, Supernatural, Gothic Fil...",F,19777.0
4,320401,barton fink,1991,Taylor,Audrey Taylor,TAYLOR,1,320401.0,"Thriller, Indie, Comedy-drama, Mystery, Period...",F,10255.0
...,...,...,...,...,...,...,...,...,...,...,...
427,155997,boyz n the hood,1991,Ferris,Ferris,FERRIS,1,155997.0,"Crime Fiction, Drama, Coming of age, Teen",M,14.0
428,4440277,highlander,1986,Ramirez,Lobos Ramirez,RAMIREZ,1,4440277.0,"Cult, Action/Adventure, Science Fiction, Fanta...",M,6.0
429,1917925,what ever happened to baby jane?,1962,Hudson,Baby Jane Hudson,HUDSON,1,1917925.0,"Thriller, Drama, Horror, Psychological thriller",M,6.0
430,31557,"the good, the bad and the ugly",1966,Blondie,Blondie,BLONDIE,1,31557.0,"World cinema, Epic Western, Spaghetti Western,...",,


In [7]:
# Data to use for filling missing values
data = {
    'Wikipedia ID': [323715, 920296, 11077335, 97758, 97646, 146947, 8695, 10645970, 23487440, 321496, 
                     97758, 68245, 133648, 1210303, 950929, 697113, 3917873, 31557],
    'Movie Name': ['troy', 'somewhere in time', 'doctor zhivago', 'doctor zhivago', 'die hard', 'spider-man', 
                   'dr. strangelove or: how i learned to stop worrying and love the bomb', 'rocky', 'alien', 
                   'pirates of the caribbean: the curse of the black pearl', 'doctor zhivago', 'bonnie and clyde', 
                   'scent of a woman', 'constantine', 'the haunting', 'big trouble in little china', 
                   'chitty chitty bang bang', 'the good, the bad and the ugly'],
    'Year': [2004, 1980, 1965, 1965, 1988, 2002, 1964, 1981, 1979, 2003, 1965, 1967, 1992, 2005, 1963, 
             1986, 1968, 1966],
    'Character Name': ['Briseis', 'McKenna', 'Yuri', 'Yuri', 'Kristoff', 'Daily', 'Alexei', 'Shankar', 'Ash', 
                       'Sparrow', 'Pasha', 'Moss', 'Ranger', 'Lucifer', 'Hill', 'Lo', 'Jemima', 'Blondie'],
    'Gender': ['F', 'F', 'M', 'M', 'M', np.nan, 'M', 'M', 'M', 'M', 'M', 
            'M', 'M', 'M', 'M', 'M', 'F', 'F']
}
df_reference = pd.DataFrame(data)

# Example merged_df (already existing)
# merged_df should have columns: Wikipedia ID, Movie Name, Year, Character Name, and Gender (with NaN values)
# merged_df = <your existing DataFrame>

# Merge to bring in Gender information from df_reference
merged_df = merged_df.merge(
    df_reference[['Wikipedia ID', 'Movie Name', 'Year', 'Character Name', 'Gender']],
    on=['Wikipedia ID', 'Movie Name', 'Year', 'Character Name'],
    how='left',
    suffixes=('', '_reference')
)

# Fill NaN values in 'Gender' using 'Gender_reference' from df_reference
merged_df['Gender'] = merged_df['Gender'].fillna(merged_df['Gender_reference'])

# Drop the temporary 'Gender_reference' column
merged_df.drop(columns=['Gender_reference'], inplace=True)

# Drop rows with 'Character Name' as 'Daily', 'Hill', or 'Lo'
merged_df = merged_df[~merged_df['Character Name'].isin(['Daily', 'Hill', 'Lo'])]

merged_df


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Full name,Normalized_name,Influenced,Wikipedia_movie_ID,Genres,Gender,Count
0,451866,mission: impossible ii,2000,Ethan,Ethan Hunt,ETHAN,1,451866.0,"Thriller, Action Thrillers, Action/Adventure, ...",M,18059.0
1,633411,the avengers,1998,Emma,Emma Peel,EMMA,1,633411.0,"Thriller, Adventure, Action/Adventure, Comedy,...",F,19870.0
2,3727473,man on fire,1987,Samantha,"Samantha ""Sam"" Balletto",SAMANTHA,1,3727473.0,"Thriller, Action, Drama, Indie",F,18864.0
3,347000,suspiria,1977,Sarah,Sarah,SARAH,1,347000.0,"Horror, World cinema, Supernatural, Gothic Fil...",F,19777.0
4,320401,barton fink,1991,Taylor,Audrey Taylor,TAYLOR,1,320401.0,"Thriller, Indie, Comedy-drama, Mystery, Period...",F,10255.0
...,...,...,...,...,...,...,...,...,...,...,...
427,155997,boyz n the hood,1991,Ferris,Ferris,FERRIS,1,155997.0,"Crime Fiction, Drama, Coming of age, Teen",M,14.0
428,4440277,highlander,1986,Ramirez,Lobos Ramirez,RAMIREZ,1,4440277.0,"Cult, Action/Adventure, Science Fiction, Fanta...",M,6.0
429,1917925,what ever happened to baby jane?,1962,Hudson,Baby Jane Hudson,HUDSON,1,1917925.0,"Thriller, Drama, Horror, Psychological thriller",M,6.0
430,31557,"the good, the bad and the ugly",1966,Blondie,Blondie,BLONDIE,1,31557.0,"World cinema, Epic Western, Spaghetti Western,...",F,


In [8]:
# Print each Nan row
merged_df[merged_df.isna().any(axis=1)]


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Full name,Normalized_name,Influenced,Wikipedia_movie_ID,Genres,Gender,Count
16,403227,the nightmare before christmas,1993,Jack,Jack assigns Lock,JACK,1,403227.0,,M,2763.0
51,443972,hook,1991,Jack,Jack,JACK,1,443972.0,,M,2099.0
56,1277918,charlie and the chocolate factory,2005,Charlie,Charlie Bucket,CHARLIE,1,1277918.0,,M,5282.0
57,525314,shrek 2,2004,Lillian,Queen Lillian,LILLIAN,1,525314.0,,F,4839.0
109,3700174,jumanji,1995,Sam,Sam,SAM,1,3700174.0,,M,729.0
164,335298,toy story 2,1999,Andy,Andy,ANDY,1,335298.0,,M,1940.0
195,294998,home alone 2: lost in new york,1992,Duncan,Duncan,DUNCAN,1,294998.0,,M,365.0
207,53085,toy story,1995,Davis,Andy Davis,DAVIS,1,53085.0,,M,563.0
212,343408,the aristocats,1970,Abigail,Abigail Gabble,ABIGAIL,1,343408.0,,F,362.0
220,525314,shrek 2,2004,King,King Harold,KING,1,525314.0,,M,89.0


# Do movies shape names differently for men and women ?

You could start by breaking down how often female versus male names appear with a positive influence (the boolean “influenced” indicator) and then comparing their average influence scores. Here are a few angles to consider:

### Overall comparison by gender:
Look at the proportion of female names influenced versus male names influenced.

In [30]:
# Calculate the proportion of male vs female names influenced
gender_counts = merged_df['Gender'].value_counts()

# Create a pie chart
fig = px.pie(
    gender_counts,
    values=gender_counts.values,
    names=gender_counts.index,
    title='Proportion of Male vs Female names influenced'
)

# Set transparent background and update font styles
fig.update_layout(
    template='plotly_white',  # White background with gridlines
    font=dict(size=14, color='black'),  # Adjust font size and color for labels
)

fig.show()
fig.write_html("docs/_includes/pie_chart_gender.html")

In [10]:
# Step 1: Aggregate counts and concatenate movie names for hover information
merged_df_aggregated = (
    merged_df.groupby(['Normalized_name', 'Year', 'Gender'], as_index=False)
    .agg({
        'Count': 'sum',
        'Movie Name': lambda x: ', '.join(x.unique())  # Concatenate unique movie names
    })
)

# Step 2: Sort the aggregated DataFrame by 'Count' in descending order
merged_df_sorted = merged_df_aggregated.sort_values(by='Count', ascending=False)

# Step 3: Filter top 5 male first names
top_5_male = merged_df_sorted[merged_df_sorted['Gender'] == 'M'].head(10)

# Step 4: Filter top 5 female first names
top_5_female = merged_df_sorted[merged_df_sorted['Gender'] == 'F'].head(10)

# Step 5: Plot top 5 male first names with custom color
fig_male = px.bar(
    top_5_male,
    x='Normalized_name',
    y='Count',
    title='Top 5 Male First Names and Their Count',
    labels={'Normalized_name': 'Male First Name', 'Count': 'Count'},
    hover_data={'Movie Name': True, 'Year': True}  # Exclude 'Color' from hover data
)

# Manually set bar colors for male names
fig_male.update_traces(marker_color='#002fa7')  # Lighter blue for male
fig_male.show()
fig.write_html("docs/_includes/top5_count_male.html")

# Step 6: Plot top 5 female first names with custom color
fig_female = px.bar(
    top_5_female,
    x='Normalized_name',
    y='Count',
    title='Top 5 Female First Names and Their Count',
    labels={'Normalized_name': 'Female First Name', 'Count': 'Count'},
    hover_data={'Movie Name': True, 'Year': True}  # Exclude 'Color' from hover data
)

# Manually set bar colors for female names
fig_female.update_traces(marker_color='#0077cc')  # Darker blue for female
fig_female.show()
fig.write_html("docs/_includes/top5_count_female.html")


In [None]:
top_5_male

### Shifts in modern cinema
Difference between bdfore and after modern cinema

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot styles for better aesthetics
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 14})

# 1. Load the Dataset
# Replace 'your_dataset.csv' with the path to your dataset
df = merged_df

# Group by Year and Gender, then sum the counts
gender_trend = df.groupby(['Year', 'Gender'])['Count'].sum().reset_index()

# Pivot the table for easier plotting
gender_trend_pivot = gender_trend.pivot(index='Year', columns='Gender', values='Count').fillna(0)

# Calculate percentages to observe shifts over time
gender_trend_pivot['Total'] = gender_trend_pivot.sum(axis=1)
gender_trend_pivot['Male_Percent'] = (gender_trend_pivot['M'] / gender_trend_pivot['Total']) * 100
gender_trend_pivot['Female_Percent'] = (gender_trend_pivot['F'] / gender_trend_pivot['Total']) * 100

# Visualize the Trends Over Time

plt.figure(figsize=(14, 7))
sns.lineplot(data=gender_trend_pivot, x=gender_trend_pivot.index, y='Male_Percent', label='Male Names', color='blue')
sns.lineplot(data=gender_trend_pivot, x=gender_trend_pivot.index, y='Female_Percent', label='Female Names', color='pink')
plt.title('Trend of Influenced Baby Names by Gender Over Time')
plt.xlabel('Year')
plt.ylabel('Percentage of Influenced Names')
plt.legend()
plt.show()

In [None]:
#Set plot styles for better aesthetics
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (16, 10), 'font.size': 14})

# 1. Load and Prepare the Data
# Replace 'your_dataset.csv' with the actual path to your dataset
df = merged_df

# Inspect the first few rows to understand the data structure
print("First few rows of the dataset:")
print(df.head())

# Ensure the 'Year' column is of integer type
df['Year'] = df['Year'].astype(int)

# Verify the unique values in the 'Gender' column
print("\nUnique genders in the dataset:")
print(df['Gender'].unique())

# Handle potential inconsistencies in the 'Gender' column
# For example, standardize 'M'/'F' to 'Male'/'Female' if necessary
# Uncomment and modify the following lines if needed:
# df['Gender'] = df['Gender'].replace({'M': 'Male', 'F': 'Female'})

# 2. Aggregate Influenced Names by Gender and Year
# Group by 'Year' and 'Gender' and sum the 'Count' of influenced names
gender_trend = df.groupby(['Year', 'Gender'])['Count'].sum().reset_index()

# Inspect the aggregated data
print("\nAggregated gender trend data:")
print(gender_trend.head())

# 3. Split the Data into Pre-2000 and Post-2000
# Define the cutoff year
cutoff_year = 2000

# Create separate DataFrames for each period
pre_2000 = gender_trend[gender_trend['Year'] < cutoff_year]
post_2000 = gender_trend[gender_trend['Year'] >= cutoff_year]

# Verify the splits
print("\nPre-2000 Data:")
print(pre_2000.head())

print("\nPost-2000 Data:")
print(post_2000.head())

# 4. Calculate Percentage of Influenced Names by Gender
def calculate_percentage(data):
    """
    Calculates the percentage of male and female influenced names per year.
    
    Parameters:
    - data (DataFrame): Aggregated data with 'Year', 'Gender', and 'Count'.
    
    Returns:
    - pivot (DataFrame): Pivoted table with percentage columns.
    """
    # Pivot the table to have genders as columns
    pivot = data.pivot(index='Year', columns='Gender', values='Count').fillna(0)
    
    # Calculate total influenced names per year
    pivot['Total'] = pivot.sum(axis=1)
    
    # Calculate percentage for each gender
    pivot['Male_Percent'] = (pivot.get('M', 0) / pivot['Total']) * 100
    pivot['Female_Percent'] = (pivot.get('F', 0) / pivot['Total']) * 100
    
    return pivot

# Calculate percentages for both periods
pre_2000_pivot = calculate_percentage(pre_2000)
post_2000_pivot = calculate_percentage(post_2000)

# Inspect the pivot tables
print("\nPre-2000 Percentages:")
print(pre_2000_pivot.head())

print("\nPost-2000 Percentages:")
print(post_2000_pivot.head())

# 5. Visualize the Trends

# Create subplots: one for Pre-2000 and one for Post-2000
fig, axes = plt.subplots(2, 1, figsize=(18, 12), sharex=False)

# Plot for Pre-2000
sns.lineplot(ax=axes[0], data=pre_2000_pivot, x=pre_2000_pivot.index, y='Male_Percent', label='Male Names', color='blue')
sns.lineplot(ax=axes[0], data=pre_2000_pivot, x=pre_2000_pivot.index, y='Female_Percent', label='Female Names', color='pink')
axes[0].set_title('Trend of Influenced Baby Names by Gender (Before 2000)')
axes[0].set_xlabel('Year')
axes[0].set_ylabel('Percentage of Influenced Names')
axes[0].legend()
axes[0].grid(True)

# Plot for Post-2000
sns.lineplot(ax=axes[1], data=post_2000_pivot, x=post_2000_pivot.index, y='Male_Percent', label='Male Names', color='blue')
sns.lineplot(ax=axes[1], data=post_2000_pivot, x=post_2000_pivot.index, y='Female_Percent', label='Female Names', color='pink')
axes[1].set_title('Trend of Influenced Baby Names by Gender (2000 and After)')
axes[1].set_xlabel('Year')
axes[1].set_ylabel('Percentage of Influenced Names')
axes[1].legend()
axes[1].grid(True)

# Adjust layout for better spacing
plt.tight_layout()

# Display the plots
plt.show()

# Optional: Save the plots to files
# fig.savefig('gender_trend_pre_post_2000.png')

### By genre



In [None]:
# 1. Load the Dataset
# If genre information is not in the main dataset, ensure you have a separate dataset linking movies to genres
# For this example, we'll assume 'Genre' is a column in your main dataset

df = merged_df.copy()

# If a movie can belong to multiple genres, split them
# Assuming genres are separated by commas
df['Genres'] = df['Genres'].str.split(',')

# Explode the genres so each genre has its own row
df_exploded = df.explode('Genres')

# Clean whitespace in genres
df_exploded['Genres'] = df_exploded['Genres'].str.strip()

# 2. Aggregate Influenced Names by Genre and Gender
genre_gender_trend = df_exploded.groupby(['Genres', 'Gender'])['Count'].sum().reset_index()

# 3. Identify Top Influential Genres
# Let's focus on genres with the highest total influence
top_genres = genre_gender_trend.groupby('Genres')['Count'].sum().reset_index()
top_genres = top_genres.sort_values(by='Count', ascending=False).head(10)  # Top 10 genres

# Filter the original genre_gender_trend for top genres
genre_gender_trend_top = genre_gender_trend[genre_gender_trend['Genres'].isin(top_genres['Genres'])]

# 4. Visualize the Influence by Genre and Gender

plt.figure(figsize=(16, 10))
sns.barplot(data=genre_gender_trend_top, x='Count', y='Genres', hue='Gender')
plt.title('Influence of Movie Genres on Baby Names by Gender')
plt.xlabel('Number of Influenced Names')
plt.ylabel('Genre')
plt.legend(title='Gender')
plt.show()

In [None]:

# Plot styles
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (16, 10), 'font.size': 14})

df = merged_df.copy()

# Check for NaN values in 'Genres' column
nan_genres = df['Genres'].isna().sum()
print(f"\nNumber of NaN values in 'Genres' before handling: {nan_genres}")

# Drop rows where 'Genres' is NaN
df = df.dropna(subset=['Genres'])

# If a movie can belong to multiple genres, split them
df['Genres'] = df['Genres'].str.split(',')

# Explode the genres so each genre has its own row
df_exploded = df.explode('Genres')

# Clean whitespace in genres
df_exploded['Genres'] = df_exploded['Genres'].str.strip()

# Group by 'Genres' and 'Gender' and sum the 'Count'
genre_gender_trend = df_exploded.groupby(['Genres', 'Gender'])['Count'].sum().reset_index()

# Calculate total influence per genre
top_genres_total = genre_gender_trend.groupby('Genres')['Count'].sum().reset_index()

# Sort genres by total influence in descending order and select top 10
top_genres = top_genres_total.sort_values(by='Count', ascending=False).head(10)

# Filter the original genre_gender_trend for top genres
genre_gender_trend_top = genre_gender_trend[genre_gender_trend['Genres'].isin(top_genres['Genres'])]

# Pivot the table to have genders as separate columns
genre_pivot = genre_gender_trend_top.pivot(index='Genres', columns='Gender', values='Count').fillna(0)

# Calculate total influenced names per genre
genre_pivot['Total'] = genre_pivot.sum(axis=1)

# Calculate percentage for each gender within genres
genre_pivot['M_Percent'] = (genre_pivot['M'] / genre_pivot['Total']) * 100
genre_pivot['F_Percent'] = (genre_pivot['F'] / genre_pivot['Total']) * 100

# Reset index to turn 'Genres' back into a column
genre_pivot = genre_pivot.reset_index()

# Melt the pivot table to long format for easier plotting with seaborn
genre_percentage_melted = genre_pivot.melt(id_vars='Genres', value_vars=['M_Percent', 'F_Percent'], var_name='Gender', value_name='Percentage')

# Replace 'M_Percent'/'F_Percent' with 'M'/'F' for clarity in the plot
genre_percentage_melted['Gender'] = genre_percentage_melted['Gender'].str.replace('_Percent', '')

# Create a horizontal bar plot to display percentage distribution within each genre
plt.figure(figsize=(18, 10))
sns.barplot(data=genre_percentage_melted, 
            x='Percentage', 
            y='Genres', 
            hue='Gender', 
            palette={'M': 'blue', 'F': 'pink'})

plt.title('Percentage of influenced baby names by gender across top genres')
plt.xlabel('Percentage of influenced names')
plt.ylabel('Genre')
plt.legend(title='Gender', labels=['Male', 'Female'])
plt.xlim(0, 100)  # Since percentages range from 0 to 100
plt.grid(True, axis='x', linestyle='--', alpha=0.7)

plt.show()

In [31]:
def plot_genre_gender_influence(df):
    """
    Plots the percentage distribution of influenced baby names
    
    Parameters:
    - df (DataFrame): The dataset containing influenced baby names and genres.
    
    Returns:
    - None: Displays the plot.
    """

    # Drop rows where 'Genres' is NaN
    df = df.dropna(subset=['Genres'])

    # If a movie can belong to multiple genres, split them
    df['Genres'] = df['Genres'].str.split(',')

    # Explode the genres so each genre has its own row
    df_exploded = df.explode('Genres')

    # Clean whitespace in genres
    df_exploded['Genres'] = df_exploded['Genres'].str.strip()

    # Group by 'Genres' and 'Gender' and sum the 'Count'
    genre_gender_trend = df_exploded.groupby(['Genres', 'Gender'])['Count'].sum().reset_index()

    # Calculate total influence per genre
    top_genres_total = genre_gender_trend.groupby('Genres')['Count'].sum().reset_index()

    # Sort genres by total influence in descending order and select top 10
    top_genres = top_genres_total.sort_values(by='Count', ascending=False).head(10)

    # Filter the original genre_gender_trend for top genres
    genre_gender_trend_top = genre_gender_trend[genre_gender_trend['Genres'].isin(top_genres['Genres'])]

    # Pivot the table to have genders as separate columns
    genre_pivot = genre_gender_trend_top.pivot(index='Genres', columns='Gender', values='Count').fillna(0)

    # Calculate total influenced names per genre
    genre_pivot['Total'] = genre_pivot.sum(axis=1)

    # Calculate percentage for each gender within genres
    genre_pivot['M_Percent'] = (genre_pivot['M'] / genre_pivot['Total']) * 100
    genre_pivot['F_Percent'] = (genre_pivot['F'] / genre_pivot['Total']) * 100

    # Reset index to turn 'Genres' back into a column
    genre_pivot = genre_pivot.reset_index()

    # Melt the pivot table to long format for easier plotting with seaborn
    genre_percentage_melted = genre_pivot.melt(id_vars='Genres', value_vars=['M_Percent', 'F_Percent'], var_name='Gender', value_name='Percentage')

    # Replace 'M_Percent'/'F_Percent' with 'M'/'F' for clarity in the plot
    genre_percentage_melted['Gender'] = genre_percentage_melted['Gender'].str.replace('_Percent', '')

    # Create a horizontal bar plot to display percentage distribution within each genre
    fig = px.bar(
        genre_percentage_melted,
        x='Percentage',
        y='Genres',
        color='Gender',
        color_discrete_map={'M': 'blue', 'F': 'pink'},
        title='Percentage of influenced baby names by gender across top genres',
        labels={'Percentage': 'Percentage of influenced names', 'Genre': 'Genre', 'Gender': 'Gender'},
        orientation='h'  # Horizontal bars
    )

    fig.update_layout(
        xaxis=dict(range=[0, 100]),  # Since percentages range from 0 to 100
        barmode='stack',  # Stack the bars for gender comparison
        template='plotly_white'  # White background with gridlines
    )

    fig.show()
    fig.write_html("docs/_includes/plot_genre_gender_influence.html")


In [32]:
plot_genre_gender_influence(merged_df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Statistical significance:
Perform statistical tests (like a t-test or nonparametric equivalent) to see if differences in influence scores or proportions are significant. This could help you say with confidence whether the effect on one gender’s names is truly different from the other’s, rather than a random fluctuation.

By combining these approaches—exploring proportions, mean influence levels, changes over time, and differences by genre—you can paint a richer picture of how gender interacts with cinematic influence on baby names.