In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly.express as px
import plotly.graph_objects as go
import json
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Analysis

In [2]:
# Load the processed movie -> director table (columns used below: wikipedia_movie_id, Gender).
movie_directors = pd.read_csv('data/processed/movies_director.csv')

In [3]:
# Preview the first rows of the director table.
movie_directors.head()

Unnamed: 0,wikipedia_movie_id,Director,Gender
0,975900,John Carpenter,M
1,9363483,Donald Cammell,M
2,261236,Robert van Ackeren,M
3,10408933,Henry King,M
4,175026,Dorothy Arzner,F


In [4]:
# Count of movies per director gender, drawn as a histogram over the Gender column.
fig = px.histogram(
    data_frame=movie_directors,
    x='Gender',
    title='Gender distribution of movies director',
)
fig.show()

# Analysis with characters

## Splitting the movie IDs by director gender (male / female)

In [5]:
# Keep movie ids as strings: the other tables in this notebook are converted the
# same way, and the `isin` filters below compare these ids across tables.
movie_directors["wikipedia_movie_id"] = movie_directors["wikipedia_movie_id"].astype(str)

In [6]:
# Split the director table by the gender recorded for the director.
# A vectorised comparison replaces the per-row lambda; rows with a missing
# gender compare False and therefore end up in neither subset, as before.
movies_M = movie_directors[movie_directors["Gender"] == "M"]
movies_F = movie_directors[movie_directors["Gender"] == "F"]

## Actors age

In [7]:
# Load the characters metadata table.
char_metadata_df = pd.read_csv("data/processed/characters_metadata.csv")
# Movie ids as strings, matching the conversion applied to movie_directors.
char_metadata_df["wikipedia_movie_id"] = char_metadata_df["wikipedia_movie_id"].astype(str)

Split `char_metadata_df` by the gender of the movie's director. First, remove duplicate (movie, actor) pairs so that each row describes an actor rather than a character.
To check for duplicates: `char_metadata_df.where(char_metadata_df.duplicated(subset=["wikipedia_movie_id", "actor_name"]) == True).value_counts()`

In [8]:
# One row per (movie, actor): an actor credited with several characters in the
# same movie must be counted only once in the actor-level statistics.
actors_age = char_metadata_df.drop_duplicates(subset=["wikipedia_movie_id", "actor_name"]).copy()

# Encode gender as 0 = male, 1 = female. Any other value (missing or unknown)
# stays NaN so that it is ignored by value_counts() and mean() downstream,
# instead of being silently counted as female.
actors_age["actor_gender_binary"] = actors_age["actor_gender"].map({"M": 0, "F": 1})

At this step, we fill in the missing actor ages with the difference between the movie release date and the actor's date of birth.

In [9]:
def to_datetime(df, column, min_date='1850-01-01', max_date='2022-01-01'):
    """Keep the rows whose date lies strictly between two bounds and reduce the date to its year.

    The bounds are compared as strings, so missing values (whose text form
    falls outside the range) are dropped together with out-of-range dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is NOT modified; a filtered copy is returned.
    column : str
        Name of the column holding the dates.
    min_date, max_date : str
        Exclusive lower / upper bounds, compared lexicographically against the
        text form of the column.

    Returns
    -------
    pandas.DataFrame
        Copy of the retained rows with ``column`` replaced by the year
        (int64 when every date parsed, float with NaN otherwise).
    """
    # Work on a text view of the column instead of overwriting the caller's data.
    as_text = df[column].astype(str)
    in_range = (as_text > min_date) & (as_text < max_date)

    # Explicit copy: the assignment below must not write into a slice of `df`.
    result = df.loc[in_range].copy()

    # Unparseable strings become NaT and therefore a missing year.
    years = pd.to_datetime(as_text[in_range], errors="coerce").dt.year
    if years.notna().all():
        years = years.astype("int64")
    result[column] = years

    return result

In [10]:
# Reduce the birth date to a birth year (to_datetime also filters the rows by date range).
actors_age = to_datetime(actors_age, "actor_date_of_birth")

# Where the age is missing, use release date minus birth year instead.
age_is_missing = actors_age["actor_age"].isna()
actors_age.loc[age_is_missing, "actor_age"] = actors_age["movie_release_date"].sub(
    actors_age["actor_date_of_birth"]
)

In [11]:
# Actors of movies directed by men / by women, selected through the movie id.
actors_age_M = actors_age.loc[
    actors_age["wikipedia_movie_id"].isin(movies_M["wikipedia_movie_id"])
]
actors_age_F = actors_age.loc[
    actors_age["wikipedia_movie_id"].isin(movies_F["wikipedia_movie_id"])
]

In [12]:
def plot_age_distribution(actors_age, gender_real:str, min_actors:int=200):
    """Plot the number of male and female actors at each age.

    Parameters
    ----------
    actors_age : pandas.DataFrame
        Needs the columns ``actor_age`` and ``actor_gender_binary``
        (0 = male, 1 = female).
    gender_real : str
        Director gender label; only used in the figure title.
    min_actors : int, default 200
        Ages with fewer actors than this (both genders combined) are left out
        of the plot and of the average-age computation.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    counts = (
        actors_age.groupby("actor_age")["actor_gender_binary"]
        .value_counts()
        .unstack(fill_value=0)
        # Guarantee both gender columns, even if one gender is absent from the input.
        .reindex(columns=[0, 1], fill_value=0)
        .reset_index()
    )
    counts["Tot_actors"] = counts[0] + counts[1]
    counts = counts[counts["Tot_actors"] >= min_actors]

    # Average age per gender, weighted by the number of actors at each retained age.
    avg_age_male = (counts['actor_age'] * counts[0]).sum() / counts[0].sum()
    avg_age_female = (counts['actor_age'] * counts[1]).sum() / counts[1].sum()

    # Height of the vertical average lines; 0 when no age passes the threshold
    # (max() over an empty array would raise).
    peak = counts[[0, 1]].to_numpy().max() if len(counts) else 0

    male_trace = go.Scatter(
        x=counts['actor_age'],
        y=counts[0],
        mode="lines+markers",
        name="Male actors",
    )
    female_trace = go.Scatter(
        x=counts['actor_age'],
        y=counts[1],
        mode="lines+markers",
        name="Female actors",
    )

    # Vertical lines marking the average ages.
    avg_male_line = go.Scatter(
        x=[avg_age_male, avg_age_male],
        y=[0, peak],
        mode="lines",
        line=dict(color="red", dash="dash"),
        name=f"Avg Male Age: {avg_age_male:.0f}"
    )
    avg_female_line = go.Scatter(
        x=[avg_age_female, avg_age_female],
        y=[0, peak],
        mode="lines",
        line=dict(color="red"),
        name=f"Avg Female Age: {avg_age_female:.0f}"
    )

    fig = go.Figure(data=[male_trace, female_trace, avg_male_line, avg_female_line])
    fig.update_layout(
        title=f"Number of Characters by Age - {gender_real} Filmmaker",
        xaxis_title="Age",
        yaxis_title="Number of Actors",
        legend_title="Legend",
        template="plotly_white"
    )

    fig.show()


In [13]:
# Age distribution of the actors in movies whose director is recorded as male.
plot_age_distribution(actors_age_M, "Male")

In [14]:
# Age distribution of the actors in movies whose director is recorded as female.
plot_age_distribution(actors_age_F, "Female")

**Discussion:** Plot percentages rather than raw counts, and show both director genders on the same figure.

## Number of actors distribution

In [15]:
# Proportion of actresses per release date: actor_gender_binary is 1 for women,
# so its mean within a group is the female share.
characters_gender_time_M = actors_age_M.groupby("movie_release_date")["actor_gender_binary"].mean().reset_index()
characters_gender_time_F = actors_age_F.groupby("movie_release_date")["actor_gender_binary"].mean().reset_index()

# Number of distinct movies behind each proportion. The tables hold one row per
# actor, so the ids must be counted with nunique(); count() would return the
# number of actor rows instead of the number of movies.
release_years_M = actors_age_M.groupby("movie_release_date")["wikipedia_movie_id"].nunique()
release_years_F = actors_age_F.groupby("movie_release_date")["wikipedia_movie_id"].nunique()

# Attach by release date (not by position) so the two tables cannot get misaligned.
characters_gender_time_M["Num_movies"] = characters_gender_time_M["movie_release_date"].map(release_years_M)
characters_gender_time_F["Num_movies"] = characters_gender_time_F["movie_release_date"].map(release_years_F)

fig = go.Figure()

# Female representation in male-directed movies
fig.add_trace(go.Bar(
    x=characters_gender_time_M["movie_release_date"],
    y=characters_gender_time_M["actor_gender_binary"],  # proportion of actresses
    name="Male Directors",
    width=0.4,
    offsetgroup=0  # separate offset groups put the two series side by side
))

# Female representation in female-directed movies
fig.add_trace(go.Bar(
    x=characters_gender_time_F["movie_release_date"],
    y=characters_gender_time_F["actor_gender_binary"],  # proportion of actresses
    name="Female Directors",
    width=0.4,
    offsetgroup=1
))

fig.update_layout(
    title="Evolution of Female Characters - Representation by Director Gender",
    xaxis_title="Year",
    yaxis_title="Proportion of Female Actresses",
    barmode="group",
    xaxis=dict(
        tickmode='array',
        # one tick every 5th release date present in the male-director series
        tickvals=characters_gender_time_M["movie_release_date"][::5],
        ticktext=characters_gender_time_M["movie_release_date"][::5]
    ),
    legend_title="Director Gender",
    height=600
)

# Display explicitly, like the other plotting cells of this notebook.
fig.show()

## Characters and movie metadata

In [19]:
# Load the movies metadata table.
movies_metadata_df = pd.read_csv("data/processed/movies_metadata.csv")
# Movie ids as strings, matching the conversion applied to the other tables.
movies_metadata_df["wikipedia_movie_id"] = movies_metadata_df["wikipedia_movie_id"].astype(str)

In [20]:
import ast

# Keep only the metadata columns used by the genre / country analysis.
movies_df = movies_metadata_df[
    ["wikipedia_movie_id", "movie_release_date", "movie_genres", "movie_countries"]
].copy(deep=True)

# For every movie, collect the genders of all its character rows into one list.
genders_grouped = (
    char_metadata_df.groupby("wikipedia_movie_id")["actor_gender"]
    .apply(list)
    .reset_index()
    .rename(columns={"actor_gender": "actor_genders"})
)

# Left join: movies without any character row keep a missing actor_genders value.
movies_df = movies_df.merge(genders_grouped, on="wikipedia_movie_id", how="left")

# String cells in these two columns are evaluated as Python literals
# (presumably list literals written by the preprocessing step); other values pass through.
for list_column in ("movie_genres", "movie_countries"):
    movies_df[list_column] = movies_df[list_column].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

movies_df


Unnamed: 0,wikipedia_movie_id,movie_release_date,movie_genres,movie_countries,actor_genders
0,975900,2001,"[Thriller, Science Fiction, Horror, Adventure,...",[United States],"[F, F, M, M, F, F, F, M, M, M, M, M, M, M, M, ..."
1,3196793,2000,"[Horror, Biographical and Real-Life Inspired, ...",[United States],"[M, F, M, F, M, M, M, F, F, M, M, M, M, M, M]"
2,28463795,1988,"[Crime and Mystery, Drama]",[Norway],"[M, F, M, F]"
3,9363483,1987,[Thriller],[United Kingdom],"[M, F]"
4,261236,1983,[Drama],[Germany],"[F, M, M]"
...,...,...,...,...,...
73866,35228177,2011,[Drama],[United States],
73867,34980460,2011,"[Biographical and Real-Life Inspired, Drama, D...","[Ireland, United Kingdom]",
73868,9971909,1972,[Comedy],[United States],"[M, M]"
73869,913762,1992,"[Science Fiction, World Cinema, Adventure, Ani...",[Japan],"[M, M, F, F, M, M, M, M, F, M, M, F, M, F, M]"


In [21]:
import plotly.graph_objects as go

def _gender_share_by_genre(movies_df):
    """Return the exploded (one row per actor and genre) table and the gender percentages per genre."""
    exploded = movies_df.explode("actor_genders").explode("movie_genres")
    counts = exploded.groupby(['movie_genres', 'actor_genders']).size().unstack(fill_value=0)
    # Guarantee both gender columns so the plots below never fail on a missing key.
    for gender in ("F", "M"):
        if gender not in counts.columns:
            counts[gender] = 0
    percentages = counts.div(counts.sum(axis=1), axis=0) * 100
    return exploded, percentages


def plot_top10_genres(movies_df, gender_real, top_n=10):
    """Stacked bar chart of the female / male share in the most frequent genres.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Needs the list-valued columns ``actor_genders`` and ``movie_genres``.
    gender_real : str
        Director gender label; only used in the figure title.
    top_n : int, default 10
        Number of genres shown, ranked by the number of exploded
        (actor, genre) rows.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    exploded, percentages = _gender_share_by_genre(movies_df)
    top_genres = (
        exploded.groupby('movie_genres').size().sort_values(ascending=False).head(top_n).index
    )
    # reindex (rather than .loc) tolerates a top genre without any gendered row.
    top_percentages = percentages.reindex(top_genres)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=top_percentages.index,
        y=top_percentages['F'],
        name="Female",
        marker_color="#EF553B",
    ))
    fig.add_trace(go.Bar(
        x=top_percentages.index,
        y=top_percentages['M'],
        name="Male",
        marker_color="#636EFA"
    ))

    fig.update_layout(
        title=f"Gender Representation Across Top-{top_n} Movie Genres - {gender_real} Filmmaker",
        xaxis_title="Movie Genre",
        yaxis_title="Percentage of Characters (%)",
        barmode='stack',
        xaxis=dict(
            tickmode='array',
            tickvals=top_percentages.index,
            ticktext=top_percentages.index,
            tickangle=45
        ),
        height=600,
        legend=dict(title="Gender", orientation="h", x=0.5, xanchor="center", y=1.1)
    )
    fig.show()


def plot_top15_genres(movies_df, gender_real, top_n=15):
    """Bar chart of the genres with the highest percentage of female characters.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Needs the list-valued columns ``actor_genders`` and ``movie_genres``.
    gender_real : str
        Director gender label; only used in the figure title.
    top_n : int, default 15
        Number of genres shown.

    Returns
    -------
    None
        The figure is displayed with ``fig2.show()``.
    """
    _, percentages = _gender_share_by_genre(movies_df)
    top_female_share = percentages["F"].sort_values(ascending=False).head(top_n)

    fig2 = go.Figure()
    fig2.add_trace(go.Bar(
        x=top_female_share.index,
        y=top_female_share,
        name="Female",
        marker_color="#EF553B",
    ))

    fig2.update_layout(
        title=f"Top-{top_n} Genres with Highest Percentage of Female Characters - {gender_real} Filmmaker",
        xaxis_title="Movie Genre",
        yaxis_title="Percentage of Female Characters (%)",
        xaxis=dict(
            tickmode='array',
            tickvals=top_female_share.index,
            ticktext=top_female_share.index,
            tickangle=45
        ),
        height=600
    )
    fig2.show()


In [22]:
# Movie-level table split by director gender, selected through the movie id.
movies_genres_M = movies_df.loc[
    movies_df["wikipedia_movie_id"].isin(movies_M["wikipedia_movie_id"])
]
movies_genres_F = movies_df.loc[
    movies_df["wikipedia_movie_id"].isin(movies_F["wikipedia_movie_id"])
]

In [23]:
# Gender share in the most frequent genres, first for male-directed, then for female-directed movies.
plot_top10_genres(movies_genres_M, "Male")
plot_top10_genres(movies_genres_F, "Female")

**Discussion:** Add the total number of movies

In [24]:
# Genres with the highest female share, first for male-directed, then for female-directed movies.
plot_top15_genres(movies_genres_M, "Male")
plot_top15_genres(movies_genres_F, "Female")

In [25]:
import plotly.graph_objects as go

def plot_genre_country_gender_representation(movies_genres_country_df, gender_real="",
                                             top_n_countries=10, min_female_pct=40,
                                             max_bubble_size=150):
    """Bubble chart of the female share per (genre, country) pair.

    Parameters
    ----------
    movies_genres_country_df : pandas.DataFrame
        Needs the list-valued columns ``actor_genders``, ``movie_genres`` and
        ``movie_countries``.
    gender_real : str
        Director gender label; only used in the figure title.
    top_n_countries : int, default 10
        Number of countries kept. Countries are ranked by the number of
        (genre, country) pairs they appear in.
    min_female_pct : float, default 40
        Only pairs with at least this percentage of female actors are drawn.
    max_bubble_size : float, default 150
        Marker size given to the pair with the most actors; the pair with the
        fewest actors gets size 0.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    exploded = (
        movies_genres_country_df
        .explode("actor_genders")
        .explode("movie_genres")
        .explode("movie_countries")
    )
    counts = exploded.groupby(['movie_genres', 'movie_countries', 'actor_genders']).size().unstack(fill_value=0)
    # Guarantee the female column so the percentage below never fails on a missing key.
    if 'F' not in counts.columns:
        counts['F'] = 0

    total_actors = counts.sum(axis=1)
    female_pct = counts['F'] / total_actors * 100

    countries = sorted(
        counts.groupby(level='movie_countries').size()
        .sort_values(ascending=False)
        .head(top_n_countries)
        .index
    )

    selected = female_pct[female_pct.index.get_level_values('movie_countries').isin(countries)]
    selected = selected[selected >= min_female_pct]
    selected_totals = total_actors.loc[selected.index]

    # Min-max scale the actor totals to marker sizes. When every pair has the
    # same total the range is zero, so fall back to a constant size instead of
    # dividing by zero.
    span = selected_totals.max() - selected_totals.min()
    if span > 0:
        bubble_size = (selected_totals - selected_totals.min()) / span * max_bubble_size
    else:
        bubble_size = pd.Series(float(max_bubble_size), index=selected_totals.index)

    genres = sorted(counts.index.get_level_values('movie_genres').unique())

    # Categorical axes are drawn through integer positions with custom tick labels.
    genre_indices = [genres.index(genre) for genre in selected.index.get_level_values('movie_genres')]
    country_indices = [countries.index(country) for country in selected.index.get_level_values('movie_countries')]

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=genre_indices,
        y=country_indices,
        mode='markers',
        marker=dict(
            size=bubble_size,
            color=selected,
            showscale=True,
            opacity=0.6,
            line=dict(width=1, color='white')
        ),
        text=selected.index.get_level_values('movie_genres') + " - " + selected.index.get_level_values('movie_countries'),
        # The hover must report the real actor total, not the scaled marker size.
        customdata=selected_totals.to_numpy(),
        hovertemplate='%{text}<br>Female Percentage: %{marker.color:.2f}%<br>Total Actors: %{customdata}<extra></extra>'
    ))

    fig.update_layout(
        title=f"Gender Representation Across Movie Genres and Top-{top_n_countries} Countries - {gender_real} Filmmaker",
        xaxis_title="Movie Genre",
        yaxis_title="Country",
        xaxis=dict(
            tickvals=list(range(len(genres))),
            ticktext=genres,
            tickangle=45
        ),
        yaxis=dict(
            tickvals=list(range(len(countries))),
            ticktext=countries
        ),
        showlegend=False,
        height=600
    )

    fig.show()


In [123]:
# Genre x country bubble chart for male-directed movies.
plot_genre_country_gender_representation(movies_genres_M, "Male")

In [124]:
# Genre x country bubble chart for female-directed movies.
plot_genre_country_gender_representation(movies_genres_F, "Female")

**Discussion:** Reduce the range of the color scale on the right; consider showing the result on a map.

## With TV tropes

In [126]:
from src.data.data_loader import load_csv
# Read the raw TV-tropes cluster file as a two-column, tab-separated table without a header row.
tvtropes_df = load_csv(
    'data/raw/' + 'tvtropes.clusters.txt',
    has_column_names=False,
    is_tsv=True,
    column_names=['character_type', 'metadata'],
)

Loaded data from data/raw/tvtropes.clusters.txt, shape: (500, 2)


In [127]:
# Each metadata cell is the text of a dict literal; parse it into a real dict.
tvtropes_df['metadata'] = tvtropes_df['metadata'].apply(ast.literal_eval)

# Expand the dicts into columns in one DataFrame construction
# (much cheaper than building one Series per row with .apply(pd.Series)).
metadata_columns = pd.DataFrame(tvtropes_df['metadata'].tolist(), index=tvtropes_df.index)
tvtropes_df = pd.concat([tvtropes_df.drop(columns=['metadata']), metadata_columns], axis=1)
tvtropes_df = tvtropes_df.rename(columns={'char': 'char_name', 'actor': 'actor_name', 'movie': 'movie_name', 'id': 'char_actor_id'})

# Add the actor's gender, looked up by actor name. Rows without a known gender
# are removed before building the lookup so that a later row with a missing
# gender cannot overwrite a known one; among the remaining rows the last one
# wins, as it did with the previous dict-based mapping.
actor_gender_by_name = (
    char_metadata_df.dropna(subset=['actor_name', 'actor_gender'])
    .drop_duplicates(subset='actor_name', keep='last')
    .set_index('actor_name')['actor_gender']
)
tvtropes_df['actor_gender'] = tvtropes_df['actor_name'].map(actor_gender_by_name)

tvtropes_df.head()

Unnamed: 0,character_type,char_name,movie_name,char_actor_id,actor_name,actor_gender
0,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane,M
1,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen,M
2,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn,M
3,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader,M
4,adventurer_archaeologist,Indiana Jones,Indiana Jones and the Kingdom of the Crystal S...,/m/0jzx78,Harrison Ford,M


In [137]:
# Complete the tropes table with the movie id, release date, genres and
# countries of each trope, using char_actor_id as the join key.

# Actor-level table enriched with the movie genres / countries (joined on the movie id).
char_movie_df = pd.merge(
    actors_age,
    movies_df[['wikipedia_movie_id', "movie_countries", "movie_genres"]],
    how='left',  # keep every row of actors_age
    on=['wikipedia_movie_id']
)

# Lookup for the tropes, built from the full characters table rather than from
# actors_age: actors_age has already lost rows (date-of-birth filtering and
# de-duplication per movie/actor), which would leave the affected tropes
# without a movie. One row per char_actor_id so the left join below cannot
# multiply trope rows.
char_lookup_df = (
    char_metadata_df[['char_actor_id', 'wikipedia_movie_id', 'movie_release_date']]
    .dropna(subset=['char_actor_id'])
    .drop_duplicates(subset='char_actor_id')
    .merge(
        movies_df[['wikipedia_movie_id', "movie_countries", "movie_genres"]],
        how='left',
        on='wikipedia_movie_id',
    )
)

tvtropes_completed_df = pd.merge(
    tvtropes_df,
    char_lookup_df[['char_actor_id', 'wikipedia_movie_id', "movie_countries", "movie_genres", "movie_release_date"]],
    how='left',  # keep every row of tvtropes_df, matched or not
    on=['char_actor_id']
)

In [178]:
import plotly.graph_objects as go

def plot_gender_tvtropes(tvtropes_completed_df, gender_real, min_female_count=0):
    """Stacked bar chart of the number of female / male actors per character type.

    Parameters
    ----------
    tvtropes_completed_df : pandas.DataFrame
        Needs the columns ``character_type`` and ``actor_gender``.
    gender_real : str
        Director gender label; only used in the figure title.
    min_female_count : int, default 0
        Character types with fewer female actors than this are left out
        (the default keeps every character type).

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    counts = tvtropes_completed_df.groupby(['character_type', 'actor_gender']).size().unstack(fill_value=0)
    # Guarantee both gender columns: a small subset may contain a single gender.
    for gender in ('F', 'M'):
        if gender not in counts.columns:
            counts[gender] = 0
    counts = counts[counts['F'] >= min_female_count]

    fig = go.Figure()

    # Female actors (bottom of the stack)
    fig.add_trace(go.Bar(
        x=counts.index,
        y=counts['F'],
        name='Female',
        marker_color="rgba(255, 99, 132, 0.7)",  # soft red
        text=counts['F'],
        hovertemplate='Character Type: %{x}<br>Female Actors: %{y}<extra></extra>',
    ))

    # Male actors (stacked on top)
    fig.add_trace(go.Bar(
        x=counts.index,
        y=counts['M'],
        name='Male',
        marker_color="rgba(54, 162, 235, 0.7)",  # soft blue
        text=counts['M'],
        hovertemplate='Character Type: %{x}<br>Male Actors: %{y}<extra></extra>',
    ))

    fig.update_layout(
        title=f"Gender Representation Across Character Types - {gender_real} Filmmaker",
        xaxis_title="Character Type",
        barmode='stack',
        showlegend=True,
        height=600,
        xaxis=dict(
            tickangle=45,  # rotated so the long trope names stay readable
            tickfont=dict(size=10),
            tickmode='array',
            tickvals=counts.index,
            ticktext=counts.index,
        ),
        yaxis=dict(
            title="Number of Actors",
            tickfont=dict(size=12),
        ),
        legend_title="Gender",
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="right",
            x=1,
        ),
        margin=dict(t=50, b=100, l=50, r=50),
        template="plotly_white"
    )

    fig.show()


In [179]:
# Tropes split by director gender, selected through the movie id.
tvtropes_M = tvtropes_completed_df.loc[
    tvtropes_completed_df["wikipedia_movie_id"].isin(movies_M["wikipedia_movie_id"])
]
tvtropes_F = tvtropes_completed_df.loc[
    tvtropes_completed_df["wikipedia_movie_id"].isin(movies_F["wikipedia_movie_id"])
]

In [180]:
# Character types by actor gender for male-directed movies.
plot_gender_tvtropes(tvtropes_M, "Male")

In [181]:
# Character types by actor gender for female-directed movies.
plot_gender_tvtropes(tvtropes_F, "Female")

**Discussion:** Go check the tv-tropes.