In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go



In [2]:
# Load the per-movie character-name table (one row per name occurrence in a movie)
char_order_csv = 'processed_data/name_by_movie_ordered_df.csv'
char_order_df = pd.read_csv(char_order_csv)
display(char_order_df)

Unnamed: 0,wiki_ID,char_words,order,gender
0,3217,Gold,6.0,
1,3217,Linda,7.0,F
2,3217,Henry,4.0,M
3,3217,Duke,4.0,M
4,3217,Warrior,9.0,M
...,...,...,...,...
172901,37478048,Ajay,9.0,M
172902,37501922,Murphy,3.0,F
172903,37501922,Hunter,1.0,M
172904,37501922,John,1.0,M


## x. TMDB data graphs
We now create graphs that showcase the TMDB data. More precisely, they are about the order of characters in movies. The aim here is to create 2 graphs in total:
- **Graph A:** A simple graph that shows the average order (importance of role) of the top 10 character names in movies, ordered by the smallest average order (which means the most important characters). This is to see if some character names get more important roles than others.
- **Graph B:** An interactive graph that shows the top 10 character names in movies (by number of occurrences), with a slider that allows selecting a specific order. This is to see if some names are usually the leader in the movie, or if they are more often the sidekick.

### x.1. Graph A
Average order of the top 10 character names in movies

In [3]:
# Words picked up as "names" that are not actual character names
NON_NAME_WORDS = ['The', 'Man', 'Woman', 'Girl', 'Boy']

# Build the cleaned frame in a single non-mutating chain (no in-place edits on a
# frame derived from dropna, which can trigger SettingWithCopyWarning):
#   1. drop rows without an order value,
#   2. sort by order (ascending),
#   3. replace the non-name words with NaN (vectorised isin/mask instead of a
#      row-wise apply),
#   4. index the frame by (wiki_ID, char_words).
char_order_df = (
    char_order_df
    .dropna(subset=['order'])
    .sort_values('order')
    .assign(char_words=lambda df: df['char_words'].mask(df['char_words'].isin(NON_NAME_WORDS)))
    .set_index(['wiki_ID', 'char_words'])
)
display(char_order_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,order,gender
wiki_ID,char_words,Unnamed: 2_level_1,Unnamed: 3_level_1
8166078,General,0.0,M
23924255,Carlson,0.0,F
23924255,Elizabeth,0.0,F
905483,Mark,0.0,M
13001345,Damini,0.0,F
...,...,...,...
31306385,Rock,169.0,M
31306385,Fan,169.0,M
300972,Gill,300.0,M
300972,Captain,302.0,M


In [4]:
# For every (name, gender) pair: number of occurrences and average order
orders_by_name_gender = char_order_df.groupby(['char_words', 'gender'])['order']
groupped_names_df = orders_by_name_gender.agg(['count', 'mean'])

# Most frequent names first
most_frequent_first = groupped_names_df.sort_values(by='count', ascending=False)
display(most_frequent_first)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean
char_words,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
John,M,965,4.065285
Jack,M,643,4.359253
Frank,M,525,4.579048
Mary,F,513,5.019493
Joe,M,476,4.827731
...,...,...,...
Karly,F,1,0.000000
Karma,F,1,11.000000
Karmin,F,1,20.000000
Karna,F,1,15.000000


The issue with plain averages is that a name that appeared in a single movie as the first character gets a near-perfect average, which frequently used names that appear in many movies can hardly beat. So we decided to keep only the top 100 most used names in movies and work on those.

In [5]:
def adjust_means(df):
    """Add an ``adjusted_mean`` column holding the reciprocal of ``mean``.

    Inverting the mean turns "smaller average order" into "larger score".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``mean`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the extra ``adjusted_mean`` column.
    """
    df['adjusted_mean'] = 1 / df['mean']
    return df

def _top_names_by_mean(names_df, gender=None, n=10):
    """Return the ``n`` names with the lowest mean order, ranked from 1.

    Parameters
    ----------
    names_df : pandas.DataFrame
        Frame with columns ``char_words``, ``gender``, ``count`` and ``mean``.
    gender : str, optional
        Keep only rows whose ``gender`` equals this value; ``None`` keeps all rows.
    n : int
        Maximum number of names to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``char_words``, with ``gender``, ``mean``, ``ranking`` and the
        column(s) added by ``adjust_means``.
    """
    if gender is not None:
        names_df = names_df[names_df['gender'] == gender]
    ranked = (
        names_df
        .sort_values(by='mean', ascending=True)
        .head(n)
        .set_index('char_words')
        .drop('count', axis=1)
    )
    # Size the ranking from the rows actually present: a fixed range(1, 11)
    # raises ValueError as soon as fewer than 10 names survive the filter.
    ranked['ranking'] = range(1, len(ranked) + 1)
    # Adjust means by inverting
    return adjust_means(ranked)


# Sort by count and take top 100, with char_words / gender back as plain columns
top_100_char_names_df = (
    groupped_names_df
    .sort_values(by='count', ascending=False)
    .head(100)
    .copy(deep=True)
    .reset_index()
)

# Top 10 by lowest mean order: both genders mixed, then each gender separately
top_10_mix = _top_names_by_mean(top_100_char_names_df)
top_10_male = _top_names_by_mean(top_100_char_names_df, gender='M')
top_10_female = _top_names_by_mean(top_100_char_names_df, gender='F')

display(top_10_mix)
display(top_10_male)
display(top_10_female)


Unnamed: 0_level_0,gender,mean,ranking,adjusted_mean
char_words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Kate,F,3.245283,1,0.30814
Elizabeth,F,3.36715,2,0.296987
Adam,M,3.8375,3,0.260586
Anna,F,3.911197,4,0.255676
Claire,F,3.955056,5,0.252841
Anne,F,3.9625,6,0.252366
Ben,M,4.019417,7,0.248792
William,M,4.023529,8,0.248538
Julie,F,4.05314,9,0.246722
Harry,M,4.059459,10,0.246338


Unnamed: 0_level_0,gender,mean,ranking,adjusted_mean
char_words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adam,M,3.8375,1,0.260586
Ben,M,4.019417,2,0.248792
William,M,4.023529,3,0.248538
Harry,M,4.059459,4,0.246338
John,M,4.065285,5,0.245985
Daniel,M,4.182353,6,0.2391
Michael,M,4.210256,7,0.237515
Smith,M,4.236994,8,0.236016
Max,M,4.302222,9,0.232438
Jack,M,4.359253,10,0.229397


Unnamed: 0_level_0,gender,mean,ranking,adjusted_mean
char_words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Kate,F,3.245283,1,0.30814
Elizabeth,F,3.36715,2,0.296987
Anna,F,3.911197,3,0.255676
Claire,F,3.955056,4,0.252841
Anne,F,3.9625,5,0.252366
Julie,F,4.05314,6,0.246722
Annie,F,4.136646,7,0.241742
Jane,F,4.269231,8,0.234234
Marie,F,4.424242,9,0.226027
Alice,F,4.466387,10,0.223895


In [6]:
title = "Top Characters by Average Importance"


def _ranking_trace(frame, label, colors):
    """Build a horizontal bar trace: ranking on y, mean order on x, names as bar text."""
    return go.Bar(
        y=frame['ranking'],
        x=frame['mean'],
        name=label,
        orientation='h',
        marker_color=colors,
        text=frame.index,
    )


# One trace per view; the combined view colours each bar by that name's gender
mix_colors = ['pink' if g == 'F' else 'blue' for g in top_10_mix['gender']]
trace_combined = _ranking_trace(top_10_mix, 'Combined', mix_colors)
trace_male = _ranking_trace(top_10_male, 'Male', 'blue')
trace_female = _ranking_trace(top_10_female, 'Female', 'pink')

fig = go.Figure(data=[trace_combined, trace_male, trace_female])

# Only the combined trace is visible initially; the buttons switch between traces
for trace, shown in zip(fig.data, (True, False, False)):
    trace.visible = shown

# x-axis bounds are computed over all three views
all_views = (top_10_male, top_10_female, top_10_mix)
min_mean = min(view['mean'].min() for view in all_views)
max_mean = max(view['mean'].max() for view in all_views)

# (button label, title tag), listed in the same order as the traces in the figure
view_specs = [("All", "All"), ("Male", "M"), ("Female", "F")]
view_buttons = [
    dict(
        label=label,
        method="update",
        args=[
            {"visible": [slot == position for slot in range(len(view_specs))]},
            {"title": f'{title} ({tag})'},
        ],
    )
    for position, (label, tag) in enumerate(view_specs)
]

fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="right",
            active=0,  # the 'All' button starts selected
            x=0.57,
            y=1.2,
            buttons=view_buttons,
        )
    ],
    # x range is padded slightly around the observed means
    xaxis=dict(title='Mean Order', range=[min_mean * 0.95, max_mean * 1.01]),
    # category axis, reversed so that rank 1 is drawn at the top
    yaxis=dict(title='Rank', type='category', autorange='reversed'),
    title_text=f"Leading Roles: {title}",
)

fig.show()


In [7]:
# Save Graph A as an HTML page; plotly.js is referenced from the CDN rather than embedded
graph_a_html = 'top_characters_ranking.html'
fig.write_html(graph_a_html, include_plotlyjs="cdn")

### x.2. Graph B
Top 10 character names in movies (by number of occurrences), with a slider that allows selecting a specific order

In [8]:
# Count how often each (name, gender) pair appears at each order
name_counts = (
    char_order_df
    .reset_index()
    .groupby(['order', 'char_words', 'gender'])
    .size()
    .reset_index(name='count')
)

# Keep orders 0..9, then sort by order (ascending) and count (descending)
first_orders = name_counts.loc[name_counts['order'] <= 9]
ranked_counts = first_orders.sort_values(by=['order', 'count'], ascending=[True, False])

# Shift the order by +1 so that it starts from 1
ranked_counts = ranked_counts.assign(order=ranked_counts['order'] + 1)

# Keep the 10 most frequent names for each order, with the order as an integer
names_per_order = ranked_counts.groupby(['order']).head(10)
names_per_order = names_per_order.assign(order=names_per_order['order'].astype(int))
display(names_per_order.set_index(['order', 'char_words']))

Unnamed: 0_level_0,Unnamed: 1_level_0,gender,count
order,char_words,Unnamed: 2_level_1,Unnamed: 3_level_1
1,John,M,300
1,Jack,M,180
1,James,M,124
1,Frank,M,117
1,Tom,M,115
...,...,...,...
10,Sam,M,19
10,Jack,M,17
10,Captain,M,16
10,Joe,M,16


In [9]:
def ordinal_suffix(value):
    """Return ``value`` as a string followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, ...)."""
    # Values whose last two digits fall in 10..20 always take 'th' (11th, 12th, 13th, ...)
    last_two_digits = value % 100
    if 10 <= last_two_digits <= 20:
        return str(value) + 'th'
    # Otherwise the last digit decides; anything other than 1, 2 or 3 takes 'th'
    endings = {1: 'st', 2: 'nd', 3: 'rd'}
    return str(value) + endings.get(value % 10, 'th')

def _order_title(order):
    """Figure title for the view that shows the given order."""
    return f"Most frequent names for a {ordinal_suffix(order)} character"


# Create a figure
fig = go.Figure()

# Find the unique orders
unique_orders = sorted(names_per_order['order'].unique())

# Order whose trace is visible at start. The initial title is derived from it
# instead of a hard-coded 1, so the title and the visible trace cannot disagree.
# Falls back to 1 when there is no data at all.
first_order = unique_orders[0] if unique_orders else 1

# Add a bar trace for each order
for order in unique_orders:
    # Names for this order, most frequent first
    filtered_df = (
        names_per_order[names_per_order['order'] == order]
        .sort_values(by='count', ascending=False)
    )
    fig.add_trace(
        go.Bar(
            x=filtered_df['char_words'],
            y=filtered_df['count'],
            name=f"Order {order}",
            marker_color=filtered_df['gender'].map({'M': 'blue', 'F': 'pink'}),
            visible=(order == first_order)  # Only the first trace is visible
        )
    )

# One slider step per order: show only that order's trace and update the title
steps = []
for order in unique_orders:
    steps.append(
        dict(
            method="update",
            args=[{"visible": [order == o for o in unique_orders]},
                  {"title": _order_title(order)}],
            label=f"{order}"
        )
    )

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Order: "},
    pad={"t": 50},
    steps=steps
)]

# Update the layout
fig.update_layout(
    sliders=sliders,
    title_text=_order_title(first_order),
    xaxis_title="Character Name",
    yaxis_title="Count"
)

# Show the figure
fig.show()


In [10]:
# Save Graph B as an HTML page; plotly.js is referenced from the CDN rather than embedded
graph_b_html = 'top_character_per_order.html'
fig.write_html(graph_b_html, include_plotlyjs="cdn")