# Build a character importance score

The idea is to gather data from the main characters of a movie, to then understand how important the character is, and therefore how important the actor is. This way we can build an "importancy metric" to display how women position according to it, and try to extract meaningful conclusions from it.

In [52]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import glob

# Get data from characters

Use character.metadata.tsv and collect the interesting data to then build a metric for each movie:
- Number of characters data
- Proportion of women in this set
- ImdB rating
- Number of votes


In [53]:
# Get character data from Data/MovieSummaries/character.metadata.tsv
character_data = pd.read_csv('Data/MovieSummaries/character.metadata.tsv', sep='\t', header=None)
character_data.columns = ['wiki_movie_ID','freebase_movie_ID','movie_release_date',\
    'character_name','actor_DOB','actor_gender','actor_height','actor_ethnicity',\
        'actor_name','actor_age_movie','freebase_char/actor_map_ID','freebase_char_ID','freebase_actor_ID']

# Drop all characters without a actor_gender
character_data = character_data.drop(character_data[character_data.actor_gender == '\\N'].index)

In [62]:
# Get all movie that appear in character_data variable
movie_list = character_data['wiki_movie_ID'].unique()

# Remove empty values from movie_list
movie_list = movie_list[~pd.isnull(movie_list)]

# Create a dataframe with all movie ID
movie_character = pd.DataFrame({'wiki_movie_ID': movie_list})

# Add a column "freebase_movie_id" with the corresponding freebase ID
movie_character['freebase_movie_ID'] = movie_character['wiki_movie_ID'].map(dict(zip(character_data['wiki_movie_ID'],\
     character_data['freebase_movie_ID'])))

# Add a column "freebase_actor_id" with the corresponding freebase ID
movie_character['freebase_actor_ID'] = movie_character['wiki_movie_ID'].map(dict(zip(character_data['wiki_movie_ID'],\
        character_data['freebase_actor_ID'])))
movie_character['freebase_actor_ID'] = movie_character['freebase_actor_ID'].astype(str)

# Get number of character per movie
character_per_movie = character_data.groupby('wiki_movie_ID').size().reset_index(name='counts')

# Get number of character played by a female actor per movie
character_f_per_movie = character_data[character_data['actor_gender'] == 'F'].groupby('wiki_movie_ID').size().reset_index(name='counts')

# Build dataframe with movie ID and columns for the number of character, the number of character played by a woman
movie_character = pd.merge(movie_character, character_per_movie, on='wiki_movie_ID', how='left')
movie_character = pd.merge(movie_character, character_f_per_movie, on='wiki_movie_ID', how='left')

# Change column name to nb_character and nb_f_character
movie_character = movie_character.rename(columns={'counts_x': 'nb_character', 'counts_y': 'nb_f_character'})

# Create new column with the ratio of female actor in percentage
movie_character['F_actor_ratio'] = movie_character['nb_f_character'] / movie_character['nb_character']

# Keep only movie with at least 5 character
movie_character = movie_character[movie_character['nb_character'] >= 5]

# fill NaN with 0
movie_character = movie_character.fillna(0)

# Convert nb_f_character to int
movie_character['nb_f_character'] = movie_character['nb_f_character'].astype(int)



In [63]:
# Open Data/Imdb/processed_imdb_dataset.csv as a dataframe
imdb_data = pd.read_csv('Data/Imdb/processed_imdb_dataset.csv')

# From imdb_data, add to movie_character the columns averageRating and numVotes
movie_character = pd.merge(movie_character, imdb_data[['freebase_movie_ID','averageRating','numVotes']], on='freebase_movie_ID', how='left')

# Drop all rows with NaN
movie_character = movie_character.dropna()

# Re-index movie_character
movie_character = movie_character.reset_index(drop=True)

movie_character

Unnamed: 0,wiki_movie_ID,freebase_movie_ID,freebase_actor_ID,nb_character,nb_f_character,F_actor_ratio,averageRating,numVotes
0,975900,/m/03vyhn,/m/03wc_yl,17,6,0.352941,4.9,55235.0
1,2238856,/m/06yc6v,/m/0dm38rk,15,8,0.533333,7.2,36604.0
2,24229100,/m/07kjkz6,/m/03ccs9p,5,2,0.400000,6.3,3811.0
3,156558,/m/014k4y,/m/05cgxx,37,12,0.324324,6.4,14981.0
4,9633533,/m/02pml15,/m/02hkw6,9,2,0.222222,7.9,1197.0
...,...,...,...,...,...,...,...,...
26362,21772949,/m/05mspcd,/m/02gl9l,10,3,0.300000,6.0,518.0
26363,24997872,/m/09g6klx,/m/0288crq,7,1,0.142857,5.0,1285.0
26364,2828945,/m/085bgh,/m/0lcrrbv,9,3,0.333333,6.2,2645.0
26365,22545667,/m/05zrrsp,/m/0gc2b6f,11,5,0.454545,6.0,68.0


# Elements to build a score

Get all the movies for which we have character metadata
For each movie get:
- Movie ID
- number of characters
- Ratio of female actors
- Number of movies done by each actors

$$ score(actor) =\#movie(actor) * \sum\limits_{movie \in career(actor)} IMdB_{vote}(movie)  $$

# Career building

It can be assumed that an actor who has been in multiple films has had a more successful career than one who has only been in one, and therefore has a higher importancy score.

For each actor build its score, then show the distribution of the score overall, for men, and for women


In [85]:
# Get number of movies in which the actor played
movie_by_character=character_data.groupby(character_data['freebase_actor_ID'])[['wiki_movie_ID']].count().add_suffix("_count")

# Add actor_gender column
movie_by_character['actor_gender'] = movie_by_character.index.map(dict(zip(character_data['freebase_actor_ID'],\
        character_data['actor_gender'])))
movie_by_character = movie_by_character.reset_index()

# Drop all entries with actor_gender is NaN
movie_by_character = movie_by_character.drop(movie_by_character[movie_by_character.actor_gender.isnull()].index)

movie_by_character['sumVotes'] = movie_by_character['freebase_actor_ID'].map(dict(zip(movie_character['freebase_actor_ID'],\
        movie_character['numVotes'])))

movie_by_character['sumVotes'] = movie_by_character['sumVotes'].fillna(0)

movie_by_character['sumVotes'] = movie_by_character['sumVotes'].astype(int)

movie_by_character.sort_values(by=['sumVotes'], ascending=False)

movie_by_character

Unnamed: 0,freebase_actor_ID,wiki_movie_ID_count,actor_gender,sumVotes
0,/m/010p3,12,M,3435
1,/m/010q36,1,M,0
2,/m/010wx,1,F,0
3,/m/010xjr,54,M,113028
4,/m/0112yl,48,M,9492
...,...,...,...,...
135755,/m/0z58,1,M,0
135756,/m/0z9q0,1,M,0
135757,/m/0zcb7,6,M,0
135758,/m/0zcbl,54,M,37709
