In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

# Importing data

We load the CMU movie and character metadata and assign the column names given in the dataset documentation.

In [2]:
# Directory that holds the raw tab-separated metadata files.
data_folder = './data/'

# Column labels are passed explicitly through `names=`; with no `header=` argument
# pandas then treats the first line of each file as data, not as a header.
character_columns = [
    'wiki_id', 'freebase_id', 'dor', 'character_name', 'actor_dob',
    'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_name',
    'actor_age', 'map1', 'map2', 'map3',
]
movie_columns = [
    'wiki_id', 'freebase_id', 'name', 'release_date', 'bor', 'runtime',
    'languages', 'countries', 'genre',
]

character_path = data_folder + 'character.metadata.tsv'
movie_path = data_folder + 'movie.metadata.tsv'

character = pd.read_csv(character_path, sep='\t', names=character_columns)
movie = pd.read_csv(movie_path, sep='\t', names=movie_columns)

# Cleaning data

First, we drop the rows that have missing values in the features we will use.
Since our analysis needs information about both a movie and the characters in it, we then keep only the rows whose movie appears in both the movie and the character dataframes.

In [3]:
# Missing-value handling

# Movies: keep only the rows where every feature used later on is present.
# dropna() returns a new frame, so the raw `movie` frame is left untouched.
required_movie_fields = ['bor', 'release_date', 'runtime', 'countries', 'genre']
cleaned_movie = movie.dropna(subset=required_movie_fields)

# Characters: keep only the rows that have an actor name.
cleaned_character = character.dropna(subset=['actor_name'])

In [4]:
# Keep only the movies that are present in both dataframes.
# Order matters: the characters are filtered first, and the movies are then
# filtered against the characters that survived.

character_has_movie = cleaned_character['wiki_id'].isin(cleaned_movie['wiki_id'])
cleaned_character = cleaned_character.loc[character_has_movie]

movie_has_character = cleaned_movie['wiki_id'].isin(cleaned_character['wiki_id'])
cleaned_movie = cleaned_movie.loc[movie_has_character]

# Preparing movie features

Let's build our own dataframe of features that could affect a film's return multiplier, i.e. the money the film earned relative to the money invested in it.

First, we use features that already exist in the movie metadata: the runtime, the countries and the genres of the movie. We also keep the Wikipedia ID, the name of the movie, the release date and the box office revenue, which we need for our analysis.

In [5]:
# Start the feature table from the movie-level columns we need:
# identifier, title, release date, box office revenue (bor), runtime, countries and genre.
movie_feature_columns = ['wiki_id', 'name', 'release_date', 'bor', 'runtime', 'countries', 'genre']
movie_features = cleaned_movie.loc[:, movie_feature_columns]
movie_features.head()

Unnamed: 0,wiki_id,name,release_date,bor,runtime,countries,genre
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
7,10408933,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ..."
13,171005,Henry V,1989-11-08,10161099.0,137.0,"{""/m/07ssc"": ""United Kingdom""}","{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa..."
17,77856,Mary Poppins,1964-08-27,102272727.0,139.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""..."
21,612710,New Rose Hotel,1999-10-01,21521.0,92.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."


We will also create new features from the character metadata:
- the number of characters per movie
- the average experience of the actors: the mean, over all actors in the movie, of the number of movies each of them appeared in before this one
- the ratio of male actors
- the ratio of young actors (below 18 years old)
- the ratio of old actors (above 60 years old)

In [6]:
# Number of characters per movie, and share of them by age category and by gender

ratio_columns = ['below18', 'above60', 'male']

# Flag every character row once with vectorised comparisons instead of running a
# Python lambda per movie inside agg(); the resulting numbers are the same.
# NOTE(review): comparisons against NaN are False, so characters whose age or
# gender is missing count as "not below 18", "not above 60" and "not male" while
# still being part of the denominator. Confirm this is the intended definition.
character_flags = pd.DataFrame({
    'wiki_id': cleaned_character['wiki_id'],
    'below18': cleaned_character['actor_age'] < 18,
    'above60': cleaned_character['actor_age'] > 60,
    'male': cleaned_character['actor_gender'] == 'M',
})
flags_by_movie = character_flags.groupby('wiki_id')

# Total number of character rows per movie (the denominator of every ratio).
nb_characters = flags_by_movie.size()

# Summing the boolean flags counts the matching characters; dividing by the
# per-movie total (aligned on wiki_id) turns the counts into ratios.
counts = flags_by_movie[ratio_columns].sum().div(nb_characters, axis=0)
counts.insert(0, 'nb_characters', nb_characters)
counts = counts.reset_index()

# Add the new columns to our movie_features dataframe.
# Columns left by a previous run of this cell are dropped first, so that re-running
# the cell does not create suffixed duplicates (below18_x, below18_y, ...).
movie_features = pd.merge(
    movie_features.drop(columns=['nb_characters'] + ratio_columns, errors='ignore'),
    counts,
    on='wiki_id',
    how='left',
)
movie_features.head()

Unnamed: 0,wiki_id,name,release_date,bor,runtime,countries,genre,nb_characters,below18,above60,male
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",17,0.0,0.058824,0.647059
1,10408933,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ...",4,0.0,0.0,0.5
2,171005,Henry V,1989-11-08,10161099.0,137.0,"{""/m/07ssc"": ""United Kingdom""}","{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa...",21,0.047619,0.142857,0.809524
3,77856,Mary Poppins,1964-08-27,102272727.0,139.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",13,0.153846,0.076923,0.538462
4,612710,New Rose Hotel,1999-10-01,21521.0,92.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",8,0.0,0.0,0.5


In [7]:
# Actors' experience

# Sort by the date of release (dor) within each actor group.
# NOTE(review): 'dor' is loaded without date parsing, so this ordering relies on
# the raw values sorting chronologically — confirm for partial dates.
# NOTE(review): actors are identified by name only, so two different actors sharing
# a name are treated as one person; a unique actor id would be safer if available.
sorted_character = cleaned_character.sort_values(['actor_name', 'dor'])

# An actor may have several rows for the same movie (one per character played).
# Only the first of those rows is a new appearance: counting every row, as a plain
# cumcount() does, would treat the extra credits as previous movies.
is_first_credit = ~sorted_character.duplicated(subset=['actor_name', 'wiki_id'])

# Number of distinct movies the actor appeared in before the current one:
# running count of first credits for that actor, minus the current movie itself.
sorted_character['cumulative_movie_count'] = (
    is_first_credit.astype(int).groupby(sorted_character['actor_name']).cumsum() - 1
)

# Average the experience over the distinct actors of each movie (wiki_id), so an
# actor with several credits in the same film is weighted once.
experience_per_movie = (
    sorted_character.loc[is_first_credit]
    .groupby('wiki_id')['cumulative_movie_count']
    .mean()
    .rename('average_actors_experience')
    .reset_index()
)

# Merge with the 'movie_features' dataframe.
# A column left by a previous run of this cell is dropped first, so that re-running
# the cell does not create suffixed duplicates.
movie_features = pd.merge(
    movie_features.drop(columns=['average_actors_experience'], errors='ignore'),
    experience_per_movie,
    on='wiki_id',
    how='left',
)
movie_features.head()


Unnamed: 0,wiki_id,name,release_date,bor,runtime,countries,genre,nb_characters,below18,above60,male,average_actors_experience
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",17,0.0,0.058824,0.647059,4.647059
1,10408933,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ...",4,0.0,0.0,0.5,0.25
2,171005,Henry V,1989-11-08,10161099.0,137.0,"{""/m/07ssc"": ""United Kingdom""}","{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa...",21,0.047619,0.142857,0.809524,1.47619
3,77856,Mary Poppins,1964-08-27,102272727.0,139.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",13,0.153846,0.076923,0.538462,2.230769
4,612710,New Rose Hotel,1999-10-01,21521.0,92.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",8,0.0,0.0,0.5,9.125
