# Cast
- This notebook investigates the cast data of the CMU Movie Summary Corpus dataset.
- It runs some initial analyses to see how the cast and individual movie actors affect box office revenue.

**Summary**

- By including all actors that have played in more than 15 movies, we get $R^2$=xx
- 

**Contents of Notebook**

-

In [1]:
# imports
import json
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

# Disable pandas' chained-assignment check for the whole notebook.
pd.set_option("mode.chained_assignment", None)

In [2]:
# Locations of the input files, all inside the local data folder.
data_folder = './data/'
MOVIE_PATH = f'{data_folder}movie.metadata.tsv'
CHARACTER_PATH = f'{data_folder}character.metadata.tsv'
RATING_PATH = f'{data_folder}title.ratings.tsv'

# Data Processing

### Loading data

In [4]:
# create dataframes

# Column names for the two header-less TSV files, listed in file order.
# Both are plain lists because later cells slice and extend them.
colnames_movies = """
    wikipedia_movie_ID
    freebase_movie_ID
    name
    release_date
    box_office_revenue
    runtime
    languages
    countries
    genres
""".split()

colnames_character = """
    wikipedia_movie_ID
    freebase_movie_ID
    last_update
    character_name
    actor_DOB
    actor_gender
    actor_height
    actor_ethnicity
    actor_name
    actor_age_at_movie_release
    freebase_character/actor_map_ID
    freebase_character_ID
    freebase_actor_ID
""".split()


# Read both tab-separated files. Neither has a header row, so the column
# names are supplied explicitly.
tsv_options = {"sep": "\t", "header": None}
movies = pd.read_csv(MOVIE_PATH, names=colnames_movies, **tsv_options)
characters = pd.read_csv(CHARACTER_PATH, names=colnames_character, **tsv_options)

In [None]:
# Keep only movies released in 2000 or later. release_date is compared
# against the string '2000', i.e. lexicographically.
released_2000_or_later = movies['release_date'] >= '2000'
movies = movies[released_2000_or_later]
movies.shape

In [None]:
# Peek at the first movie row.
movies.head(n=1)

In [None]:
# Peek at the first character row.
characters.head(n=1)

In [None]:
# Attach every character row to its movie (default inner join on the
# Wikipedia movie ID). freebase_movie_ID exists in both frames, so pandas
# suffixes the two copies with _x (movies) and _y (characters).
movies_characters = movies.merge(characters, on='wikipedia_movie_ID')
movies_characters.head(n=1)

In [None]:
# Number of missing values in each column of the merged frame.
movies_characters.isnull().sum()

In [None]:
# Drop the rows (one per character) that have no freebase_actor_ID.
has_actor_id = movies_characters['freebase_actor_ID'].notna()
movies_characters = movies_characters[has_actor_id]
movies_characters.shape

In [None]:
# Count the distinct movies left after the filtering above.
movie_ids = movies_characters["wikipedia_movie_ID"]
num_movies = movie_ids.nunique()
num_movies

In [None]:
# Count the distinct actor IDs in the merged frame.
unique_actor_ids = movies_characters["freebase_actor_ID"].dropna().unique()
num_actors_unique = len(unique_actor_ids)
num_actors_unique

In [None]:
# Total number of rows with an actor ID: an actor is counted once for every
# row they appear in, so actors in several movies are counted several times.
actor_ids = movies_characters["freebase_actor_ID"]
count_actors = actor_ids.count()
count_actors

In [None]:
# Average cast size: actor rows divided by distinct movies.
actors_per_movie = count_actors / num_movies
actors_per_movie

In [None]:
# One-hot encode the actors: freebase_actor_ID is replaced by one indicator
# column per actor ID, while all other columns are carried along unchanged.
movies_characters_dummy = pd.get_dummies(
    movies_characters,
    columns=['freebase_actor_ID'],
)

In [None]:
# Peek at the first row of the one-hot encoded frame.
movies_characters_dummy.head(n=1)

In [None]:
# Only include wiki_id and one hot encoding of actors in dataframe.
# Every metadata column is dropped except the movie ID, which the per-movie
# grouping in a later cell needs. It is renamed to 'wiki_id', the name the
# following cells use.
MOVIE_ID_COLUMN = 'wikipedia_movie_ID'
del_columns = [
    col for col in colnames_movies + colnames_character if col != MOVIE_ID_COLUMN
]
# freebase_movie_ID is present in both merged frames, so the merge stored it
# under suffixed names; drop those (string) copies as well.
del_columns.extend(['freebase_movie_ID_x', 'freebase_movie_ID_y'])
# The parentheses allow the method chain to span several lines.
dummy_actor_columns = (
    movies_characters_dummy
    .loc[:, ~movies_characters_dummy.columns.isin(del_columns)]
    .rename(columns={MOVIE_ID_COLUMN: 'wiki_id'})
)
dummy_actor_columns.shape

In [None]:
# Only include actor columns for actors that occur in more than
# MIN_ACTOR_APPEARANCES rows. Rows are still one per character here, so an
# actor with two characters in the same movie is counted twice.
# TODO: Decide on threshold
MIN_ACTOR_APPEARANCES = 15

# Only the one-hot actor columns are counted; any other column (the movie ID)
# is kept regardless of its values.
is_actor_column = dummy_actor_columns.columns.str.startswith('freebase_actor_ID_')
appearances = dummy_actor_columns.loc[:, is_actor_column].sum(axis=0)
frequent_actors = appearances.index[appearances > MIN_ACTOR_APPEARANCES]
keep_column = ~is_actor_column | dummy_actor_columns.columns.isin(frequent_actors)
dummy_actor_columns = dummy_actor_columns.loc[:, keep_column]

In [None]:
# Grouping movies such that every movie corresponds to only one row in the dataframe.
# max (rather than sum) keeps every actor column a 0/1 indicator even when an
# actor has several character rows in the same movie; a count of 2 would turn
# into a separate category level once the column is wrapped in C(...) for the
# regression formula.
dummy_actor_columns = dummy_actor_columns.groupby('wiki_id').max().astype(int)

In [None]:
# Display the grouped actor-column frame.
dummy_actor_columns

In [None]:
# Reduce the merged frame to one row per movie and switch to the short column
# names (wiki_id, length, genre, ...) that the following cells refer to.
# After the merge on wikipedia_movie_ID the movie table's Freebase ID carries
# the _x suffix, because the character table has a column of the same name.
freebase_id_column = (
    'freebase_movie_ID_x'
    if 'freebase_movie_ID_x' in movies_characters.columns
    else 'freebase_movie_ID'
)
movie_column_names = {
    'wikipedia_movie_ID': 'wiki_id',
    freebase_id_column: 'freebase_id',
    'name': 'name',
    'release_date': 'release_date',  # selected once; a repeated label would duplicate the column
    'box_office_revenue': 'box_office_revenue',
    'runtime': 'length',
    'languages': 'language',
    'countries': 'country',
    'genres': 'genre',
}
# The movie-level values repeat on every character row of a movie, so
# drop_duplicates collapses them.
movies_characters = (
    movies_characters[list(movie_column_names)]
    .rename(columns=movie_column_names)
    .drop_duplicates()
)

In [None]:
# Join the actor columns onto the movie rows: wiki_id on the left is matched
# against the index of dummy_actor_columns on the right.
movies_binary_actors = movies_characters.merge(
    dummy_actor_columns,
    left_on='wiki_id',
    right_index=True,
)

movies_binary_actors

In [None]:
# Strip '/' from the column names of both frames to avoid an error when the
# names are used in the regression formula.
for frame in (movies_binary_actors, dummy_actor_columns):
    frame.columns = frame.columns.str.replace('/', '')

In [None]:
# One Hot Encoding of Genres
def parse_genres(raw_genres):
    """Return the values stored in one raw ``genre`` cell as a list.

    The cell is expected to hold a JSON object serialised as a string; its
    values are taken to be the genre names (the keys look like genre IDs --
    TODO confirm against the dataset description). A missing cell yields an
    empty list.
    """
    if pd.isna(raw_genres):
        return []
    return list(json.loads(raw_genres).values())


# Parse every movie and keep the result, rather than overwriting one dict per
# iteration and printing only the entries of the last movie.
movie_genres = movies_characters['genre'].map(parse_genres)
movie_genres.head()

In [None]:
# Making a dataframe with genre as columns
# Movie has 1 or 0 in genre columns depending on whether the movie includes the corresponding genre
# Each 'genre' cell is a JSON object string holding all genres of the movie,
# so it is parsed and exploded to one genre per row before encoding; calling
# get_dummies on the raw string would create one column per distinct genre
# combination. The object's values are used as genre names (TODO confirm).
genre_names = (
    movies_characters['genre']
    .dropna()
    .map(lambda raw: list(json.loads(raw).values()))
    .explode()
)
binary_genre_df = (
    pd.get_dummies(genre_names)
    .groupby(level=0)  # back to one row per movie (index label)
    .max()
    # movies whose genre cell was missing get an all-zero row
    .reindex(movies_characters.index, fill_value=False)
    .astype(int)
)
binary_genre_df.sum()

In [None]:
# Build the regression formula: revenue explained by one categorical term per
# actor column. Continuous variables such as length can be added later.
actor_terms = ''.join('C(' + name + ')+' for name in dummy_actor_columns.columns)

# Every term ends in '+', so the final character is cut off.
formula = ('box_office_revenue ~ ' + actor_terms)[:-1]

In [None]:
# Standardise the continuous predictor: subtract the mean of 'length' and
# divide by its standard deviation.
raw_length = movies_binary_actors['length']
movies_binary_actors['length'] = (raw_length - raw_length.mean()) / raw_length.std()

In [None]:
# Fit an ordinary least squares model for `formula` on the movie/actor frame
# and keep the model, the fitted result and its summary.
mod = smf.ols(formula, data=movies_binary_actors)
res = mod.fit()
res_summary = res.summary()

In [None]:
# tables is a list; the table at index 1 is the "core" coefficient table.
res_as_html = res_summary.tables[1].as_html()
# Wrap the HTML in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas. read_html returns a list of frames, so take
# index 0.
summary_df = pd.read_html(StringIO(res_as_html), header=0, index_col=0)[0]
summary_df

In [None]:
# Keep only the statistically significant rows, i.e. those whose 'P>|t|'
# value is below 0.05.
is_significant = summary_df['P>|t|'].lt(0.05)
summary_df = summary_df.loc[is_significant]

In [None]:
# Show the remaining rows ordered from the largest to the smallest coefficient.
summary_df.sort_values('coef', ascending=False)