In [None]:
#load the statistical libraries
import ast
import math

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from statsmodels.stats import diagnostic

In [None]:
# Load the movie metadata table and preview its first rows
movie_metadata_path = "./data/movie.metadata.csv"
df_movie = pd.read_csv(movie_metadata_path)
df_movie.head()

In [None]:
def map_dict_to_list(x):
    """Return the values of a dict that is stored as a literal string.

    Parameters
    ----------
    x : str or list
        Text of a dict literal, e.g. '{"key": "value"}'. A list is returned
        unchanged, so re-running this cell on already-converted columns is
        harmless.

    Returns
    -------
    list
        The values of the parsed dict, in the dict's own order.
    """
    if isinstance(x, list):
        return x
    # literal_eval only accepts Python literals, so text read from the CSV
    # cannot execute arbitrary code the way eval() would.
    return list(ast.literal_eval(x).values())


# Convert the dict-as-text columns into plain lists of their values
for list_column in ('language', 'countries', 'genre'):
    df_movie[list_column] = df_movie[list_column].apply(map_dict_to_list)
df_movie.head()

In [None]:
# Load the character metadata table and preview its first rows
character_metadata_path = "./data/character.metadata.csv"
df_char = pd.read_csv(character_metadata_path)
df_char.head()

In [None]:
## Find which actors bring in the highest revenue
# Keep the movie identifiers, the title, the box office revenue and the
# language / countries / genre columns, then discard every movie that has no
# recorded box office revenue.
revenue_columns = [
    'Wiki Movie ID',
    'Freebase Movie ID',
    'Movie name',
    'Movie box office revenue',
    'language',
    'countries',
    'genre',
]
df_movie_revenue = df_movie[revenue_columns].dropna(subset=['Movie box office revenue'])
df_movie_revenue.head()

In [None]:
# Keep the movie identifiers together with the character name and the actor's
# name, age at movie release and gender
character_columns = [
    'Wiki Movie ID',
    'Freebase Movie ID',
    'Character Name',
    'Actor Name',
    'Actor age at movie release',
    'Gender',
]
df_char_revenue = df_char[character_columns]
df_char_revenue.head()

In [None]:
# Total box office revenue of the movies each actor appears in
# (only movies with a recorded revenue are considered).

# Attach the movie revenue to every character row; the inner join drops
# character rows whose movie is not in df_movie_revenue
df_merged = df_char_revenue.merge(df_movie_revenue, on=['Freebase Movie ID'], how='inner')

# Per actor: summed revenue and number of contributing rows
# NOTE(review): df_merged is built from character rows, so an actor credited
# with several characters in one movie presumably has that movie's revenue
# counted more than once - confirm whether that is intended.
actor_totalRevenue = df_merged.groupby(['Actor Name'])['Movie box office revenue'].agg(['sum', 'count']).reset_index()
actor_totalRevenue.columns = ['Actor Name', 'Movie box office revenue', 'Actor Count']

# Rank the actors from highest to lowest total revenue
actor_revenue_sort = actor_totalRevenue.sort_values(by='Movie box office revenue', ascending=False)

actor_top10 = actor_revenue_sort.head(10)

# One gender / language value per actor (the first one recorded), so the left
# joins below keep exactly one row per actor
actor_gender = df_char_revenue[['Actor Name', 'Gender']].drop_duplicates(subset=['Actor Name'])
actor_language = df_merged[['Actor Name', 'language']].drop_duplicates(subset=['Actor Name'])

# Chain both joins so the result carries Gender as well as language
top10_actor_data = (
    actor_top10
    .merge(actor_gender, on='Actor Name', how='left')
    .merge(actor_language, on='Actor Name', how='left')
)
top10_actor_data.head()

In [None]:
# Plot the total revenue of the top 10 actors
actor_names = top10_actor_data['Actor Name']
revenues_total = top10_actor_data['Movie box office revenue']

# Actors are unordered categories, so draw one bar per actor instead of a
# line, which would suggest a trend from one actor to the next
plt.figure(figsize=(12, 6))
plt.bar(actor_names, revenues_total)
plt.xlabel('Actor Names')
plt.ylabel('Total Box Office Revenue')
plt.title('Top 10 Actors by Box Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

plt.tight_layout()
plt.show()

# This chart ranks the actors according to the box office revenue of the movies they acted in.

In [None]:
###### Study the main lead actors obtaining the highest revenue - extracted from plot summaries
# Character names extracted from the plot summaries; they are treated here as main characters.
def _parse_list_cell(cell):
    """Turn the text form of a list of strings, e.g. "['a', 'b']", back into a list."""
    items = cell.strip("[]").replace("'", "")
    # "[]" strips down to an empty string: return an empty list, not ['']
    return items.split(", ") if items else []


characters = pd.read_csv(
    'data/character_attributes_lemmatized.csv',
    index_col=0,
    # list-valued columns are stored as text in the CSV and must be parsed back
    converters={column: _parse_list_cell for column in ("adj", "active", "patient")},
)

characters_only = characters['character']
characters_only.head()

In [None]:
# Character / actor pairs together with the revenue of the movie they appear in.
# df_merged already holds both 'Character Name' and 'Actor Name' on every row,
# so no additional join is needed to bring the two together.
df_charac_actor = df_merged[['Character Name','Actor Name','Movie box office revenue']]

# Drop rows that do not have a character name
df_charac_actor = df_charac_actor.dropna(subset=['Character Name'])
df_charac_actor

In [None]:
# Keep only the rows whose character name appears in the main-character list
# NOTE(review): the match is on character name alone, so a character with the
# same name in a different movie is kept as well - confirm this is acceptable.
main_charac = df_charac_actor[df_charac_actor['Character Name'].isin(characters_only)]

# Per actor: sum the box office revenue and count the appearances,
# using the main-character rows only
main_charac_mergedBOR = main_charac.groupby(['Actor Name'])['Movie box office revenue'].agg(['sum', 'count']).reset_index()

# Rank from highest to lowest total revenue and keep the first 20
main_charac_sort = main_charac_mergedBOR.sort_values(by='sum', ascending=False)

main_charac_top20 = main_charac_sort.head(20)
main_charac_top20

In [None]:
# Plot the total revenue of the top 20 main-character actors
main_charac_top20_names = main_charac_top20['Actor Name']
main_charac_revenue = main_charac_top20['sum']

# One bar per actor: the x-axis holds unordered categories, not a sequence
plt.figure(figsize=(12, 6))
plt.bar(main_charac_top20_names, main_charac_revenue)
plt.xlabel('Actor Names')
plt.ylabel('Box Office Revenue')
# The plotted column is the per-actor sum, so the title says "Total"
plt.title('Top 20 Main-Character Actors by Total Box Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

# Keep the rotated tick labels inside the figure
plt.tight_layout()
plt.show()

In [None]:
# Gender data
# Attach each actor's gender to their total revenue for the box plot analysis.
# The lookup is reduced to one row per actor first (the first gender recorded),
# so the left join returns exactly one row per actor.
actor_gender_lookup = df_char_revenue[['Actor Name', 'Gender']].drop_duplicates(subset=['Actor Name'])
gender_actor_data = actor_revenue_sort.merge(actor_gender_lookup, on='Actor Name', how='left')

# Box plot of revenue by gender, restricted to rows labelled 'M' or 'F'
plt.figure(figsize=(10, 7.5))
gender_boxplot = sns.boxplot(
    x="Gender",
    y="Movie box office revenue",
    data=gender_actor_data.loc[gender_actor_data['Gender'].isin(['M','F'])],
)

# Set labels
plt.ylabel("Movie box office revenue($)")
plt.title("Box office revenue between genders")
# Cap the y-axis at 500 million; points above the cap are not drawn
plt.ylim(0, 500000000)

# Show plot
plt.tight_layout()
plt.show()

# Given the variance, the 25th percentile, the 75th percentile and the median, male actors drive higher box office revenue.

In [None]:
#-----------language------------
# Study the languages used in the top 100 films / which language drives the highest revenue?
# Rank every movie by box office revenue, highest first, and keep the first 100
film_sort = df_movie_revenue.sort_values('Movie box office revenue', ascending=False)
top100film = film_sort.iloc[:100]
top100film.head()

In [None]:
# Take the top 100 earning movies and count the languages used in them
# Give every language of a movie its own row
df_split_languages = top100film.explode('language')

# Count the occurrences of each language
language_counts = df_split_languages['language'].value_counts()
df_language_counts = pd.DataFrame({'Language': language_counts.index, 'Count': language_counts.values})

# Plot the counts on a log scale so rarely used languages remain visible
df_language_counts.plot(kind='bar', x='Language', y='Count', figsize=(12, 6),log=True)
plt.title('Language Counts in the Top 100 Movies by Box Office Revenue')
plt.xlabel('Language')
plt.ylabel('Count (logscale)')
plt.xticks(rotation=45, ha='right')

# Keep the rotated tick labels inside the figure
plt.tight_layout()
plt.show()

In [None]:
# Study the types of movies that earn the highest revenue
# Give every genre of a movie its own row
df_split_genre = df_movie_revenue.explode('genre')[['Movie name', 'Movie box office revenue', 'genre']]

# Sum the box office revenue per genre
# (a movie with several genres contributes its revenue to each of them)
genre_totalRevenue = df_split_genre.groupby('genre')['Movie box office revenue'].sum().reset_index()

# Sort the revenue in descending order and keep the 10 highest genres
genre_totalRevenue = genre_totalRevenue.sort_values(by='Movie box office revenue', ascending=False)
genre_totalRevenue_top10 = genre_totalRevenue.head(10)

genre = genre_totalRevenue_top10['genre']
genreRevenue = genre_totalRevenue_top10['Movie box office revenue']

# Plot one bar per genre: genres are unordered categories, not a sequence
plt.figure(figsize=(12, 6))
plt.bar(genre, genreRevenue)
plt.xlabel('Genre')
plt.ylabel('Box office revenue')
plt.title('Top 10 Genre by Box Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the genre names for better readability

plt.tight_layout()
plt.show()

In [None]:
# Can older actors still bring in good revenue at the cinema?
# Look into how age affects the roles actors play, and how much revenue they are able to bring in.
# Account for inflation across countries?