In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

# Loading the data

For the pipeline that extracts character names and linguistic features, please refer to `extract_character_attributes.ipynb`. For the clustering pipeline and the comparison of the different clustering methods, refer to `clustering.ipynb`.

In [None]:
def _parse_list_cell(cell):
    # List columns are stored in the CSV as a bracketed, quoted, comma-separated
    # string; strip the brackets and quotes and split it back into a list of strings.
    return cell.strip("[]").replace("'","").split(", ")


characters = pd.read_csv(
    'data/character_clusters.csv',
    index_col=0,
    # The three word-list columns all need the same parsing
    converters={column: _parse_list_cell for column in ("adj", "active", "patient")},
)
characters.head()

In [None]:
# The TSV is read with explicit column names (none are taken from the file)
MOVIE_COLUMNS = [
    'wiki_id', 'freebase_id', 'title', 'release_date', 'revenue',
    'runtime', 'languages', 'countries', 'genres',
]
movies = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t', names=MOVIE_COLUMNS)
movies.head()

In [None]:
# Column names for the character metadata TSV. The dotted names ('.', '..', ...)
# are placeholders for columns this notebook does not use.
CHARACTER_COLUMNS = [
    'wiki_id', 'freebase_id', 'release_date', 'character', 'date_of_birth',
    'sex', 'height', '.', 'actor', 'age', 'character_map', '..', '...', '....',
]
df_charac = pd.read_csv('data/MovieSummaries/character.metadata.tsv', sep='\t', names=CHARACTER_COLUMNS)
df_charac.head()

In [None]:
# Attach movie metadata to every clustered character, then discard any row
# that has a missing value in any column.
characters_with_movie_info = pd.merge(characters, movies, how='left', on='wiki_id')
characters_and_movies = characters_with_movie_info.dropna()
characters_and_movies.head()

In [None]:
# Sanity check: clusters assigned to the characters of movies whose title contains "Batman"
is_batman_title = characters_and_movies['title'].str.contains("Batman")
characters_and_movies.loc[is_batman_title, ['title', 'character', 'cluster']]

# Initial analysis of the actors' success

In [None]:
def map_dict_to_list(x):
    """Return the values of a dict literal stored as text, as a list.

    `x` is the raw cell content: the text of a dict literal. It is parsed with
    `ast.literal_eval`, which only accepts Python literals, instead of `eval`,
    which would execute arbitrary code found in the data file.
    A value that is already a list is returned unchanged so that re-running
    this cell does not fail on the already-converted columns.
    """
    if isinstance(x, list):
        return x
    return list(ast.literal_eval(x).values())


# Convert the dict-like text columns into plain lists of names
for column in ('languages', 'countries', 'genres'):
    movies[column] = movies[column].apply(map_dict_to_list)
movies.head()

Determine which actors bring in the most money


In [None]:
# Join every cast entry with its movie and keep only movies that report a box office revenue
df_merged = (
    df_charac
    .merge(movies, on=['wiki_id'], how='inner')
    .dropna(subset=['revenue'])
)

# Per actor: summed revenue of their cast entries and the number of such entries
actor_totalRevenue = (
    df_merged
    .groupby(['actor'])['revenue']
    .agg(['sum', 'count'])
    .reset_index()
)
actor_totalRevenue.columns = ['actor', 'bo_revenue', 'Actor Count']

# Rank actors from highest to lowest total revenue and keep the first 20
actor_revenue_sort = actor_totalRevenue.sort_values(by='bo_revenue', ascending=False)
actor_top20 = actor_revenue_sort.head(20)

# Bring back languages and sex for those actors; the join yields one row per
# cast entry, so only the first row of each actor is kept
top20_actor_data = (
    actor_top20
    .merge(df_merged[['actor', 'languages','sex']], on='actor', how='left')
    .drop_duplicates(subset = ['actor'])
)
top20_actor_data.head()

In [None]:
# Plot the total box office revenue of the 20 highest-grossing actors
actor_names = top20_actor_data['actor']
revenues_total = top20_actor_data['bo_revenue']

plt.figure(figsize=(12, 6))
plt.plot(actor_names, revenues_total)
plt.xlabel('Actor Names')
plt.ylabel('Total Box Office Revenue')
# top20_actor_data holds 20 actors, so the title says 20 (it used to say 10)
plt.title('Top 20 Actors by Box Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

plt.tight_layout()
plt.show()



In the data above, we are ranking the actors according to the box office revenue of the movies they acted in.

Study the main lead actors obtaining the highest revenue

In [None]:
def calculater_importance(x):
    """Count the purely alphabetic words attached to a character.

    `x` is indexed with the keys "adj", "active" and "patient"; each entry is
    an iterable of strings. Every string for which `str.isalpha()` is true
    adds one to the score, so empty strings and tokens containing digits or
    punctuation are not counted.

    Returns the total count as an int.
    """
    return sum(
        word.isalpha()
        for field in ("adj", "active", "patient")
        for word in x[field]
    )

# Score every character by the number of alphabetic words attached to it
characters['importance'] = characters.apply(calculater_importance, axis=1)

# Take an independent copy of the three columns: new columns are added to this
# frame later, and assigning into a plain slice of `characters` raises
# SettingWithCopyWarning.
characters_with_importance = characters[['wiki_id', 'character', 'importance']].copy()
characters_with_importance.head()

In [None]:
# Total importance of all characters belonging to the same movie, aligned row by row
movie_importance_total = characters_with_importance.groupby('wiki_id')['importance'].transform('sum')

# Fraction of the movie's total importance held by each character
characters_with_importance['importance_share'] = characters_with_importance['importance'].div(movie_importance_total)

# A character counts as important when it holds at least 20% of its movie's importance
characters_with_importance['is_important'] = characters_with_importance['importance_share'].ge(0.2)
characters_with_importance.sample(10)

In [None]:
df_charac_actor = df_merged[['wiki_id', 'character','actor','revenue']]

# Attach the importance columns to each cast entry, matching on movie and character name
character_actor = pd.merge(
    df_charac_actor,
    characters_with_importance,
    on=['wiki_id', 'character'],
    how='left',
)

# Keep only the cast entries flagged as important; entries without a match
# have a missing flag after the left join and are excluded by the comparison
flagged_important = character_actor['is_important'] == True
main_character = character_actor[flagged_important]

# Per actor: summed revenue and number of main-character entries
main_charac_mergedBOR = (
    main_character
    .groupby(['actor'])['revenue']
    .agg(['sum', 'count'])
    .reset_index()
)

# Rank by summed revenue, highest first, and keep the first 20
main_charac_sort = main_charac_mergedBOR.sort_values(by='sum', ascending=False)
main_charac_top20 = main_charac_sort.head(20)
main_charac_top20.head()

In [None]:
# Plot the summed box office revenue of the top 20 actors in main-character roles
main_charac_top20_names = main_charac_top20['actor']
main_charac_revenue = main_charac_top20['sum']

plt.figure(figsize=(12, 6))
plt.plot(main_charac_top20_names,main_charac_revenue)
plt.xlabel('Actor Names')
plt.ylabel('Box Office Revenue')
# The plotted column is the per-actor sum and the x-axis lists actors, so the
# title no longer says "average" or "Characters"
plt.title('Top 20 Main-Character Actors by Total Box Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

plt.show()

Do male or female actors generate more box office revenue?

In [None]:
# Attach the recorded sex to each actor in the revenue ranking; the join gives
# one row per cast entry, so only the first row of each actor is kept
gender_actor_data = (
    actor_revenue_sort
    .merge(df_charac[['actor', 'sex']], on='actor', how='left')
    .drop_duplicates(subset = ['actor'])
)
gender_actor_data.head()

In [None]:
# Per-actor total revenue together with the actor's recorded sex (one row per actor)
gender_actor_data = (
    actor_revenue_sort
    .merge(df_charac[['actor', 'sex']], on='actor', how='left')
    .drop_duplicates(subset = ['actor'])
)

# Only actors recorded as 'M' or 'F' enter the comparison
has_known_sex = gender_actor_data['sex'].isin(['M','F'])

# Box plot of total revenue for each sex
fig, ax = plt.subplots(figsize=(10, 10))
gender_boxplot = sns.boxplot(x="sex", y="bo_revenue", data=gender_actor_data.loc[has_known_sex], ax=ax)

ax.set_ylabel("Movie box office revenue($)")
ax.set_title("Box office revenue between genders")
# The y-axis is cut at 500 million, presumably to keep the boxes readable
ax.set_ylim(0, 500000000)

plt.tight_layout()
plt.show()

# Given the variance, the 25th and 75th percentiles and the median, male actors drive higher box office revenue.

Study the languages used in the top 100 films: which languages drive the highest revenue?

In [None]:
# Keep one row per movie, then take the 100 movies with the highest revenue
film_sort = df_merged[['countries','languages','revenue','wiki_id']]
film_sort = film_sort.drop_duplicates(subset = ['wiki_id'])
film_sort = film_sort.sort_values(by='revenue', ascending=False)
top100film = film_sort.head(100)

# One row per (movie, language) pair
df_split_languages = top100film.explode('languages')

# Count how many of the top 100 movies use each language
language_counts = df_split_languages['languages'].value_counts()
df_language_counts = pd.DataFrame({'Language': language_counts.index, 'Count': language_counts.values})

# Bar chart with a logarithmic count axis
df_language_counts.plot(kind='bar', x='Language', y='Count', figsize=(12, 6),log=True)
plt.title('Language counts in the top 100 movies by revenue')
plt.xlabel('Language')
plt.ylabel('Count (logscale)')
plt.xticks(rotation=45, ha='right')
plt.show()

Study the genre of movies that earn the highest revenue.

In [None]:
# Keep one row per movie so each movie's revenue is counted once
df_clean_split_genres = df_merged[['title','genres','revenue','wiki_id']]
df_clean_split_genres = df_clean_split_genres.drop_duplicates(subset = ['wiki_id'])

# One row per (movie, genre) pair
df_split_genre = df_clean_split_genres.explode('genres')[['title', 'revenue', 'genres']]

# Sum the box office revenue of the movies in each genre
genre_totalRevenue = df_split_genre.groupby('genres')['revenue'].sum().reset_index()

# Sort by revenue in descending order and keep the 10 highest-grossing genres
genre_totalRevenue = genre_totalRevenue.sort_values(by='revenue', ascending=False)
genre_totalRevenue_top10 = genre_totalRevenue.head(10)

genre = genre_totalRevenue_top10['genres']
genreRevenue = genre_totalRevenue_top10['revenue']

# Plot the chart
plt.plot(genre, genreRevenue)
plt.xlabel('Genre')
plt.ylabel('Box office revenue')
plt.title('Top 10 Genre by Box Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the genre names for better readability

# Finish the figure like the other plotting cells do; without plt.show() the
# cell output is the return value of plt.xticks instead of just the chart
plt.tight_layout()
plt.show()

<h4> Account for inflation for all actors (no priority given to main actors) </h4>

In [None]:
# Extract the columns needed for the inflation calculation.
# `release_date_x` is the release date coming from the character metadata
# (the `_x` suffix is added by the merge that built df_merged).
# .copy() makes this an independent frame: a `year` column is added to it
# later, and assigning into a plain slice raises SettingWithCopyWarning.
inf_revenue = df_merged[['wiki_id','release_date_x','character','actor','title','revenue']].copy()
inf_revenue.head()

In [None]:
# Only the year is needed, so parse just the leading four characters with an
# explicit format. NOTE(review): the release dates presumably mix granularities
# ("YYYY", "YYYY-MM", "YYYY-MM-DD") - confirm against the data. Without an
# explicit format, pandas may infer one format from the first value and, with
# errors='coerce', silently turn every differently shaped date into NaT.
year_text = inf_revenue['release_date_x'].astype(str).str[:4]
release_year = pd.to_datetime(year_text, format='%Y', errors='coerce')

# Extract the year as a nullable integer (unparseable dates stay missing)
release_year = release_year.dt.year.astype('Int64')

# Add the year column on a new frame rather than assigning into a slice of df_merged
inf_revenue = inf_revenue.assign(year=release_year)

# Display the result
inf_revenue.head()

In [None]:
# Earliest release year present in inf_revenue; shown as the cell output
min_year = inf_revenue['year'].min()
min_year

In [None]:
# Consumer price index table; later cells read its `year` and `cpi` columns
cpi_data = pd.read_csv('data/cpi_data.csv')
cpi_data.head()

In [None]:
# Look up the CPI of each entry's release year
inf_revenue_cpi = inf_revenue.merge(cpi_data, on='year', how='left')

# Inflation adjustment: adjusted_revenue = (revenue / CPI) * 100
inf_revenue_cpi['adjusted_revenue'] = inf_revenue_cpi['revenue'].div(inf_revenue_cpi['cpi']).mul(100)

# Per actor: summed adjusted revenue and number of entries with an adjusted value
inf_actor_totalRevenue = (
    inf_revenue_cpi
    .groupby(['actor'])['adjusted_revenue']
    .agg(['sum', 'count'])
    .reset_index()
)
inf_actor_totalRevenue.columns = ['actor', 'adjusted_revenue', 'actor_count']

# Rank actors from highest to lowest adjusted revenue and keep the first 20
cpi_revenue_sum = inf_actor_totalRevenue.sort_values(by='adjusted_revenue', ascending=False)
cpi_actor_top20 = cpi_revenue_sum.head(20)

# Bring back languages and sex; keep the first joined row of each actor
cpi_actor_top20 = (
    cpi_actor_top20
    .merge(df_merged[['actor', 'languages','sex']], on='actor', how='left')
    .drop_duplicates(subset = ['actor'])
)
cpi_actor_top20.head()

In [None]:
# Plot the inflation-adjusted revenue of the 20 highest-grossing actors
actor_names_cpi = cpi_actor_top20['actor']
revenues_total_cpi = cpi_actor_top20['adjusted_revenue']

plt.figure(figsize=(12, 6))
plt.plot(actor_names_cpi, revenues_total_cpi)
plt.xlabel('Actor Names')
plt.ylabel('Total Box Office Revenue')
# cpi_actor_top20 holds 20 actors, so the title says 20 (it used to say 10)
plt.title('Top 20 Actors by Box Office Revenue (with inflation)')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

plt.tight_layout()
plt.show()

<h4> Account for inflation, giving priority to main actors </h4>

In [None]:
# Title and release year of each movie, one row per movie
movie_years = inf_revenue[['wiki_id', 'title', 'year']].drop_duplicates(subset=['wiki_id'])

# Start from the main-character cast entries so that every such appearance of
# an actor contributes its own revenue and year. The previous version joined
# the per-actor totals with all of the actor's movies and then kept a single
# row per actor, so only one arbitrary movie per actor was inflation-adjusted.
# The per-actor nominal `sum` and `count` columns are joined back in so the
# columns this frame had before are still available.
main_charac_cpi = (
    main_character[['wiki_id', 'actor', 'revenue']]
    .merge(movie_years, on='wiki_id', how='left')
    .merge(main_charac_mergedBOR, on='actor', how='left')
)

# Remove entries without a release year: they cannot be matched to a CPI value
main_charac_cpi = main_charac_cpi.dropna(subset=['year'])

# Look up the CPI of each entry's release year
mc_revenue_cpi = main_charac_cpi.merge(cpi_data, on='year', how='left')
mc_revenue_cpi.head()

In [None]:
# Inflation adjustment: adjusted_revenue = (revenue / CPI) * 100
mc_revenue_cpi['adjusted_revenue'] = mc_revenue_cpi['revenue'].div(mc_revenue_cpi['cpi']).mul(100)

# Per actor: summed adjusted revenue and number of entries with an adjusted value
mc_totalRevenue_cpi = (
    mc_revenue_cpi
    .groupby(['actor'])['adjusted_revenue']
    .agg(['sum', 'count'])
    .reset_index()
)
mc_totalRevenue_cpi.columns = ['actor', 'adjusted_revenue', 'actor_count']

# Rank actors from highest to lowest adjusted revenue and keep the first 20
mc_totalRevenue_cpi_sort = mc_totalRevenue_cpi.sort_values(by='adjusted_revenue', ascending=False)
cpi_mc_actor_top20 = mc_totalRevenue_cpi_sort.head(20)

# Bring back languages and sex; keep the first joined row of each actor
cpi_mc_actor_top20 = (
    cpi_mc_actor_top20
    .merge(df_merged[['actor', 'languages','sex']], on='actor', how='left')
    .drop_duplicates(subset = ['actor'])
)
cpi_mc_actor_top20.head()

In [None]:
# Line chart of the inflation-adjusted revenue in cpi_mc_actor_top20
actor_names_cpi_main = cpi_mc_actor_top20['actor']
revenues_total_cpi_main = cpi_mc_actor_top20['adjusted_revenue']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(actor_names_cpi_main, revenues_total_cpi_main)
ax.set_xlabel('Actor Names')
ax.set_ylabel('Total Box Office Revenue')
ax.set_title('Top 20 Actors by Box Office Revenue (with inflation)')
# Slant the actor names so they do not overlap
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

<h4> Weighting actors by the importance of their characters </h4>

In [None]:
# Preview the per-character importance table before it is joined with the cast data
characters_with_importance.head()

In [None]:
# Preview the cast table (wiki_id, character, actor, revenue)
df_charac_actor.head()

In [None]:
# Pair each cast entry with the importance of the character it plays.
# The join uses both the movie id and the character name: joining on the name
# alone would also pair same-named characters from different movies.
# (The revenue here is the nominal box office revenue, not inflation-adjusted.)
merged_char_impt = df_charac_actor.merge(characters_with_importance, on=['wiki_id', 'character'], how='inner')

# Drop repeated cast entries; the movie id is part of the key so that an actor
# playing the same character in several movies keeps one row per movie
merged_char_impt = merged_char_impt.drop_duplicates(subset=['wiki_id', 'actor', 'character'])
merged_char_impt.head(20)

In [None]:
# Weight each actor's movie revenue by the importance share of the character played.
# The join uses both the movie id and the character name: joining on the name
# alone would also pair same-named characters from different movies.
# (The revenue here is the nominal box office revenue, not inflation-adjusted.)
merged_char_impt = df_charac_actor.merge(characters_with_importance, on=['wiki_id', 'character'], how='inner')

# Drop repeated cast entries; the movie id is part of the key so that an actor
# playing the same character in several movies keeps one row per movie
merged_char_impt = merged_char_impt.drop_duplicates(subset=['wiki_id', 'actor', 'character'])

# Importance-weighted revenue: importance_share * revenue.
# The previous expression also divided by revenue, which cancelled the revenue
# out and left only importance_share.
merged_char_impt['impt_revenue'] = merged_char_impt['importance_share'] * merged_char_impt['revenue']

# Per actor: summed weighted revenue and number of contributing entries
merged_char_impt = merged_char_impt.groupby(['actor'])['impt_revenue'].agg(['sum', 'count']).reset_index()
merged_char_impt.columns = ['actor', 'impt_revenue', 'actor_count']

# Sort the data in descending order and keep the first 20 actors
merged_char_impt_sort = merged_char_impt.sort_values(by='impt_revenue', ascending=False)

merged_char_impt_sort_top20 = merged_char_impt_sort.head(20)
merged_char_impt_sort_top20.head()


In [None]:
# Line chart of the summed importance measure in merged_char_impt_sort_top20
impt_revenue_actor_main = merged_char_impt_sort_top20['actor']
impt_revenue_revenue_main = merged_char_impt_sort_top20['impt_revenue']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(impt_revenue_actor_main, impt_revenue_revenue_main)
ax.set_xlabel('Actor Names')
ax.set_ylabel('Measure of actors importance wrt to revenue and importance factor')
ax.set_title('Top 20 Actors by measure of importance and revenue')
# Slant the actor names so they do not overlap
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Average measure per actor: the summed measure divided by the number of
# entries that contributed to it
merged_char_impt_sort['impt_revenue_avg'] = merged_char_impt_sort['impt_revenue'] / merged_char_impt_sort['actor_count']

# Rank by the average, highest first, and keep the first 20 actors.
# The previous version sorted by the summed `impt_revenue` column, which just
# reproduced the ranking by sum.
avg_char_impt_sort = merged_char_impt_sort.sort_values(by='impt_revenue_avg', ascending=False)
avg_char_impt_sort = avg_char_impt_sort.head(20)



In [None]:
# Plot the per-actor average measure. The previous version plotted the summed
# `impt_revenue` column although the axis label describes the average.
avg_char_impt_sort_main_actor = avg_char_impt_sort['actor']
avg_char_impt_sort_main_revenue = avg_char_impt_sort['impt_revenue_avg']

plt.figure(figsize=(12, 6))
plt.plot(avg_char_impt_sort_main_actor, avg_char_impt_sort_main_revenue )
plt.xlabel('Actor Names')
plt.ylabel('Measure of actors importance wrt to the average of revenue and importance factor')
plt.title('Top 20 Actors by average measure of importance and revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

plt.tight_layout()
plt.show()

<h4> Examine the effects of rating

In [None]:
# Movies together with their ratings, used in the rating analysis
df_rating = pd.read_csv('data/movies_with_rating.csv')
df_rating.head()

<h4> Plotting the actor's success based on their rating

# Genre prediction using clusters

# Revenue prediction using clusters