In [None]:
# Load the statistics, data-handling and plotting libraries
from statsmodels.stats import diagnostic
from scipy import stats
import seaborn as sns
import math
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the movie metadata table from disk and preview its first rows
movie_metadata_csv = "./data/movie.metadata.csv"
df_movie = pd.read_csv(movie_metadata_csv)
df_movie.head()

In [None]:
import ast

def extract_language(language_str):
    """Return the first word of a movie's language name for two known language IDs.

    ``language_str`` is parsed with ``ast.literal_eval`` and is expected to
    hold a dict literal keyed by language ID. The key "/m/02h40lc"
    (presumably English -- TODO confirm against the dataset) is tried first,
    then "/m/04306rv" (German). The first of these keys whose value is a
    non-blank string supplies the result.

    Parameters
    ----------
    language_str : str
        Text of a dict literal, e.g. '{"/m/02h40lc": "English Language"}'.
        Non-string inputs (such as NaN for a missing cell) are accepted and
        yield None.

    Returns
    -------
    str or None
        The first whitespace-separated word of the matching language name,
        or None when the input is not a string, cannot be parsed, does not
        evaluate to a dict, or has no usable value under either key.
    """
    if not isinstance(language_str, str):
        # Missing cells are not parseable text; treat them as "no language".
        return None

    try:
        language_dict = ast.literal_eval(language_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The full set of exceptions literal_eval is documented to raise on
        # malformed input; any of them means the cell holds no usable literal.
        return None

    if not isinstance(language_dict, dict):
        # A valid literal of another type (e.g. "5") would otherwise make the
        # membership test below raise TypeError.
        return None

    # Priority order matters: the first key is preferred when both are present.
    for language_id in ("/m/02h40lc", "/m/04306rv"):
        name = language_dict.get(language_id)
        # Guard against non-string and blank names, which previously raised
        # AttributeError / IndexError out of the function.
        words = name.split() if isinstance(name, str) else []
        if words:
            return words[0]
    return None

# Run extract_language over every entry of the 'Movie language' column and
# store the outcome in a new 'Cleaned Language' column, then print that column
cleaned_language = df_movie['Movie language'].apply(extract_language)
df_movie['Cleaned Language'] = cleaned_language

print(df_movie['Cleaned Language'])


In [None]:
# Read the character metadata table from disk and preview its first rows
character_metadata_csv = "./data/character.metadata.csv"
df_char = pd.read_csv(character_metadata_csv)
df_char.head()

In [None]:
## Find which actors bring in the highest revenue
# Keep the movie identifiers, name, box office revenue, language and countries,
# and discard every movie that has no box office revenue recorded
df_movie_revenue = df_movie[
    [
        'Wiki Movie ID',
        'Freebase Movie ID',
        'Movie name',
        'Movie box office revenue',
        'Movie language',
        'Movie Countries',
    ]
].dropna(subset=['Movie box office revenue'])
df_movie_revenue.head()

In [None]:
# Keep the movie identifiers together with the character name, actor name,
# actor age at movie release and gender columns
char_revenue_columns = [
    'Wiki Movie ID',
    'Freebase Movie ID',
    'Character Name',
    'Actor Name',
    'Actor age at movie release',
    'Gender',
]
df_char_revenue = df_char[char_revenue_columns]
df_char_revenue.head()

In [None]:
# Rank actors by the summed box office revenue of the movies they are credited in

# Pair every character/actor row with its movie's revenue data; the inner join
# drops rows whose movie is absent from df_movie_revenue
df_merged = df_char_revenue.merge(df_movie_revenue, on=['Freebase Movie ID'], how='inner')

# Per actor: summed box office revenue and the number of merged rows carrying a revenue value
# NOTE(review): an actor listed on several rows for the same movie contributes
# that movie's revenue once per row -- confirm whether that is intended
actor_totalRevenue = (
    df_merged.groupby(['Actor Name'])['Movie box office revenue']
    .agg(['sum', 'count'])
    .reset_index()
)
actor_totalRevenue.columns = ['Actor Name', 'Movie box office revenue', 'Actor Count']

# Order actors from highest to lowest total revenue and keep the first ten
actor_revenue_sort = actor_totalRevenue.sort_values(by='Movie box office revenue', ascending=False)
actor_top10 = actor_revenue_sort.head(10)

# Attach one Gender and one Movie language value per top-10 actor.
# The two merges are chained: previously the second merge restarted from
# actor_top10 and silently discarded the Gender column added by the first.
# Each lookup table is reduced to the first row per actor *before* merging, so
# the left joins stay one row per actor instead of fanning out to one row per
# credit and being collapsed afterwards.
first_gender_by_actor = (
    df_char_revenue[['Actor Name', 'Gender']].drop_duplicates(subset=['Actor Name'])
)
first_language_by_actor = (
    df_merged[['Actor Name', 'Movie language']].drop_duplicates(subset=['Actor Name'])
)
top10_actor_data = (
    actor_top10
    .merge(first_gender_by_actor, on='Actor Name', how='left')
    .merge(first_language_by_actor, on='Actor Name', how='left')
)
top10_actor_data


In [None]:
# Draw the top-10 actors against their total box office revenue
actor_names = top10_actor_data['Actor Name']
revenues_total = top10_actor_data['Movie box office revenue']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(actor_names, revenues_total)
ax.set_xlabel('Actor Names')
ax.set_ylabel('Total Box Office Revenue')
ax.set_title('Top 10 Actors by Box Office Revenue')
# Tilt the actor names so neighbouring labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.show()