In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
import json

Our metric of an actor's success can be based on two factors: 
 - awards (we consider 3 prestigiuos awards: Oscar, Golden Globe and Critics Choice) show an actor's recognition by the professional community
 - popularity shows an actor's recognition by the broad audience

We can aggregate these factors as follows:

$ Success(P, N, W, t) = P\_scale * P + N\_scale * N + W\_scale * W $

In the formula P corresponds to popularity (from TMDB or new database with the most popular actors), W is the number of times the actor became an award winner or, in case of Oscar, nominant. Scaling factors are constants for an award-related part.

In [2]:
# database with Oscar awards retrieved from https://en.wikipedia.org/wiki/List_of_actors_with_Academy_Award_nominations
oscars_df = pd.read_csv('../Data/preprocessed_data/oscars_db.csv')
# database with Golden Globe awards
gg_awards_df = pd.read_csv('../Data/preprocessed_data/gg_awards.csv')
# database with Critic Choise awards
cc_awards_df = pd.read_csv('../Data/preprocessed_data/cc_awards.csv')

In [3]:
gg_awards_df = gg_awards_df.rename(columns={"Actor": "name"})
cc_awards_df = cc_awards_df.rename(columns={"Actor": "name"})

In [4]:
# give more weight to the Oscar wins compared to nominations
oscars_df["wins"] = oscars_df["wins"].apply(lambda x: 2*x)
# some competitive categories of an award are considered more important, so we give them more weight
for col_name in ["GG_Best_Actor_Drama", "GG_Best_Actress_Drama",
                 "GG_Best_Actor_Comedy_or_Musical", "GG_Best_Actress_Comedy_or_Musical"]:
    gg_awards_df[col_name] = gg_awards_df[col_name].apply(lambda x: 2*x)
for col_name in ["Best_Actor", "Best_Actress"]:
    cc_awards_df[col_name] = cc_awards_df[col_name].apply(lambda x: 2*x)

In [5]:
oscars_df["oscar_total"] = oscars_df[["nominations", "wins"]].sum(axis=1)
oscars_df = oscars_df[["name", "oscar_total"]]
gg_awards_df["gg_total"] = gg_awards_df[["GG_Best_Actor_Drama", "GG_Best_Actress_Drama", 
                                         "GG_Best_Supporting_Actor", "GG_Best_Supporting_Actress", 
                                         "GG_Best_Actor_Comedy_or_Musical",
                                         "GG_Best_Actress_Comedy_or_Musical"]].sum(axis=1)
gg_awards_df = gg_awards_df[["name", "gg_total"]]
cc_awards_df["cc_total"] = cc_awards_df[["Best_Actor", "Best_Actress", "Best_Supporting_Actor",
                                         "Best_Supporting_Actress"]].sum(axis=1)
cc_awards_df = cc_awards_df[["name", "cc_total"]]

In [6]:
gg_awards_sum_df = gg_awards_df.groupby(by="name").sum()
cc_awards_sum_df = cc_awards_df.groupby(by="name").sum()

In [7]:
filtered_actors = pd.read_csv("../Data/preprocessed_data/tmdb_acting_in_2003_and_later.csv")

In [8]:
awards_df = pd.merge(left=oscars_df, right=gg_awards_sum_df, how='outer', on="name")
awards_df = pd.merge(left=awards_df, right=cc_awards_sum_df, how='outer', on="name")
awards_df = awards_df.fillna(0.0)
awards_df['awards_total'] = awards_df[['oscar_total', 'gg_total', 'cc_total']].sum(axis=1)
# awards_df = awards_df[["name", "awards_total"]]
display(awards_df)

Unnamed: 0,name,oscar_total,gg_total,cc_total,awards_total
0,Barkhad Abdi,1.0,1.0,1.0,3.0
1,F. Murray Abraham,3.0,2.0,0.0,5.0
2,Amy Adams,6.0,12.0,7.0,25.0
3,Nick Adams,1.0,0.0,0.0,1.0
4,Isabelle Adjani,2.0,0.0,0.0,2.0
...,...,...,...,...,...
1352,Sidney Flanigan,0.0,0.0,2.0,2.0
1353,Sterling K. Brown,0.0,0.0,1.0,1.0
1354,Tiffany Haddish,0.0,0.0,1.0,1.0
1355,Zendaya,0.0,0.0,2.0,2.0


In [9]:
filtered_actors_with_awards = pd.merge(left=filtered_actors["name"], right=awards_df, 
                                       how="outer", on="name")

In [10]:
filtered_actors_with_awards = filtered_actors_with_awards.fillna(0.0)
display(filtered_actors_with_awards)

Unnamed: 0,name,oscar_total,gg_total,cc_total,awards_total
0,Gary Oldman,5.0,4.0,4.0,13.0
1,Florence Pugh,1.0,0.0,1.0,2.0
2,Jason Statham,0.0,0.0,0.0,0.0
3,Jackie Chan,0.0,0.0,0.0,0.0
4,Scarlett Johansson,2.0,9.0,5.0,16.0
...,...,...,...,...,...
8704,Meryl Streep (TIE),0.0,0.0,4.0,4.0
8705,Michelle Williams (TIE),0.0,0.0,1.0,1.0
8706,Sandra Bullock (TIE),0.0,0.0,2.0,2.0
8707,Sidney Flanigan,0.0,0.0,2.0,2.0


In [11]:
filtered_actors_with_awards.to_csv("filtered_actors_with_awards.csv", index=False)

In [12]:
plt.figure(figsize=(8, 5))

plt.hist(awards_df['oscar_total'], bins=30, color='b', alpha=0.4, label='Oscar')
plt.hist(awards_df['cc_total'], bins=30, color='b', alpha=0.9, label='Critic Choice')
plt.hist(awards_df['gg_total'], bins=30, color='b', alpha=0.2, label='Golden Globe')

plt.xlabel('Number of awards')
plt.yscale('log')
plt.ylabel('Number of actors')
plt.title('Distribution of number of awards received by an actor during their career')
plt.legend()

: 

Popularity of the actors and films they are known for were extracted in Preprocessing/actors_preprocessing.ipynb. In our analysis we are targeting Hollywood actors, since the Oscar is Hollywood-biased. For this, we apply filtering based on the prevalent language of the most popular films ("original_language" field in the database) of each actor and leave only English ones. 

*This filtering does not reject British film industry but their movies and actors are also eligible for Oscar.

In [14]:
actors_df = pd.read_csv('../Data/preprocessed_data/actors_db.csv')
actors_df = actors_df[actors_df["original_language"] == "en"] # English movies filtering
# popularity_df = actors_df[["name", "popularity", "known_for"]] # for each actor known_for contains top-3 most recognised movies
popularity_df = actors_df[["name", "popularity"]]
display(popularity_df)
print(f"There are {popularity_df.shape[0]} actors in the filtered dataset")

Unnamed: 0,name,popularity
1,Gary Oldman,220.449
3,Florence Pugh,176.589
4,Jason Statham,162.466
6,Jackie Chan,156.714
7,Scarlett Johansson,156.460
...,...,...
9574,Richard Derr,14.134
9575,Michael Maloney,14.134
9576,Gaia Scodellaro,14.133
9578,Peter Cullen,14.133


There are 8441 actors in the filtered dataset


In [25]:
success_df = pd.merge(left=popularity_df, right=filtered_actors_with_awards[["name", "awards_total"]], 
                      how="left", on="name")
success_df = success_df.fillna(0)

In [26]:
# min-max scaling for the popularity and award
success_df["popularity"] = (success_df["popularity"] - success_df["popularity"].min()) / (success_df["popularity"].max() - success_df["popularity"].min())
success_df["awards_total"] = (success_df["awards_total"] - success_df["awards_total"].min()) / (success_df["awards_total"].max() - success_df["awards_total"].min())

# taking square root of awards emphasizing the difference between 0 awards and at least 1 award
success_df["awards_total"] = success_df["awards_total"].apply(lambda x: x ** 0.5)

display(success_df)

Unnamed: 0,name,popularity,awards_total
0,Gary Oldman,1.000000,0.380058
1,Florence Pugh,0.787413,0.149071
2,Jason Statham,0.718960,0.000000
3,Jackie Chan,0.691081,0.000000
4,Scarlett Johansson,0.689850,0.421637
...,...,...,...
8436,Richard Derr,0.000005,0.000000
8437,Michael Maloney,0.000005,0.000000
8438,Gaia Scodellaro,0.000000,0.000000
8439,Peter Cullen,0.000000,0.000000
