In [96]:
import pandas as pd
import json
import networkx as nx
from networkx.algorithms import bipartite
from tqdm import tqdm
import statsmodels.api as sm
import statsmodels.formula.api as smf
pd.options.mode.chained_assignment = None 

In [97]:
# get the clusters of the charcters
clusters = pd.read_csv('classification_result.csv')
clusters

Unnamed: 0,name,movie_id,classification
0,Shlykov,23890098,22
1,Flickerman,31186339,13
2,Everdeen,31186339,13
3,Thresh,31186339,4
4,Crane,31186339,16
...,...,...,...
182484,Eden,6040782,1
182485,Godfrey,6040782,17
182486,Jones,6040782,23
182487,Fullard,6040782,20


In [137]:
# read the charcter metadata dataset
character_metadata_header = [
    'movie_id',
    'Freebase movie ID',
    'Movie release date',
    'Character Name',
    'Actor DOB',
    'Actor gender',
    'Actor height',
    'Actor ethnicity',
    'Actor Name',
    'Actor age at movie release',
    'Freebase character map',
    'Freebase character ID',
    'Freebase actor ID'
]
character_metadata = pd.read_csv(
    'MovieSummaries/character.metadata.tsv',
    delimiter='\t',
    names=character_metadata_header
)
characters = character_metadata[['movie_id','Actor Name','Character Name']]
characters

Unnamed: 0,movie_id,Actor Name,Character Name
0,975900,Wanda De Jesus,Akooshay
1,975900,Natasha Henstridge,Lieutenant Melanie Ballard
2,975900,Ice Cube,Desolation Williams
3,975900,Jason Statham,Sgt Jericho Butler
4,975900,Clea DuVall,Bashira Kincaid
...,...,...,...
450664,913762,Dorothy Elias-Fahn,Elensh
450665,913762,Jonathan Fahn,Hibiki
450666,28308153,David Hemmings,
450667,28308153,Roberta Paterson,


In [139]:
# drop all missing charcter name
characters = characters[characters['Character Name'].notna()]
# get the list of all potential matches of characters for every movie
actors_with_clusters = pd.merge(characters,clusters,on=['movie_id'],how = 'inner')

filter the potential character matches by removing all the charcters that either don't correspond
or if a charcter name includes more than one clustered name
(Since clustered character names are only one word. So for example two charcters having the same family name contain both the same name)

In [204]:
# A filter to see if a character name in cluster(as we are taking only one word when clustering)
# is included in the character Name in that movie 
character_filter = actors_with_clusters.apply(lambda x: str(x['name']) in str(x['Character Name']), axis=1)
## remove all rows with a classified character matching more than one Character
filtered_characters = actors_with_clusters[character_filter].drop_duplicates(subset =['movie_id' ,'name'], keep = False)
## remove all rows with a character matching more than one classified Character
filtered_characters = actors_with_clusters[character_filter].drop_duplicates(subset =['movie_id' ,'Character Name'], keep = False)
filtered_characters

Unnamed: 0,movie_id,Actor Name,Character Name,name,classification
4,975900,Natasha Henstridge,Lieutenant Melanie Ballard,Ballard,24
8,975900,Ice Cube,Desolation Williams,Williams,4
55,2238856,John Hawkes,Richard Swersey,Richard,16
64,2238856,Miranda July,Christine Jesperson,Christine,16
72,2238856,Miles Thompson,Peter Swersey,Peter,6
...,...,...,...,...,...
889122,6456053,Edmund Purdom,Fane,Fane,11
889153,6456053,Art Carney,Joey Friedlander,Friedlander,9
889203,24997872,Mammootty,Raja,Raja,17
889212,22545667,Richard Dolman,Laurie,Laurie,0


In [264]:
classifications = filtered_characters.groupby('Actor Name')['classification'].apply(list).reset_index(name = 'classifications')
classifications['max_occurences_character'] = classifications['classifications'].apply( lambda x : x.count(max(x,key=x.count)))
classifications['roles played'] = classifications['classifications'].apply( lambda x : len(x))
classifications = classifications[classifications['roles played']>1].sort_values(by= 'max_occurences_character',ascending= False)
classifications

Unnamed: 0,Actor Name,classifications,max_occurences_character,roles played
10742,Mel Blanc,"[3, 6, 3, 23, 3, 20, 3, 1, 3, 8, 23, 13, 3, 17...",72,157
630,Amitabh Bachchan,"[13, 3, 6, 14, 15, 6, 5, 22, 6, 21, 8, 6, 6, 1...",13,77
9962,Mammootty,"[17, 6, 3, 5, 4, 23, 3, 21, 14, 11, 1, 21, 19,...",13,91
6682,James Stewart,"[7, 13, 20, 2, 8, 18, 22, 22, 3, 12, 3, 7, 9, ...",12,55
14079,Samuel L. Jackson,"[16, 4, 17, 15, 17, 17, 17, 7, 6, 6, 6, 6, 6, ...",12,42
...,...,...,...,...
7212,Jesse Garcia,"[5, 23]",1,2
7214,Jesse James,"[6, 20, 9, 19, 0]",1,5
7218,Jesse Metcalfe,"[20, 18, 23]",1,3
7219,Jesse Moss,"[17, 16, 9, 3]",1,4


In [265]:
## get the dataset with oscar nominees and count the wins and nominations
oscar_nominees = pd.read_csv('oscar_nominees.csv')
oscar_actors = oscar_nominees.groupby('Actor Name').agg({'ceremony': 'count',
                                                         'winner'  : 'sum'}).reset_index()
oscar_actors.sort_values(by = 'winner',ascending= False)

Unnamed: 0,Actor Name,ceremony,winner
498,Katharine Hepburn,12,4
912,Walter Brennan,4,3
638,Meryl Streep,21,3
370,Jack Nicholson,12,3
360,Ingrid Bergman,7,3
...,...,...,...
491,June Squibb,1,0
492,Justin Henry,1,0
493,Karen Black,1,0
495,Kate Hudson,1,0


In [266]:
merge_oscar_character = pd.merge(oscar_actors,classifications,on = 'Actor Name',how = 'right', indicator= True)
merge_oscar_character

Unnamed: 0,Actor Name,ceremony,winner,classifications,max_occurences_character,roles played,_merge
0,Mel Blanc,,,"[3, 6, 3, 23, 3, 20, 3, 1, 3, 8, 23, 13, 3, 17...",72,157,right_only
1,Amitabh Bachchan,,,"[13, 3, 6, 14, 15, 6, 5, 22, 6, 21, 8, 6, 6, 1...",13,77,right_only
2,Mammootty,,,"[17, 6, 3, 5, 4, 23, 3, 21, 14, 11, 1, 21, 19,...",13,91,right_only
3,James Stewart,5.0,1.0,"[7, 13, 20, 2, 8, 18, 22, 22, 3, 12, 3, 7, 9, ...",12,55,both
4,Samuel L. Jackson,1.0,0.0,"[16, 4, 17, 15, 17, 17, 17, 7, 6, 6, 6, 6, 6, ...",12,42,both
...,...,...,...,...,...,...,...
5957,Jesse Garcia,,,"[5, 23]",1,2,right_only
5958,Jesse James,,,"[6, 20, 9, 19, 0]",1,5,right_only
5959,Jesse Metcalfe,,,"[20, 18, 23]",1,3,right_only
5960,Jesse Moss,,,"[17, 16, 9, 3]",1,4,right_only


In [278]:
merge_oscar_character['is_oscar_actor'] =  (merge_oscar_character['_merge']== 'both').astype(int)
oscar_nominees_characters = merge_oscar_character[merge_oscar_character['_merge']== 'both'][['Actor Name','max_occurences_character', 'roles played','is_oscar_actor']]
non_oscar_characters = merge_oscar_character[merge_oscar_character['_merge']== 'right_only'][['Actor Name','max_occurences_character', 'roles played','is_oscar_actor']]
oscar_nominees_characters.columns

Index(['Actor Name', 'max_occurences_character', 'roles played',
       'is_oscar_actor'],
      dtype='object')

In [279]:
comparator = pd.merge(oscar_nominees_characters,non_oscar_characters,on = ['roles played'],suffixes=("_1", "_2"))
comparator

Unnamed: 0,Actor Name_1,max_occurences_character_1,roles played,is_oscar_actor_1,Actor Name_2,max_occurences_character_2,is_oscar_actor_2
0,Samuel L. Jackson,12,42,1,Vincent Price,4,0
1,Robert Duvall,10,42,1,Vincent Price,4,0
2,Ben Kingsley,10,29,1,Ray Liotta,5,0
3,Ben Kingsley,10,29,1,Val Kilmer,5,0
4,Ben Kingsley,10,29,1,Dennis Quaid,5,0
...,...,...,...,...,...,...,...
347857,Jennifer Lawrence,1,2,1,Jermaine Williams,1,0
347858,Jennifer Lawrence,1,2,1,Jerry Jewell,1,0
347859,Jennifer Lawrence,1,2,1,Jerry Reed,1,0
347860,Jennifer Lawrence,1,2,1,Jessalyn Wanlim,1,0


In [280]:
## we use perfect matching here in order to have a maximum 1-on-1 matching between movies with oscar winners 
## and movies without any oscar nomination

# Initialise graph
graph = nx.Graph()
graph.add_nodes_from(comparator['Actor Name_1'], bipartite=0)
graph.add_nodes_from(comparator['Actor Name_2'], bipartite=1)
# add edges between movies with at least one oscar nominee and movies with no oscar nomination
# with weight equal the number of genres in common between the two movies
graph.add_weighted_edges_from(
    [(row['Actor Name_1'], row['Actor Name_2'], 1) for index,row in comparator.iterrows()], weight='weight')
# maximize the weight in order to have the best matching between movies 
# matching them with most genres in common while trying to have maximum cardinality
perfect_matching = nx.max_weight_matching(graph)

In [281]:
matched_actors = pd.DataFrame(perfect_matching)
matched_actors = matched_actors.rename(columns = {0:"Actor Name_2", 1: "Actor Name_1"})
matched_actors = pd.merge(matched_actors,comparator, on = ['Actor Name_1','Actor Name_2'],how ='inner')
print('The number of matched pairs is: {}'.format(len(matched_actors)))


The number of matched pairs is: 646


In [282]:
matched_actors.columns

Index(['Actor Name_2', 'Actor Name_1', 'max_occurences_character_1',
       'roles played', 'is_oscar_actor_1', 'max_occurences_character_2',
       'is_oscar_actor_2'],
      dtype='object')

In [283]:
(matched_actors['max_occurences_character_1']>matched_actors['max_occurences_character_2']).sum()/matched_actors.shape[0]*100

42.26006191950464

In [287]:
## split id1 and id2
matched_actors_1 = matched_actors[matched_actors.filter(regex='_1|roles played').columns]
matched_actors_2 = matched_actors[matched_actors.filter(regex='_2|roles played').columns]
# removing prefix for column names so we can have the same column names for matched_actors_1 and matched_actors_2
matched_actors_1.columns = matched_actors_1.columns.str.rstrip('_1')
matched_actors_2.columns = matched_actors_2.columns.str.rstrip('_2')
# concatenating the two dataframes into one
final_matched_actors = pd.concat([matched_actors_1,matched_actors_2])
final_matched_actors['bla'] = (final_matched_actors['max_occurences_character']-1)/(final_matched_actors['roles played']-1)

In [288]:
# linear regression formula
formula = "bla ~ is_oscar_actor"
df = final_matched_actors[['bla','is_oscar_actor']]
#train the linear regression with our final matched movies dataframe
mod = smf.ols(formula=formula, data= df).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                    bla   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.118
Method:                 Least Squares   F-statistic:                     173.3
Date:                Mon, 19 Dec 2022   Prob (F-statistic):           3.09e-37
Time:                        12:21:51   Log-Likelihood:                 919.26
No. Observations:                1292   AIC:                            -1835.
Df Residuals:                    1290   BIC:                            -1824.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0565      0.005     12.